"""
|
||
سورس اجرای پردازش های مختلف روی اجزای قانونی
|
||
شامل: کلاسیفیکیشن، تشخیص موجودیت های نامدار، استخراج بردار کلمات، استخراج کلیدواژه ها و سادهسازی(بازنمایی) متن
|
||
"""
|
||
|
||
from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder
from p4_keyword_extractor import do_keyword_extract
from p5_simplifier import do_representation

from elastic_helper import ElasticHelper
import json

def get_sections():

    # region read all sections from the zipped JSON export via the Elastic helper
    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    # eh_obj = ElasticHelper()
    # sections = eh_obj.iterateJsonFile(sections_path, True)
    # sections = convert_to_dict(sections)
    # endregion

    # region read a limited number of sections from a local JSON file
    with open('./data/recent_sections.json', 'r', encoding='utf-8') as file:
        sections = json.load(file)
    # endregion

    return sections

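# A minimal sketch of the shape ./data/recent_sections.json is assumed to have,
# inferred from convert_to_dict() and the debug blocks in main() below: a dict
# keyed by section id, each value being that section's source record. The inner
# fields of each source record are not visible in this file and are left opaque.
#
#     {
#         "qs2180272": { ...source fields... },
#         ...
#     }
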
def convert_to_dict(sections):
    """Re-key an iterable of {'id': ..., 'source': ...} records into an id -> source dict."""
    sections_dict = {}
    for item in sections:
        section_id = item['id']
        source = item['source']
        sections_dict[section_id] = source

    return sections_dict

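# Usage sketch for convert_to_dict (the record values are hypothetical, for
# illustration only):
#
#     convert_to_dict([{'id': 'qs1', 'source': {'a': 1}},
#                      {'id': 'qs2', 'source': {'b': 2}}])
#     -> {'qs1': {'a': 1}, 'qs2': {'b': 2}}
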
def main():

    # get sections to run the nlp processes on
    sections = get_sections()

    # debug helper: keep only the first few sections
    # temp_sections = {}
    # for i, item in enumerate(sections):
    #     if i > 3:
    #         break
    #     temp_sections[item] = sections[item]
    # sections = temp_sections

    # debug helper: keep only one specific section
    # dictsections = {}
    # for item in sections:
    #     if not item['id'] == 'qs2180272':
    #         continue
    #     dictsections[item['id']] = item['source']
    #     break
    # sections = dictsections

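    # Each do_* step below takes the sections dict and returns it with that
    # step's metadata attached; this contract is inferred from the reassignment
    # pattern here, since the p1..p5 modules are not part of this file.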
    # 1. classify
    sections = do_classify(sections)

    # 2. ner recognize
    sections = do_ner_recognize(sections)

    # 3. word embedder
    sections = do_word_embedder(sections)

    # 4. keyword extract
    # keyword_extract_result, sections = do_keyword_extract(sections)
    # if keyword_extract_result:
    #     print('keyword extraction finished successfully!')

    # 5. simplify
    # representation_result, sections = do_representation(sections)
    # if representation_result:
    #     print('representation finished successfully!')

    with open('./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        json.dump(sections, output_file, ensure_ascii=False, indent=2)

    print('all nlp processes finished successfully!')

if __name__ == '__main__':
    main()