data_processes/do_nlp_processes.py
2025-08-16 14:24:11 +03:30

89 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
سورس اجرای پردازش های مختلف روی اجزای قانونی
شامل: کلاسیفیکیشن، تشخیص موجودیت های نامدار، استخراج بردار کلمات، استخراج کلیدواژه ها و ساده‌سازی(بازنمایی) متن
"""
from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder
from p4_keyword_extractor import do_keyword_extract
from p5_simplifier import do_representation
from elastic_helper import ElasticHelper
import json
def get_sections(path='./data/recent_sections.json'):
    """Load the legal sections that the NLP processes will run on.

    Args:
        path: JSON file containing the sections. Defaults to the
            recent-sections dump used by this pipeline.

    Returns:
        The parsed JSON content of *path* (the sections as stored in
        the file).

    Raises:
        FileNotFoundError: if *path* does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # region Reading all sections from a zipped JSON via ElasticHelper (disabled)
    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    # eh_obj = ElasticHelper()
    # sections = eh_obj.iterateJsonFile(sections_path, True)
    # sections = convert_to_dict(sections)
    # endregion
    # region Reading a limited number of sections from a plain JSON file
    with open(path, 'r', encoding='utf-8') as file:
        sections = json.load(file)
    # endregion
    return sections
def convert_to_dict(sections):
    """Index section records by their id.

    Args:
        sections: iterable of dicts, each carrying 'id' and 'source' keys.

    Returns:
        dict mapping section id -> that section's 'source' value.
        If two records share an id, the later one wins.
    """
    # Dict comprehension instead of a manual loop; the original loop also
    # shadowed the builtin `id`, which this avoids.
    return {item['id']: item['source'] for item in sections}
def main():
    """Run the NLP pipeline over the legal sections and persist the result.

    Pipeline: classification -> named-entity recognition -> word embedding.
    Keyword extraction and simplification (representation) are currently
    disabled. The enriched sections are written to
    ./data/sections_full_metadata.json.
    """
    # Load the sections that the NLP processes will run on.
    sections = get_sections()
    # 1. classify
    sections = do_classify(sections)
    # 2. recognize named entities
    sections = do_ner_recognize(sections)
    # 3. word embedding
    sections = do_word_embedder(sections)
    # 4. keyword extraction (disabled for now)
    # keyword_extract_result, sections = do_keyword_extract(sections)
    # if keyword_extract_result:
    #     print('keyword extraction finished successfully!')
    # 5. simplify / representation (disabled for now)
    # representation_result, sections = do_representation(sections)
    # if representation_result:
    #     print('representation finished successfully!')
    # Persist the fully annotated sections; json.dump streams straight to
    # the file instead of building the whole string in memory first.
    with open('./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        json.dump(sections, output_file, ensure_ascii=False, indent=2)
    print('all nlp processes finished successfully!')
main()