From b7f15bd84608c056cb8a4f64e70a1b5e2f400c29 Mon Sep 17 00:00:00 2001 From: ajokar Date: Sat, 16 Aug 2025 14:24:11 +0330 Subject: [PATCH] add some readme files --- do_nlp_processes.py | 46 +- normalizer copy.py | 1370 ------------------------------- p1_classifier.py | 19 +- p2_ner_recognizer.py | 4 +- p3_words_embedder.py | 11 +- p4_keyword_extractor.py | 15 +- p5_simplifier.py | 20 +- readme/readme-classifier.md | 71 ++ readme/readme-words-embedder.md | 70 ++ 9 files changed, 218 insertions(+), 1408 deletions(-) delete mode 100644 normalizer copy.py create mode 100644 readme/readme-classifier.md create mode 100644 readme/readme-words-embedder.md diff --git a/do_nlp_processes.py b/do_nlp_processes.py index c030814..4a41a78 100644 --- a/do_nlp_processes.py +++ b/do_nlp_processes.py @@ -2,19 +2,30 @@ سورس اجرای پردازش های مختلف روی اجزای قانونی شامل: کلاسیفیکیشن، تشخیص موجودیت های نامدار، استخراج بردار کلمات، استخراج کلیدواژه ها و ساده‌سازی(بازنمایی) متن """ + from p1_classifier import do_classify from p2_ner_recognizer import do_ner_recognize from p3_words_embedder import do_word_embedder -# from p4_keyword_extractor import do_keyword_extract -# from p5_simplifier import do_simplify +from p4_keyword_extractor import do_keyword_extract +from p5_simplifier import do_representation from elastic_helper import ElasticHelper +import json def get_sections(): - sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip" - eh_obj = ElasticHelper() - sections = eh_obj.iterateJsonFile(sections_path, True) - sections = convert_to_dict(sections) + + # region خواندن کل سکشن ها از فایل جیسون + # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip" + # eh_obj = ElasticHelper() + # sections = eh_obj.iterateJsonFile(sections_path, True) + # sections = convert_to_dict(sections) + # endregion + + # region خواندن تعداد محدودی از سکشن ها از طریق فایل جیسون + with open('./data/recent_sections.json', 'r', encoding='utf-8') as file: + sections = json.load(file) + # endregion + return sections def convert_to_dict(sections): @@ -31,6 +42,15 @@ def main(): # get sections to do nlp processes sections = get_sections() + + # temp_sections = {} + # for i, item in enumerate(sections): + # if i>3: + # break + # temp_sections[item] = sections[item] + + # sections = temp_sections + # dictsections = {} # for item in sections: # if not item['id'] == 'qs2180272': @@ -46,14 +66,22 @@ def main(): sections = do_ner_recognize(sections) # 3. word embedder - #sections = do_word_embedder(sections) + sections = do_word_embedder(sections) # 4. keyword extract - # result_kw = do_keyword_extract(sections) + # keyword_extract_result, sections = do_keyword_extract(sections) + # if keyword_extract_result: + # print(f'keyword extraction finished successfully!') # 5. 
simpify - # result_simp = do_simplify(sections) + # representation_result, sections = do_representation(sections) + # if representation_result: + # print(f'representation finished successfully!') + with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file: + data = json.dumps(sections, ensure_ascii=False, indent=2) + output_file.write(data) + print('all nlp processes finished successfully!') diff --git a/normalizer copy.py b/normalizer copy.py deleted file mode 100644 index 6c92ca2..0000000 --- a/normalizer copy.py +++ /dev/null @@ -1,1370 +0,0 @@ -from re import sub -import copy -import os -from tokenizer import Tokenizer -from data_helper import DataHelper -import general_functions as gf -import traceback,sys - -class Normalizer(): - - def __init__(self, - half_space_char='\u200c', - date_normalizing_needed=False, - pinglish_conversion_needed=False, - train_file_path="resource/tokenizer/Bijan_khan_chunk.txt", - token_merger_path="resource/tokenizer/TokenMerger.pckl"): - self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/" - - self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt' - self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt' - self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt' - self.dic1 = self.load_dictionary(self.dic1_path) - self.dic2 = self.load_dictionary(self.dic2_path) - self.dic3 = self.load_dictionary(self.dic3_path) - - self.date_normalizing_needed = date_normalizing_needed - self.pinglish_conversion_needed = pinglish_conversion_needed - self.data_helper = DataHelper() - - - if self.date_normalizing_needed or self.pinglish_conversion_needed: - self.tokenizer = Tokenizer() - self.date_normalizer = DateNormalizer() - self.pinglish_conversion = PinglishNormalizer() - - - - def load_dictionary(self, file_path): - dict = {} - with open(file_path, 'r', encoding='utf-8') as f: - g = f.readlines() - for Wrds in g: - wrd = Wrds.split(' ') - dict[wrd[0].strip()] = sub('\n', '', wrd[1].strip()) - return dict - - def sub_alphabets(self, doc_string): - # try: - # doc_string = doc_string.decode('utf-8') - # except UnicodeEncodeError: - # pass - # a0 = "ء" - # b0 = "ئ" - # c0 = sub(a0, b0, doc_string) - a11 = r"ﺁ|آ" - b11 = r"آ" - c11 = sub(a11, b11, doc_string) - a1 = r"ٲ|ٱ|إ|ﺍ|أ" - b1 = r"ا" - c1 = sub(a1, b1, c11) - a2 = r"ﺐ|ﺏ|ﺑ" - b2 = r"ب" - c2 = sub(a2, b2, c1) - a3 = r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ" - b3 = r"پ" - c3 = sub(a3, b3, c2) - a4 = r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ" - b4 = r"ت" - c4 = sub(a4, b4, c3) - a5 = r"ﺙ|ﺛ" - b5 = r"ث" - c5 = sub(a5, b5, c4) - a6 = r"ﺝ|ڃ|ﺠ|ﺟ" - b6 = r"ج" - c6 = sub(a6, b6, c5) - a7 = r"ڃ|ﭽ|ﭼ" - b7 = r"چ" - c7 = sub(a7, b7, c6) - a8 = r"ﺢ|ﺤ|څ|ځ|ﺣ" - b8 = r"ح" - c8 = sub(a8, b8, c7) - a9 = r"ﺥ|ﺦ|ﺨ|ﺧ" - b9 = r"خ" - c9 = sub(a9, b9, c8) - a10 = r"ڏ|ډ|ﺪ|ﺩ" - b10 = r"د" - c10 = sub(a10, b10, c9) - a11 = r"ﺫ|ﺬ|ﻧ" - b11 = r"ذ" - c11 = sub(a11, b11, c10) - a12 = r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ" - b12 = r"ر" - c12 = sub(a12, b12, c11) - a13 = r"ﺰ|ﺯ" - b13 = r"ز" - c13 = sub(a13, b13, c12) - a14 = r"ﮊ" - b14 = r"ژ" - c14 = sub(a14, b14, c13) - a15 = r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ" - b15 = r"س" - c15 = sub(a15, b15, c14) - a16 = r"ﺵ|ﺶ|ﺸ|ﺷ" - b16 = r"ش" - c16 = sub(a16, b16, c15) - a17 = r"ﺺ|ﺼ|ﺻ" - b17 = r"ص" - c17 = sub(a17, b17, c16) - a18 = r"ﺽ|ﺾ|ﺿ|ﻀ" - b18 = r"ض" - c18 = sub(a18, b18, c17) - a19 = r"ﻁ|ﻂ|ﻃ|ﻄ" - b19 = r"ط" - c19 = sub(a19, b19, c18) - a20 = r"ﻆ|ﻇ|ﻈ" - b20 = r"ظ" - c20 = sub(a20, b20, c19) - a21 = r"ڠ|ﻉ|ﻊ|ﻋ" - b21 = r"ع" - c21 = sub(a21, b21, c20) - a22 = r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ" - b22 = r"غ" - c22 = sub(a22, b22, c21) - a23 = 
r"ﻒ|ﻑ|ﻔ|ﻓ" - b23 = r"ف" - c23 = sub(a23, b23, c22) - a24 = r"ﻕ|ڤ|ﻖ|ﻗ" - b24 = r"ق" - c24 = sub(a24, b24, c23) - a25 = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك" - b25 = r"ک" - c25 = sub(a25, b25, c24) - a26 = r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ" - b26 = r"گ" - c26 = sub(a26, b26, c25) - a27 = r"ﻝ|ﻞ|ﻠ|ڵ" - b27 = r"ل" - c27 = sub(a27, b27, c26) - a28 = r"ﻡ|ﻤ|ﻢ|ﻣ" - b28 = r"م" - c28 = sub(a28, b28, c27) - a29 = r"ڼ|ﻦ|ﻥ|ﻨ" - b29 = r"ن" - c29 = sub(a29, b29, c28) - a30 = r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ" - b30 = r"و" - c30 = sub(a30, b30, c29) - a31 = r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ" - b31 = r"ه" - c31 = sub(a31, b31, c30) - a32 = r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ي" - b32 = r"ی" - c32 = sub(a32, b32, c31) - a33 = r'¬' - b33 = r'‌' - c33 = sub(a33, b33, c32) - pa0 = r'•|·|●|·|・|∙|。|ⴰ' - pb0 = r'.' - pc0 = sub(pa0, pb0, c33) - pa1 = r',|٬|٫|‚|,' - pb1 = r'،' - pc1 = sub(pa1, pb1, pc0) - pa2 = r'؟|ʕ' - pb2 = r'' - pc2 = sub(pa2, pb2, pc1) - na0 = r'۰|٠' - nb0 = r'0' - nc0 = sub(na0, nb0, pc2) - na1 = r'۱|١' - nb1 = r'1' - nc1 = sub(na1, nb1, nc0) - na2 = r'۲|٢' - nb2 = r'2' - nc2 = sub(na2, nb2, nc1) - na3 = r'۳|٣' - nb3 = r'3' - nc3 = sub(na3, nb3, nc2) - na4 = r'۴|٤' - nb4 = r'4' - nc4 = sub(na4, nb4, nc3) - na5 = r'۵' - nb5 = r'5' - nc5 = sub(na5, nb5, nc4) - na6 = r'۶|٦' - nb6 = r'6' - nc6 = sub(na6, nb6, nc5) - na7 = r'۷|٧' - nb7 = r'7' - nc7 = sub(na7, nb7, nc6) - na8 = r'۸|٨' - nb8 = r'8' - nc8 = sub(na8, nb8, nc7) - na9 = r'۹|٩' - nb9 = r'9' - nc9 = sub(na9, nb9, nc8) - np2 = r'²' - nm2 = r'2' - ng2 = sub(np2, nm2, nc9) - ea1 = r'|ِ|ُ|َ|ٍ|ٌ|ً|' - eb1 = r'' - ec1 = sub(ea1, eb1, ng2) - ea1 = r'ـ|_' - eb1 = r'' - ec2 = sub(ea1, eb1, ec1) - Sa1 = r'( )+' - Sb1 = r' ' - Sc1 = sub(Sa1, Sb1, ec2) - Sa2 = r'(\n)+' - Sb2 = r'\n' - Sc2 = sub(Sa2, Sb2, Sc1) - Sd1 = '\u00A0'# No-Break Space (NBSP) character - Sd2 = Sc2.replace(Sd1, '') - - return Sd2.strip() - - def space_correction(self, doc_string): - a00 = r'^(بی|می|نمی)( )' - b00 = r'\1‌' - c00 = sub(a00, b00, doc_string) - a0 = r'( )(می|نمی|بی)( )' - b0 = r'\1\2‌' - c0 = sub(a0, b0, c00) - a1 = r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین' \ - r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )' - b1 = r'‌\2\3' - c1 = sub(a1, b1, c0) - a2 = r'( )(شده|نشده)( )' - b2 = r'‌\2‌' - c2 = sub(a2, b2, c1) - a3 = r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|' \ - r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری' \ - r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )' - b3 = r'‌\2\3' - c3 = sub(a3, b3, c2) - return c3 - - def space_correction_plus1(self, doc_string): - out_sentences = '' - for wrd in doc_string.split(' '): - try: - out_sentences = out_sentences + ' ' + self.dic1[wrd] - except KeyError: - out_sentences = out_sentences + ' ' + wrd - return out_sentences - - def space_correction_plus2(self, doc_string): - out_sentences = '' - wrds = doc_string.split(' ') - L = wrds.__len__() - if L < 2: - return doc_string - cnt = 1 - for i in range(0, L - 1): - w = wrds[i] + wrds[i + 1] - try: - out_sentences = out_sentences + ' ' + self.dic2[w] - cnt = 0 - except KeyError: - if cnt == 1: - out_sentences = out_sentences + ' ' + wrds[i] - cnt = 1 - if cnt == 1: - out_sentences = out_sentences + ' ' + wrds[i + 1] - return out_sentences - - def space_correction_plus3(self, doc_string): - # Dict = {'گفتوگو': 'گفت‌وگو'} - out_sentences = '' - wrds = doc_string.split(' ') - L = wrds.__len__() - if L < 3: - return doc_string - cnt = 1 - cnt2 = 0 - for i in 
range(0, L - 2): - w = wrds[i] + wrds[i + 1] + wrds[i + 2] - try: - out_sentences = out_sentences + ' ' + self.dic3[w] - cnt = 0 - cnt2 = 2 - except KeyError: - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i] - else: - cnt2 -= 1 - cnt = 1 - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] - elif cnt == 1 and cnt2 == 1: - out_sentences = out_sentences + ' ' + wrds[i + 2] - return out_sentences - - def normalize(self, doc_string, new_line_elimination=False, return_dates = False): - normalized_string = gf.normalYehKe(doc_string) - normalized_string = self.sub_alphabets(doc_string) - #normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination).strip() - - #normalized_string = self.space_correction(self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip() - - # !!!آئین نامه را به آئین‌نامه تبدیل می کند و دو توکن را به یکی تبدیل می کند - #normalized_string = (self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip() - - #if self.pinglish_conversion_needed: - #normalized_string = self.pinglish_conversion.pingilish2persian(self.tokenizer.tokenize_words(normalized_string)) - - if self.date_normalizing_needed: - token_list = self.tokenizer.tokenize_words(normalized_string) - # نرمالایز کردن اعداد حروفی و ... - normalized_string, additional_token_index_array = self.date_normalizer.normalize_numbers2(token_list) - # نرمالایز و تشخیص تاریخ ها - normalized_string, dates, recognized_dates = self.date_normalizer.general_normalize_date(normalized_string, additional_token_index_array) - #normalized_string_list = normalized_string.strip().split() - #normalized_string, dates, recognized_dates = self.date_normalizer.normalize_dates(normalized_string_list, additional_token_index_array, token_list_len= len(normalized_string_list), previous_slice_index_array=[0,]) - # کنترل اعدادی که پشت سر هم آمده اند و با واو از هم جدا شده اند - normalized_string, recognized_numbers = self.date_normalizer.handle_continuous_numbers(normalized_string) - # در مواردی مانند وسیصد که واو به عدد سیصد چسبیده، ابتدا این دو را از هم جدا می کنیم - # تا عدد اصلی را به دست بیاوریم. 
در این صورت یک توکن اضافه تولید می شود که با این متد، توکن های اضافی را حذف می کنیم - normalized_string =self.date_normalizer.remove_additional_tokens(normalized_string.split(), additional_token_index_array) - if return_dates: - # مرتب کردن آرایه تاریخ های پیدا شده بر اساس توکن شروع عبارت حاوی تاریخ - recognized_dates.sort(key=lambda x: int(x['start_date_token_index']), reverse=False) - - return normalized_string, dates, recognized_dates, recognized_numbers - else: - return normalized_string - - -class DateNormalizer(): - def __init__(self): - - self.month_dict = { "فروردین": 1,"فروردینماه": 1,'فروردین\u200cماه':1, - "اردیبهشت": 2,"اردیبهشتماه": 2,'اردیبهشت\u200cماه':2, - "خرداد": 3,"خردادماه": 3, - "تیر": 4,"تیرماه": 4, - "مرداد": 5,"مردادماه": 5, - "شهریور": 6,"شهریورماه": 6, - "مهر": 7,"مهرماه": 7, - "آبان": 8,"آبانماه": 8,'آبان\u200cماه':8, - "آذر": 9,"آذرماه": 9, - "دی": 10,"دیماه": 10,'دی\u200cماه':10, - "بهمن": 11,"بهمنماه": 11,'بهمن\u200cماه':11, - "اسفند": 12,"اسفندماه": 12} - self.num_dict = {"صد": 100,"وصد": 100, "یکصد": 100, "ویکصد": 100, "هزار": 1000,"وهزار": 1000, - "یکهزار": 1000,"ویکهزار": 1000, - "میلیون": 1000000,"ملیون": 1000000, "ومیلیون": 1000000,"وملیون": 1000000, - "یکمیلیون": 1000000,"یکملیون": 1000000, - "ویکمیلیون": 1000000,"ویکملیون": 1000000, "دویست": 200,"ودویست": 200, - "ده": 10,"وده": 10, "نه": 9,"ونه": 9, "هشت": 8,"وهشت": 8, "هفت": 7,"وهفت": 7, - "شش": 6,"وشش": 6, "پنج": 5,"وپنج": 5,"چهار": 4,"وچهار": 4, "سه": 3,"وسه": 3, - "دو": 2,"ودو": 2, "یک": 1,"ویک": 1, "یازده": 11,"ویازده": 11, "سیزده": 13, "وسیزده": 13, - "چهارده": 14, "دوازده": 12, "پانزده": 15, "شانزده": 16, "هفده": 17,"هیفده": 17, - "وچهارده": 14, "ودوازده": 12, "وپانزده": 15, "وشانزده": 16, "وهفده": 17,"وهیفده": 17, - "هجده": 18,"هیجده": 18, "نوزده": 19, "بیست": 20, "سی": 30, "چهل": 40, "پنجاه": 50, - "وهجده": 18,"وهیجده": 18, "ونوزده": 19, "وبیست": 20, "وسی": 30, "وچهل": 40, "وپنجاه": 50, - "شصت": 60, "هفتاد": 70, "نود": 90, "سیصد": 300, "چهارصد": 400, - "وشصت": 60, "وهفتاد": 70, "ونود": 90, "وسیصد": 300, "وچهارصد": 400, - "پانصد": 500, "ششصد": 600, "هفتصد": 700, "هشتصد": 800, "نهصد": 900, - "وپانصد": 500, "وششصد": 600, "وهفتصد": 700, "وهشتصد": 800, "ونهصد": 900, - "هشتاد": 80, " ": 0, "میلیارد": 1000000000,"ملیارد": 1000000000, - "یکمیلیارد": 1000000000,"یکملیارد": 1000000000, - "وهشتاد": 80, " ": 0, "ومیلیارد": 1000000000,"وملیارد": 1000000000, - "ویکمیلیارد": 1000000000,"ویکملیارد": 1000000000, - "صدم": 100, "هزارم": 1000, "دویستم": 200, - "وصدم": 100, "وهزارم": 1000, "ودویستم": 200, - "دهم": 10, "نهم": 9, "هشتم": 8, "هفتم": 7, "ششم": 6, "پنجم": 5, - "ودهم": 10, "ونهم": 9, "وهشتم": 8, "وهفتم": 7, "وششم": 6, "وپنجم": 5, - "چهارم": 4, "سوم": 3, "دوم": 2, "یکم": 1, "اول": 1, "یازدهم": 11, "سیزدهم": 13, - "وچهارم": 4, "وسوم": 3, "ودوم": 2, "ویکم": 1, "واول": 1, "ویازدهم": 11, "وسیزدهم": 13, - "چهاردهم": 14, "دوازدهم": 12, "پانزدهم": 15, "شانزدهم": 16, "هفدهم": 17,"هیفدهم": 17, - "وچهاردهم": 14, "ودوازدهم": 12, "وپانزدهم": 15, "وشانزدهم": 16, "وهفدهم": 17,"وهیفدهم": 17, - "هجدهم": 18,"هیجدهم": 18, "نوزدهم": 19, "بیستم": 20, "چهلم": 40, "پنجاهم": 50, - "وهجدهم": 18,"وهیجدهم": 18, "ونوزدهم": 19, "وبیستم": 20, "وچهلم": 40, "وپنجاهم": 50, - "شصتم": 60, "هفتادم": 70, "نودم": 90, "سیصدم": 300, "چهارصدم": 400, - "وشصتم": 60, "وهفتادم": 70, "ونودم": 90, "وسیصدم": 300, "وچهارصدم": 400, - "پانصدم": 500, "ششصدم": 600, "هفتصدم": 700, "هشتصدم": 800, "نهصدم": 900, - "وپانصدم": 500, "وششصدم": 600, "وهفتصدم": 700, "وهشتصدم": 800, "ونهصدم": 900, - "هشتادم": 80,"وهشتادم": 80} - - def find_date_part(self, 
token_list): - import re - for index, element in enumerate(token_list): - # بررسی الگوی تاریخ 1398/12/18 - pattern_date = r'^(\d{4}\s*/\s*([1-9]|0[1-9]|1[0-2])\s*/\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' - date_pattern = re.compile(pattern_date) - match_date = date_pattern.match(element) - if match_date: - date_parts = match_date.string.split('/') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - # 24 /12 /1401 بررسی الگوی تاریخ - pattern_date = r'\s*\d{2}\s*/\s*\d{2}\s*/\s*\d{4}\s*' - date_pattern = re.compile(pattern_date) - match_date = date_pattern.match(element) - if match_date: - date_parts = match_date.string.split('/') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - # بررسی الگوی تاریخ 1402،10،06 - patterndate2 = r'^(\d{4}\s*،\s*([1-9]|0[1-9]|1[0-2])\s*،\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' - date_pattern = re.compile(patterndate2) - match_date2 = date_pattern.match(element) - if match_date2: - date_parts = match_date2.string.split('،') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - # بررسی الگوی تاریخ 13ر04ر1345 - patterndate2 = r'^(\d{4}\s*ر\s*([1-9]|0[1-9]|1[0-2])\s*ر\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' - date_pattern = re.compile(patterndate2) - match_date2 = date_pattern.match(element) - if match_date2: - date_parts = match_date2.string.split('ر') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - if element == "/": - if index-1 >= 0 and index+1 < len(token_list) \ - and token_list[index -1].isdigit() and token_list[index+1].isdigit(): - if index+3 < len(token_list) and token_list[index+2] == "/" \ - and token_list[index + 3].isdigit(): - if int(token_list[index-1]) < 1450 and int(token_list[index+1]) < 13 and \ - int(token_list[index+3]) < 32: - formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] - formal_date = "y" + 
str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2]) - #return formal_date, index-1, index+3, index - return formal_date, index, index, index - elif int(token_list[index-1]) < 32 and int(token_list[index+1]) < 13 and \ - int(token_list[index+3]) < 1450: - formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - #return formal_date, index-1, index+3, index - return formal_date, index, index, index - else: - formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] - formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + "/" + str(formal_date[2]) + ")" - #return formal_date, index-1, index+3, index - return formal_date, index, index, index - elif (int(token_list[index-1]) < 1450 and int(token_list[index-1]) > 1250) and int(token_list[index+1]) < 13: - formal_date = [int(token_list[index-1]), int(token_list[index+ 1]), 0] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - #return formal_date, index-1 , index+1, index - return formal_date, index , index, index - else: - formal_date = [int(token_list[index-1]), int(token_list[index+ 1])] - formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + ")" - #return formal_date, index-1 , index+1, index - return formal_date, index , index, index - - if element in self.month_dict or element == "سال": - if index + 1 < len(token_list) and index - 1 > -2: - try: - start_date_index = end_date_index = 0 - if(token_list[index + 1] == 'ماه'): - if(token_list[index + 2] == 'سال' and str(token_list[index + 3]).isdigit()): - - if(int(token_list[index + 3])==1000): - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 5): - if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): - year += int(token_list[index + 5])# رقم صدگان سال - end_date_index = index + 5 - if(len(token_list)>= index + 7): - if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): - year += int(token_list[index + 7])# رقم دهگان سال - end_date_index = index + 7 - if(len(token_list)>= index + 9): - if(token_list[index + 8]=='و' and token_list[index + 9].isdigit()): - year += int(token_list[index + 9])# رقم یکان سال - end_date_index = index + 9 - - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 - - else: - start_date_index = index - 1 - end_date_index = index + 3 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and int(token_list[index - 1])<10 and int(token_list[index - 3])<31 and token_list[index - 3].isdigit()): - start_date_index = index - 3 - day += int(token_list[index - 3])# رقم دهگان روز - - formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 20 و 7 تیر ماه سال 1368 - - - formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 27 تیر ماه سال 1368 - - elif(str(token_list[index + 2]).isdigit()): - if(int(token_list[index + 2])==1000): - - if token_list[index-1].strip() == 'ام': 
# مثلا 30 ام تیر ماه 1000 و 300 و 80 و 2 - start_date_index = index - 2 - day = int(token_list[index - 2]) - else: - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 4): - if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): - year += int(token_list[index + 4])# رقم صدگان سال - end_date_index = index + 4 - if(len(token_list)>= index + 6): - if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): - year += int(token_list[index + 6])# رقم دهگان سال - end_date_index = index + 6 - if(len(token_list)>= index + 8): - if(token_list[index + 7]=='و' and token_list[index + 8].isdigit()): - year += int(token_list[index + 8])# رقم یکان سال - end_date_index = index + 8 - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 - - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر ماه سال 1368 - start_date_index = index - 1 - end_date_index = index + 2 - #formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر ماه 1368 - - elif(token_list[index + 1] == 'سال' and (not(token_list[index + 2]).isdigit())): - - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر سال 1368 - start_date_index = index - 1 - end_date_index = index + 2 - else: - if(token_list[index + 1] == 'سال' and str(token_list[index + 2]).isdigit()): - - if(int(token_list[index + 2])==1000): - - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 4): - if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): - year += int(token_list[index + 4])# رقم صدگان سال - end_date_index = index + 4 - if(len(token_list)>= index + 6): - if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): - year += int(token_list[index + 6])# # رقم دهگان سال !!! 
برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد - end_date_index = index + 6 - if(len(token_list)>= index + 8): - if((len(token_list)>= index + 7 and token_list[index + 7]=='و') and (len(token_list)>= index + 8 and token_list[index + 8].isdigit())): - year += int(token_list[index + 8])# رقم یکان سال - end_date_index = index + 8 - - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 - - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر سال 1368 - start_date_index = index - 1 - end_date_index = index + 2 - - elif(str(token_list[index + 1]).isdigit()): - if(int(token_list[index + 1])==1000): - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 3): - if(token_list[index + 2]=='و' and token_list[index + 3].isdigit()): - year += int(token_list[index + 3])# رقم صدگان سال - end_date_index = index + 3 - if(len(token_list)>= index + 5): - if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): - year += int(token_list[index + 5])# رقم دهگان سال !!! برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد - end_date_index = index + 5 - if(len(token_list)>= index + 7): - if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): - year += int(token_list[index + 7])# رقم یکان سال - end_date_index = index + 7 - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1320‌ - start_date_index = index - 1 - end_date_index = index + 1 - - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1365 - start_date_index = index - 1 - end_date_index = index + 1 - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - if token_list[index - 1] and token_list[index + 1]: - return formal_date, start_date_index, end_date_index, start_date_index - - except Exception as e: - error = e.args[0] - try: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), 0] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])#مثلا y1358m12d0 - return formal_date, index-1, index, index-1 - except: - try: - # مثلا سال 1400 - if token_list[index] == "سال": - formal_date = [int(token_list[index + 1]),0, 0] - formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2])# y1358m0d0 - return formal_date, index, index+1, index - elif token_list[index+1] == "ماه" and index + 1 < len(token_list): - if index + 2 < len(token_list) and token_list[index + 2].isdigit() and \ - int(token_list[index-2]) < 1450: - formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 
- return formal_date, index, index+2, index - else: - formal_date = [0,int(self.month_dict[token_list[index]]),0] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) #y0m12d0 - return formal_date, index, index+1, index - elif index + 1 < len(token_list) and token_list[index + 1].isdigit() and int(token_list[index-2]) < 1450: - formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+1])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 - return formal_date, index, index+1, index - elif index-2 >= 0 and index+2 < len(token_list) and \ - token_list[index - 1] == "/" and token_list[index+1] == "/" and \ - token_list[index - 2].isdigit() and int(token_list[index-2]) < 32 and \ - token_list[index + 2].isdigit() and int(token_list[index+2]) < 1450: - formal_date = [int(token_list[index-2]),int(self.month_dict[token_list[index]]),int(token_list[index+2])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - return formal_date, index-2, index+2, index-2 - elif index + 2 < len(token_list) and token_list[index + 1] == 'سال' and \ - token_list[index + 2].isdigit() and int(token_list[index-2]) < 1450 : - formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - return formal_date, index, index+1, index - #else: - # print("Name : %s -> after1: %s -> after2: %s -> after3: %s" % (token_list[index], token_list[index+1], token_list[index+2], token_list[index+3])) - except: - pass - - def general_normalize_date(self,normalized_string, additional_token_index_array): - # 24 /12 /1401 جهت بررسی الگوی تاریخ - normalized_string, recognized_dates = DateNormalizer.separated_date_format_finder(normalized_string) - normalized_string_list = normalized_string.strip().split() - # جهت یافتن فرمت های مختلف تاریخ غیر از فرمت بررسی شده در بالا - normalized_string, dates, recognized_dates_2 = \ - self.normalize_dates(normalized_string_list, additional_token_index_array, - token_list_len = len(normalized_string_list), previous_slice_index_array=[0,]) - # تجمیع همه تاریخ ها - for item in recognized_dates_2: - recognized_dates.append(item) - return normalized_string, dates, recognized_dates - - # 24 /12 /1401 متد بررسی الگوی تاریخ - def separated_date_format_finder(normalized_string): - import re - date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}' - regex = re.compile(date_pattern) - match_dates = regex.finditer(normalized_string) - recognized_dates = [] - for date_item in match_dates: - position = date_item.span() - founded_item = date_item.group() - start_index = date_item.start() + 1 - end_index = date_item.end() - 1 - current_date = founded_item.replace(' ', '') - date_parts = current_date.split('/') - formal_date = "y" + str(date_parts[2]) + "m" + str(date_parts[1]) + "d" + str(date_parts[0]) - pattern_tokens_state = gf.token_state_finder(normalized_string, start_index, end_index) - recognized_dates.append( - { - "original_date" : founded_item, - "date" : formal_date, - "start_index" : start_index, - "end_index" : end_index, - "date_token_index" : pattern_tokens_state["start_token_state"], - "start_date_token_index": pattern_tokens_state["start_token_state"], - "end_date_token_index" : pattern_tokens_state["end_token_state"], - }) - normalized_string_list = normalized_string.strip().split() - for date_item in recognized_dates: - 
normalized_string_list[int(date_item['date_token_index'])] = date_item['date'] - normalized_string_list[int(date_item['start_date_token_index'])+1] = 'tttt' - normalized_string_list[int(date_item['end_date_token_index'])] = 'tttt' - normalized_string_temp = '' - for token in normalized_string_list: - normalized_string_temp = normalized_string_temp + ' ' + token - return normalized_string_temp.strip(), recognized_dates - - def normalize_dates(self, token_list, additional_token_index_array, token_list_len, previous_slice_index_array= []): - try: - finded = self.find_date_part(token_list) - except Exception as e: - finded = None - - filename = 'law_section_errors.txt' - exc_type, exc_value, exc_traceback = sys.exc_info() - frame = exc_traceback.tb_frame - function_array = traceback.extract_tb(exc_traceback) - current_function = function_array[len(function_array)-1] - error = f''' - filename : {current_function.filename} - function : {current_function.name} - err line no : {current_function.lineno} - err line : {current_function.line} - err message : {e} - ''' - gf.save_error(error, filename) - - recognized_dates = [] - if finded != None: - date_part = finded[0] - start_date_token_index = finded[1] - end_date_token_index = finded[2] - date_token_index = finded[3] - befor_date_part = " ".join(x for x in token_list[:start_date_token_index]) - after_date_part = [x for x in token_list[end_date_token_index + 1:]] - previous_slice_index = token_list_len - len(after_date_part) - previous_slice_index_array.append(previous_slice_index) - after_normalized, after_dates, recognized_dates = self.normalize_dates(after_date_part, additional_token_index_array, token_list_len, previous_slice_index_array) - if after_dates == '': - after_dates = [] - if recognized_dates == '': - recognized_dates = [] - after_dates.insert(0, date_part) - previous_slice_index = previous_slice_index_array.pop(len(previous_slice_index_array)-2) - recognized_dates.append( - { - "date" : date_part, - "date_token_index" : date_token_index + previous_slice_index, - "start_date_token_index": start_date_token_index + previous_slice_index, - "end_date_token_index" : end_date_token_index + previous_slice_index - }) - - i = 0 - while((date_token_index - start_date_token_index) > i ): - befor_date_part = ''.join([befor_date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t - i += 1 - i = 0 - while((end_date_token_index - date_token_index) > i ): - date_part = ''.join([date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t - i += 1 - - return befor_date_part + " " + date_part + " " + after_normalized, after_dates, recognized_dates - else: - return " ".join(x for x in token_list), '','' - - def list2num(self, numerical_section_list): - value = 1 - l = len(numerical_section_list)>3 - if l : - value = 0 - for index, el in enumerate(numerical_section_list): - if self.is_number(el): - value = self.num_dict[el] - elif el == '000': - #value *= 1000 - pass - elif l == True: - value += float(el) - else: - value *= float(el) - return value - - def convert2num(self, numerical_section_list): - value = 0 - tmp_section_list = [] - for index, el in enumerate(numerical_section_list): - if self.is_number(el) or (el.replace('.', '', 1).isdigit()): - tmp_section_list.append(el) - elif el == "و": - value += self.list2num(tmp_section_list) - tmp_section_list[:] = [] - if len(tmp_section_list) > 0: - value += self.list2num(tmp_section_list) - tmp_section_list[:] = [] - try: - if (value-int(value) == 0): - return int(value) - else: - return value - except: 
- return 0 - - - def is_number(self, word): - return word in self.num_dict - - def find_number_location(self, token_list, addWithVaa = False): - start_index = 0 - number_section =[] - for i , el in enumerate(token_list): - if self.is_number(el) or (el.replace('.', '', 1).isdigit()): - start_index = i - number_section.append(start_index) - break - - i = start_index+1 - while(i < len(token_list)): - if token_list[i] == "و" and (i+1)= 0: - # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند تبصره های مربوط به یک ماده نباشند - is_tabsare_numbers = (token_list[token_index-2] + ' ' + token_list[token_index-1]=='تبصره های') - if token_index - 1 >= 0: - # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند مواد یک قانون نباشند - is_mavad_numbers = (token_list[token_index-1]=='مواد') - if token_index - 1 >= 0: - # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند ماده های یک قانون نباشند - is_madde_numbers = (token_list[token_index-1]=='ماده') - if is_tabsare_numbers or is_mavad_numbers or is_madde_numbers: - flag = False - start_token_index = end_token_index = token_index - number_parts_array.clear() - current_token_index = token_index - number_parts_array.append(token) - if current_token_index + 1 < len(token_list): - while ((flag) and (token_list[current_token_index+1] == 'و' or token_list[current_token_index+1].isdigit())): - number_parts_array.append(token_list[current_token_index+1]) - current_token_index += 1 - end_token_index = current_token_index - if not (current_token_index + 1 < len(token_list)): - break - - final_number = number_completter(number_parts_array) - token_list[token_index] = str(final_number) - recognized_numbers.append({ - 'number_value' : final_number, - 'number_token_list': number_parts_array, - 'start_token_index': start_token_index, - 'end_token_index' : end_token_index - }) - # جایگذاری مقدار رشته بی ارزش به جای توکن های مربوط به عدد جاری - # به منظور اینکه متن اصلی از نظر تعداد توکن تغییر نکند - for i in range(token_index + 1 , len(number_parts_array) + token_index): - token_list[i] = 'nnnn' - converted_string = converted_string + ' ' + token_list[token_index] - if token_index + 1 < len(token_list): - if flag == False and (token_list[token_index+1]=='و'): - flag = False - else: - flag = True - else: - converted_string = converted_string + ' ' + token - - return converted_string, recognized_numbers - - - - - def remove_additional_tokens(self,token_list,additional_token_index_array): - converted_string = '' - for index in additional_token_index_array: - del token_list[index] - - for token in token_list: - converted_string = converted_string + ' ' + token - return converted_string - -class PinglishNormalizer(): - def __init__(self): - self.data_helper = DataHelper() - self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/" - - self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict" - self.en_dict = self.data_helper.load_var(self.en_dict_filename) - - self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict" - self.fa_dict = self.data_helper.load_var(self.fa_dict_filename) - - - def pingilish2persian(self, pinglish_words_list): - - for i, word in enumerate(pinglish_words_list): - if word in self.en_dict: - pinglish_words_list[i] = self.en_dict[word]#.decode("utf-8") - #inp = inp.replace(word, enDict[word], 1) - else: - ch = self.characterize(word) - pr = self.map_char(ch) - amir = self.make_word(pr) - for wd in amir: - am = self.escalation(wd) - asd = ''.join(am) - if asd in self.fa_dict: - pinglish_words_list[i] = 
asd#.decode("utf-8") - #inp = inp.replace(word, asd, 1) - inp = " ".join(x for x in pinglish_words_list) - return inp - - def characterize(self, word): - list_of_char = [] - i = 0 - while i < len(word): - char = word[i] - sw_out = self.switcher(char) - if (sw_out == None): - esp_out = None - if(i < len(word) - 1): - esp_out = self.esp_check(word[i], word[i + 1]) - if(esp_out == None): - list_of_char.append(word[i]) - else: - list_of_char.append(esp_out) - i += 1 - else: - list_of_char.append(sw_out) - i += 1 - return list_of_char - - def switcher(self, ch): - switcher = { - "c": None, - "k": None, - "z": None, - "s": None, - "g": None, - "a": None, - "u": None, - "e": None, - "o": None - } - return switcher.get(ch, ch) - - def esp_check(self, char1, char2): - st = char1 + char2 - if (st == "ch"): - return "ch" - elif (st == "kh"): - return "kh" - elif (st == "zh"): - return "zh" - elif (st == "sh"): - return "sh" - elif (st == "gh"): - return "gh" - elif (st == "aa"): - return "aa" - elif (st == "ee"): - return "ee" - elif (st == "oo"): - return "oo" - elif (st == "ou"): - return "ou" - else: - return None - - def map_char(self, word): - listm = [] - sw_out = self.map_switcher(word[0]) - i = 0 - if (sw_out == None): - listm.append(["ا"]) - i += 1 - if (word[0] == "oo"): - listm.append(["او"]) - i += 1 - while i < len(word): - listm.append(self.char_switcher(word[i])) - i += 1 - if word[len(word) - 1] == "e": - listm.append(["ه"]) - elif word[len(word) - 1] == "a": - listm.append(["ا"]) - elif word[len(word) - 1] == "o": - listm.append(["و"]) - elif word[len(word) - 1] == "u": - listm.append(["و"]) - - return listm - - def map_switcher(self, ch): - switcher = { - "a": None, - "e": None, - "o": None, - "u": None, - "ee": None, - - "ou": None - } - return switcher.get(ch, ch) - - def make_word(self, chp): - word_list = [[]] - for char in chp: - word_list_temp = [] - for tmp_word_list in word_list: - for chch in char: - tmp = copy.deepcopy(tmp_word_list) - tmp.append(chch) - word_list_temp.append(tmp) - word_list = word_list_temp - return word_list - - def escalation(self, word): - tmp = [] - i = 0 - t = len(word) - while i < t - 1: - tmp.append(word[i]) - if word[i] == word[i + 1]: - i += 1 - i += 1 - if i != t: - tmp.append(word[i]) - return tmp - - def char_switcher(self, ch): - switcher = { - 'a': ["", "ا"], - 'c': ["ث", "ص", "ص"], - 'h': ["ه", "ح"], - 'b': ["ب"], - 'p': ["پ"], - 't': ["ت", "ط"], - 's': ["س", "ص", "ث"], - 'j': ["ج"], - 'ch': ["چ"], - 'kh': ["خ"], - 'q': ["ق", "غ"], - 'd': ["د"], - 'z': ["ز", "ذ", "ض", "ظ"], - 'r': ["ر"], - 'zh': ["ژ"], - 'sh': ["ش"], - 'gh': [",ق", "غ"], - 'f': ["ف"], - 'k': ["ک"], - 'g': ["گ"], - 'l': ["ل"], - 'm': ["م"], - 'n': ["ن"], - 'v': ["و"], - 'aa': ["ا"], - 'ee': ["ی"], - 'oo': ["و"], - 'ou': ["و"], - 'i': ["ی"], - 'y': ["ی"], - ' ': [""], - 'w': ["و"], - 'e': ["", "ه"], - 'o': ["", "و"] - } - return switcher.get(ch, "") - -# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند -def number_completter2(number_parts_array): - zarb_value = 0 - temp_number_array = [] - sum_number_parts = 0 - final_number = 0 - for index,item in enumerate(number_parts_array): - if item.isdigit(): - temp_number_array.append(int(item)) - if index + 1 >= len(number_parts_array): - for item3 in temp_number_array: - final_number += int(item3) - return final_number - current_value = number_parts_array[index+1] - if (current_value.isdigit() and (int(current_value) == 1000 or int(current_value) == 1000000 or 
int(current_value) == 1000000000)):# or int(current_value) == 1000000 - zarb_value = int(number_parts_array[index+1]) - for num_item in temp_number_array: - sum_number_parts += num_item - final_number = sum_number_parts * zarb_value - temp_array2 = [] - x = index + 2 - while x < len(number_parts_array): - # for x in range(index+2,len(number_parts_array)): - temp_array2.append(number_parts_array[x]) - if x + 1 < len(number_parts_array): - if number_parts_array[x+1].isdigit() and (int(number_parts_array[x+1]) == 1000 or int(number_parts_array[x+1]) == 1000000 or int(number_parts_array[x+1]) == 1000000000): - if int(number_parts_array[x+1]) > zarb_value: # 1000000>1000 - zarb_value = int(number_parts_array[x+1]) - # تابع بازگشتی برای محاسبه عدد نهایی - final_number += number_completter2(temp_array2) - final_number *= zarb_value - temp_array2.clear() - zarb_value = 0 - x += 2 # به این دلیل که مقدار ضرب را در آرایه تمپ ذخیره نکند - else: # 1000!>1000000 - zarb_value = int(number_parts_array[x+1]) - temp_num = 0 - if (zarb_value == 1000 or zarb_value == 1000000) and len(number_parts_array) > x + 1: - num_array = number_parts_array[x+2:len(number_parts_array)] - temp_num = number_completter(num_array) - if temp_num == None: - temp_num = 0 - - '''for i in range(x+2,len(number_parts_array)): - if(number_parts_array[i].isdigit()): - temp_num += int(number_parts_array[i]) - i+=1''' - temp_num2 = 1 - for num_item2 in temp_array2: - if(num_item2.isdigit()): - temp_num2 += int(num_item2) - - temp_num2 *= zarb_value - final_number += temp_num + temp_num2 - break - else: - x += 1 # - else: # اگر از رنج آرایه خارج شدیم - temp_num = 0 - # مقادیر موجود در آرایه تمپ را جمع کن - for num_item3 in temp_array2: - if(num_item3.isdigit()): - temp_num += int(num_item3) - # حاصل جمع اعداد موجود در آرایه ذخیره را در عدد نهایی که قبلا داشته ایم ضرب کن و از حلقه خارج شو - final_number *= temp_num - #final_number += temp_num - break - return final_number - -# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند -def number_completter(number_parts_array): - zarb_value = 0 - previous_zarb_value = 0 - previous_number_parts_sum = 0 - temp_number_array = [] - number_parts_sum = 0 - final_number = 0 - current_number_part = 0 - for index,item in enumerate(number_parts_array): - if item.isdigit(): - if (not(int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): - temp_number_array.append(item) - continue - elif((int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): - zarb_value = int(item) - for num_item in temp_number_array: - number_parts_sum += int(num_item) - if number_parts_sum == 0 and previous_number_parts_sum == 0: - number_parts_sum = 1 - temp_number_array.clear() - - else:# for example 952 - zarb_value = 1 - for num_item in temp_number_array: - number_parts_sum += int(num_item) - current_number_part = number_parts_sum + previous_number_parts_sum - continue - if previous_zarb_value < zarb_value:# for example 1000 < 1000000000 - current_number_part = previous_number_parts_sum + number_parts_sum - current_number_part = zarb_value * current_number_part - else:# previous_zarb_value > zarb_value - if number_parts_sum == 0: - number_parts_sum = 1 - current_number_part = zarb_value * number_parts_sum - current_number_part += previous_number_parts_sum - - previous_number_parts_sum = current_number_part - current_number_part = 0 - previous_zarb_value = zarb_value - number_parts_sum = 0 - if len(temp_number_array) != 0: - 
remained_parts_sum = 0 - for num_item in temp_number_array: - remained_parts_sum += int(num_item) - final_number = previous_number_parts_sum + remained_parts_sum - return final_number - - final_number = previous_number_parts_sum - return final_number - - - - - - \ No newline at end of file diff --git a/p1_classifier.py b/p1_classifier.py index b3fffe1..68693eb 100644 --- a/p1_classifier.py +++ b/p1_classifier.py @@ -4,20 +4,18 @@ """ from transformers import pipeline from normalizer import cleaning -from elastic_helper import ElasticHelper import transformers import json import datetime import pandas as pd from transformers import AutoTokenizer -print(transformers.__version__) +print(f'transformers version: {transformers.__version__}') # finetuned model for classification path model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680' tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) - -# window_size = tokenizer.model_max_length#512#200 +print(f'Classification Model Loaded: {model_checkpoint}') window_size = 512 """ (یعنی سایز پنجره) به این دلیل که تعداد توکن های ورودی به مدل، محدود به متغیر بالاست @@ -27,8 +25,8 @@ window_size = 512 step_size = 350#100 # تعداد کلاس هایی که بازای هر سکشن از مدل درخواست می کنیم Top_k = 4 - -classifier = pipeline("text-classification", model_checkpoint, framework="pt") +# set device = 0 => to use GPU +classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0) def get_class(sentences, top_k:int=4): # sentences = cleaning(sentences) @@ -177,14 +175,15 @@ def do_classify(sections): for index, id in enumerate(sections): - source = sections[id]['source'] + source = sections[id] classification_result, classification_status, desc = single_section_classification(id, source) if not classification_status: print(f'id: {id} classification error. 
error description: {desc}') # ساماندهی کلاس های پیش بینی شده در عنوان بهترین کلاس و دیگر کلاسها بر اساس امتیاز تخمین مدل و ذخیره در دیکشنری - new_sections_dict[id] = classification_result + # new_sections_dict[id] = classification_result + sections[id]['ai_codes'] = classification_result """ برای حالت تست که می خواهیم عملکرد مدل کلاسیفایر را ارزیابی کنیم، بدین جهت که تنوعی از قوانین مختلف را بررسی کنیم، عنوان قوانین را ذخیره می کنیم تا از تکرار بررسی سکشن های متعدد از یک قانون پرهیز شود""" # qanon_title = source['qanon_title'] @@ -198,8 +197,8 @@ def do_classify(sections): print(f'end: {datetime.datetime.now()}') print('classification finished!') - classified_sections_dict = new_sections_dict + # classified_sections_dict = new_sections_dict - return classified_sections_dict + return sections diff --git a/p2_ner_recognizer.py b/p2_ner_recognizer.py index ec1f85e..aea486e 100644 --- a/p2_ner_recognizer.py +++ b/p2_ner_recognizer.py @@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n tagger = SequenceTagger.load(model) print('model read and tagger initialized') -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}' def prepare_data(ner_obj_list): ner_data_list = [] @@ -208,7 +208,7 @@ def do_ner_recognize(sections): ner_data_list = prepare_data(ner_obj_list) sections[id]['ners_v2'] = ner_data_list print(f'ner process: {index+1}/{len_sections}') - print(f'len_sections ner recognization finished!') + print(f'{len_sections} ner recognization finished!') return sections diff --git a/p3_words_embedder.py b/p3_words_embedder.py index 4a00214..a5389d5 100644 --- a/p3_words_embedder.py +++ b/p3_words_embedder.py @@ -6,7 +6,8 @@ import json import datetime import numpy as np -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +date = datetime.datetime.now() +today = f'{date.year}-{date.month}-{date.day}-{date.hour}' model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25 # model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30 @@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25 model = SentenceTransformer(model_name) def do_word_embedder(sections): - for index, item in enumerate(sections): - embeddings = single_section_embedder(sections[item]['content']) - sections[item]['embeddings'] = embeddings + for index, id in enumerate(sections): + embeddings = single_section_embedder(sections[id]['content']) + sections[id]['embeddings'] = embeddings.tolist() with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file: data = json.dumps(sections, ensure_ascii=False) output_file.write(data) + return sections + def single_section_embedder(sentence): """ این متد، متن ورودی را تبدیل به بردار متناظر آن می کند diff --git a/p4_keyword_extractor.py b/p4_keyword_extractor.py index 2fd7fd1..530d59d 100644 --- a/p4_keyword_extractor.py +++ b/p4_keyword_extractor.py @@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME" model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" #model_id = "meta-llama/Llama-3.1-70B-Instruct" -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +today = 
f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}' bnb_config = BitsAndBytesConfig( load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16 @@ -61,7 +61,7 @@ def generate(formatted_prompt): # Gemini Prompt USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد: • یک لیست فارسی -• شماره ترتیبی در ابتدای هر عبارت +• هیچ عدد یا علامت و نمادی در ابتدا یا انتهای هر عبارت کلیدی قرار نگیرد • هر عبارت در یک خط جداگانه • بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ موارد زیر در استخراج عبارات کلیدی الزامی است: @@ -94,7 +94,7 @@ def generate(formatted_prompt): max_new_tokens=2048, eos_token_id=terminators, do_sample=True, - temperature=0.6, + temperature=0.1, top_p=0.9, ) response = outputs[0][input_ids.shape[-1]:] @@ -168,9 +168,9 @@ def do_keyword_extract(sections): period_ids_text += f"{id} \n" - print(f"section: {counter}-id: {id}") + print(f"section kw extracting: {counter} - id: {id}") # temp_dict.append(item) - if counter % 1000 == 0: + if counter % 5000 == 0: outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8') outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2)) outputfile.close() @@ -196,12 +196,13 @@ def do_keyword_extract(sections): print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ") print("end") - return True + operation_result = True + return operation_result, sections if __name__ == "__main__": print(f'start: {datetime.datetime.now()}') sections = get_sections() - sections = do_keyword_extract(sections) + operation_result = do_keyword_extract(sections) print(f'end: {datetime.datetime.now()}') \ No newline at end of file diff --git a/p5_simplifier.py b/p5_simplifier.py index add1944..f4bf981 100644 --- a/p5_simplifier.py +++ b/p5_simplifier.py @@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer import torch import json -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}' if torch.cuda.is_available(): model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" @@ -56,17 +56,20 @@ def single_section_representation(content): outputs = model.generate( input_ids, - max_new_tokens=500, + max_new_tokens=2048, eos_token_id=terminators, do_sample=True, - temperature=0.7, + temperature=0.1, top_p=0.85, ) response = outputs[0][input_ids.shape[-1]:] sentences = tokenizer.decode(response, skip_special_tokens=True) + # حذف جملات تکراری + sentences = list(set(sentences)) + result = True - desc = 'operation successful' + desc = 'Operation successful' return result, desc, sentences except Exception as error: @@ -77,7 +80,7 @@ def single_section_representation(content): def do_representation(sections): print(f"start time: {datetime.datetime.now()}") - for index, id in sections: + for index, id in enumerate(sections): result, desc, sentences = single_section_representation(sections[id]['content']) if not result: error_content = f'id: {id} - error: {desc}\n' @@ -85,11 +88,16 @@ def do_representation(sections): file.write(error_content) sections[id]['represented_sentences'] = sentences - + print(f'representation process. 
section {index+1}/{len(sections)} - id: {id}')
+
+    with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
+        outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))
     print(f"end time: {datetime.datetime.now()}")
     print(" *** finished! *** ")
+    operation_result = True
+    return operation_result, sections
 
 if __name__ == "__main__":
     pass
\ No newline at end of file
diff --git a/readme/readme-classifier.md b/readme/readme-classifier.md
new file mode 100644
index 0000000..e27f201
--- /dev/null
+++ b/readme/readme-classifier.md
@@ -0,0 +1,71 @@
+# Section Classification Script
+
+This project provides a Python script (`p1_classifier.py`) for classifying text sections using a fine-tuned transformer model. The script is designed to suggest the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.
+
+## Requirements
+
+Before using this script, please install the required libraries:
+
+```bash
+pip install transformers pandas
+```
+
+You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.
+
+## How It Works
+
+- The script loads a fine-tuned transformer model for text classification.
+- It processes each section of text, splitting long texts into overlapping windows when they exceed the model's input size.
+- For each section, it predicts the top classes and saves the results.
+
+## Main Functions
+
+- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
+- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
+- `get_window_classes(text)`: Splits long texts into windows and aggregates their classification results.
+- `single_section_classification(id, section_source)`: Classifies a single section and returns the best and other suggested classes.
+- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
+
+## Usage Example
+
+Suppose you have your sections data as a dictionary:
+
+```python
+sections = {
+    "1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
+    "2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
+}
+```
+
+You can classify all sections as follows:
+
+```python
+from p1_classifier import do_classify
+
+result = do_classify(sections)
+```
+
+After running, the results will be saved in a JSON file in the `./data/classification/` directory.
+
+## Output Structure
+
+Each section will have a new field `ai_codes` with the classification results:
+
+```json
+"1": {
+    "content": "First section text",
+    "ai_codes": {
+        "best-class": {"label": "ClassA", "score": 0.85},
+        "other-classes": [
+            {"label": "ClassB", "score": 0.10},
+            {"label": "ClassC", "score": 0.05}
+        ]
+    }
+}
+```
+
+## Notes
+
+- Make sure the model path in `model_checkpoint` is correct and the model files are available.
+- The script supports Persian and other languages, depending on your model.
+- The output JSON file will be saved in `./data/classification/`.
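+## Example: Windowed Classification (Illustrative)
+
+The sliding-window behaviour described under "How It Works" can be sketched as follows. The values `window_size=512` and `step_size=350` mirror the constants defined in `p1_classifier.py`, but the helper name `classify_long_text`, the use of the pipeline's `top_k`/`truncation` arguments, and the plain per-label score averaging are simplifications for illustration; see `get_window_classes` and `mean_classes` for the script's actual behaviour.
+
+```python
+# Illustrative sketch only: split a long text into overlapping token windows,
+# classify each window, and average the scores per label across windows.
+from collections import defaultdict
+
+def classify_long_text(text, classifier, tokenizer, window_size=512, step_size=350, top_k=4):
+    tokens = tokenizer.tokenize(text)
+    if not tokens:
+        return []
+    # Overlapping windows; window_size should stay within the model's input limit.
+    windows = [tokens[i:i + window_size] for i in range(0, len(tokens), step_size)]
+    scores = defaultdict(list)
+    for window in windows:
+        window_text = tokenizer.convert_tokens_to_string(window)
+        # truncation=True guards against special tokens pushing past the model limit
+        for pred in classifier(window_text, top_k=top_k, truncation=True):
+            scores[pred["label"]].append(pred["score"])
+    averaged = [{"label": label, "score": sum(s) / len(s)} for label, s in scores.items()]
+    return sorted(averaged, key=lambda x: x["score"], reverse=True)[:top_k]
+```
+
+A call like `classify_long_text(section_text, classifier, tokenizer)` would return a ranked list of `{"label", "score"}` dictionaries comparable to the `ai_codes` entries shown above.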
diff --git a/readme/readme-words-embedder.md b/readme/readme-words-embedder.md
new file mode 100644
index 0000000..eb6dc62
--- /dev/null
+++ b/readme/readme-words-embedder.md
@@ -0,0 +1,70 @@
+# Sentence Embedding Generator
+
+This project provides a Python script (`p3_words_embedder.py`) for generating sentence embeddings using the [Sentence Transformers](https://www.sbert.net) library.
+
+## Requirements
+
+Before using this script, please install the required libraries:
+
+```bash
+pip install sentence-transformers numpy
+```
+
+## How It Works
+
+- The script uses the pre-trained model `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
+- There are two main functions:
+  - `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector.
+  - `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
+
+## Usage
+
+### 1. Get Embedding for a Single Sentence
+
+```python
+from p3_words_embedder import single_section_embedder
+
+sentence = "This is a sample sentence."
+embedding = single_section_embedder(sentence)
+print(embedding)
+```
+
+### 2. Generate Embeddings for Multiple Sections and Save to File
+
+Suppose your data is structured like this:
+
+```python
+sections = {
+    "1": {"content": "First section text"},
+    "2": {"content": "Second section text"}
+}
+```
+
+You can generate and save embeddings as follows:
+
+```python
+from p3_words_embedder import do_word_embedder
+
+result = do_word_embedder(sections)
+```
+
+After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.
+
+## Output Structure
+
+The output is a JSON file where each section has its embedding added:
+
+```json
+{
+    "1": {
+        "content": "First section text",
+        "embeddings": [0.123, 0.456, ...]
+    },
+    ...
+}
+```
+
+## Notes
+
+- Make sure the folder `./data/embeddings/` exists before running the script.
+- The script supports Persian and other languages covered by the multilingual model.
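+## Example: Using the Saved Embeddings (Illustrative)
+
+A common downstream use of these vectors is semantic search over sections. The sketch below is illustrative only: it assumes the JSON layout shown above, the file name is hypothetical (real files are date-stamped), and cosine similarity is one reasonable choice rather than something `p3_words_embedder.py` itself provides.
+
+```python
+# Illustrative sketch only: rank sections by cosine similarity to a query.
+import json
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
+# Hypothetical file name; real files look like sections_embeddings_YEAR-MONTH-DAY-HOUR.json.
+with open('./data/embeddings/sections_embeddings_2025-8-16-14.json', encoding='utf-8') as f:
+    sections = json.load(f)
+
+def cosine(a, b):
+    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+query_vec = model.encode("a sample query sentence")
+ranked = sorted(sections.items(),
+                key=lambda kv: cosine(query_vec, kv[1]['embeddings']),
+                reverse=True)
+for section_id, section in ranked[:5]:
+    print(section_id, round(cosine(query_vec, section['embeddings']), 4))
+```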