From faadbda0089eafc11d0fb156008801e899a9a663 Mon Sep 17 00:00:00 2001
From: ajokar
Date: Tue, 17 Sep 2024 16:44:55 +0000
Subject: [PATCH] Upload files to "import_data"

---
 import_data/convert_ner_to_dataset.py |  78 +++
 import_data/data_helper.py            | 152 +++++
 import_data/diff_result_objects.py    | 150 +++++
 import_data/funcs.py                  | 112 ++++
 import_data/general_functions.py      | 810 ++++++++++++++++++++++++++
 5 files changed, 1302 insertions(+)
 create mode 100644 import_data/convert_ner_to_dataset.py
 create mode 100644 import_data/data_helper.py
 create mode 100644 import_data/diff_result_objects.py
 create mode 100644 import_data/funcs.py
 create mode 100644 import_data/general_functions.py

diff --git a/import_data/convert_ner_to_dataset.py b/import_data/convert_ner_to_dataset.py
new file mode 100644
index 0000000..db292fc
--- /dev/null
+++ b/import_data/convert_ner_to_dataset.py
@@ -0,0 +1,78 @@
+"""
+This script takes the text of a law, together with the named-entity objects that the
+model has previously recognized, as its input, and produces a dataset in the
+NER (token-per-line, CoNLL-style) format as its output.
+"""
+
+import numpy as np
+from funcs import save_to_file_by_address, read_from_json
+
+def create_dataset_part(content, ners):
+    tokens = content.split()
+    token_list = [[item, "O"] for item in tokens]
+    # dtype=object keeps the full label strings; a fixed-width string dtype could
+    # silently truncate tags such as "B-Href" to the length of the longest token
+    np_tokens = np.array(token_list, dtype=object)
+
+    for ner in ners:
+        begin = ner["begin"]
+        end = ner["end"]
+
+        # normalize the entity-type labels
+        key = ner["key"]
+        if key == "H_REF":
+            key = "Href"
+        elif key == "REF":
+            key = "Ref"
+        elif key == "ORG":
+            key = "Org"
+        elif key == "DATE":
+            key = "Date"
+        elif key == "DATE2":
+            key = "Date"
+        elif key == "DATE3":
+            key = "Date"
+
+        np_tokens[begin][1] = f"B-{key}"
+        for index in range(begin+1, end):
+            np_tokens[index][1] = f"I-{key}"
+
+    return np_tokens
+
+def create_dataset(all_dataset_parts):
+    final_dataset_text = ""
+    for sentence_tokens in all_dataset_parts:
+        for token in sentence_tokens:
+            final_dataset_text += f"{token[0]} {token[1]}\n"
+        final_dataset_text += "\n"
+
+    return final_dataset_text
+
+sections_110_addresss = "./data/sections_110_ner.json"
+sections_list = read_from_json(sections_110_addresss)
+
+ners = []
+content = ''
+token_list = []
+
+all_sections = []
+for section in sections_list:
+
+    section_id = section["id"]
+    content = section["content"]
+    ners = section["ners_v1"]
+    np_tokens = create_dataset_part(content=content, ners=ners)
+    all_sections.append(np_tokens)
+
+final_dataset = create_dataset(all_sections).strip()
+
+path = "./data/ner_dataset_110.txt"
+
+save_to_file_by_address(path, final_dataset)
+
+print(' operation finished! ')
+
+
+
diff --git a/import_data/data_helper.py b/import_data/data_helper.py
new file mode 100644
index 0000000..fe3a854
--- /dev/null
+++ b/import_data/data_helper.py
@@ -0,0 +1,152 @@
+import pickle
+import re
+import string
+import os
+
+
+class DataHelper():
+    def __init__(self):
+        pass
+
+    def clean_text(self, text_doc, new_line_elimination):
+        punctuations = r')(}{:؟!،؛»«.'
+ r"/<>?.,:;" + punctuations = '[' + punctuations + string.punctuation + ']' + punctuations = punctuations.replace("@", "") + + text_doc.strip() + + # pattern = ur'\s*@[a-zA-Z0-9]*\s*' + # tmp = re.findall(pattern, text_doc) + # newstring = re.sub(pattern, eliminate_pattern, text_doc) + + + #finding the numbers + pattern = r"[-+]?\d*\.\d+|\d+" + nums_list = re.findall(pattern, text_doc) + newstring = re.sub(pattern, 'floatingpointnumber', text_doc) + + + #pattern = '\s*' + punctuations + '+' + '\s*' + #tmp = re.findall(pattern, newstring) + #newstring = re.sub(pattern, self.add_space, newstring) + + # pattern = u'([a-zA-Z0-9]+)(\s*)(' + punctuations + u')(\s*)([a-zA-Z0-9]+)' + # rep = ur'\1\3\5' + # tmp = re.findall(pattern, newstring) + # newstring = re.sub(pattern, rep, newstring) + + pattern = r'[\n]+' + tmp = re.findall(pattern, newstring) + if new_line_elimination: + newstring = re.sub(pattern, " ", newstring) + else: + # newstring = re.sub(pattern, "\n", newstring) + pass + + punctuations = r")(}{:؟!-،؛»«.@$&%" + r"/<>?.,:;" + latinLettersDigits = r"a-zA-Z0-9" + pattern = r'[^' + punctuations + latinLettersDigits + 'آ-ی' + '‌' + '\d\s:]' + tmp = re.findall(pattern, newstring) + newstring = re.sub(pattern, self.eliminate_pattern, newstring) + + pattern = r'[ ]+' + tmp = re.findall(pattern, newstring) + newstring = re.sub(pattern, ' ', newstring) + + for number in nums_list: + pattern = 'floatingpointnumber' + newstring = re.sub(pattern, number, newstring, 1) + + return newstring + + def add_space(self, mystring): + mystring = mystring.group() # this method return the string matched by re + mystring = mystring.strip(' ') # ommiting the whitespace around the pucntuation + mystring = " " + mystring + " " # adding a space after and before punctuation + return mystring + + def replace_newline_with_dot(self, mystring): + return ' . 
' + + def eliminate_pattern(self, mystring): + return "" + + def load_var(self, load_path): + file = open(load_path, 'rb') + variable = pickle.load(file) + file.close() + return variable + + def save_var(self, save_path, variable): + print("saving vars ...") + file = open(save_path, 'wb') + pickle.dump(variable, file) + print("variable saved.") + file.close() + + def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path): + path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep" + lexicon_stem = set() + verb_stem = set() + #verb_tense_map = {} + verb_p2f_map = {} + verb_f2p_map = {} + for fileName in os.listdir(path_dir): + file_path = path_dir + "/" + fileName + with open(file_path, "r") as input: + input_content = input.readlines() + for el in input_content: + el = normalizer.sub_alphabets(el) + el = el.split("\t") + if (len(el) > 2): + if (el[3] == 'V'): + tmp_pos = "V" + else: + tmp_pos = "N" + stem_word = el[2] + stem_word = stem_word.split("#") + stem_word = [x.strip('\u200c') for x in stem_word] + if (tmp_pos == "V" and len(stem_word) == 2): + if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0): + verb_p2f_map[stem_word[0]] = stem_word[1] + verb_f2p_map[stem_word[1]] = stem_word[0] + verb_stem.add(stem_word[0]) + verb_stem.add(stem_word[1]) + if(tmp_pos == 'V' and len(stem_word) == 3): + if(len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) !=0): + #verb_prifix.add(stem_word[0]) + verb_p2f_map[stem_word[1]] = stem_word[2] + verb_f2p_map[stem_word[2]] = stem_word[1] + verb_stem.add(stem_word[1]) + verb_stem.add(stem_word[2]) + for t in stem_word: + if len(t) > 1: + if (tmp_pos == 'N'): + lexicon_stem.add(t) + + with open(verb_tense_path, "r") as bon_file: + bon_file_content = bon_file.readlines() + for el in bon_file_content: + el = el.strip() + el = normalizer.sub_alphabets(el) + el = el.split() + el = [x.strip('\u200c') for x in el] + + verb_p2f_map[el[0]] = el[1] + verb_f2p_map[el[1]] = el[0] + verb_stem.add(el[0]) + verb_stem.add(el[1]) + + irregular_noun = {} + with open(mokasar_noun_path, "r") as input: + input_content = input.readlines() + for el in input_content: + el = normalizer.sub_alphabets(el) + el = el.replace("\t\t", "\t") + el = el.strip().split("\t") + el = [x.strip('\u200c') for x in el] + irregular_noun[el[0]] = el[1] + lexicon_stem.add(el[0]) + + verb_tense_map = [verb_p2f_map, verb_f2p_map] + return lexicon_stem, verb_stem, verb_tense_map, irregular_noun diff --git a/import_data/diff_result_objects.py b/import_data/diff_result_objects.py new file mode 100644 index 0000000..5610cf7 --- /dev/null +++ b/import_data/diff_result_objects.py @@ -0,0 +1,150 @@ +import pandas as pd +from sqlalchemy import create_engine +from sqlalchemy.exc import OperationalError +import re +import requests +import json +from decimal import Decimal + +# توکن و هدر برای ارسال درخواست به API +TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg3MTY3MTMsImp0aSI6Im1aY0MwSEdIV3dxb1ppWVwvb2VqMlwvT2FWc3FTOFIwSTkiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIwMDE2NzEyLCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6NywiZmlyc3RfbmFtZSI6Ilx1MDY0NVx1MDYzNVx1MDYzN1x1MDY0MVx1MDZjYyIsImxhc3RfbmFtZSI6Ilx1MDYyOFx1MDY0N1x1MDYyZlx1MDYyN1x1MDY0Nlx1MDZjYyIsImVtYWlsIjoiamFtdXNiMjc0NzRAZ21haWwuY29tIiwidXNlcm5hbWUiOiJtYmVoZGFuaSIsInVzZXJfbGV2ZWwiOjF9fQ.NhxbjdXMCEb_ninOBKpzbUsaAmxva1zpShuesXrVpEQ' +ACCEPT = "application/json" +HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT} + +# اطلاعات اتصال به پایگاه داده +db_host = 
'DESKTOP-0STSURA\\MUSTAFA' +db_name = 'Qavanin' +db_driver = 'ODBC Driver 17 for SQL Server' +db_trusted_connection = 'yes' + +# ایجاد یک رشته اتصال (connection string) +connection_string = f"mssql+pyodbc://@{db_host}/{db_name}?driver={db_driver}&Trusted_Connection={db_trusted_connection}" + +# ایجاد یک engine با استفاده از SQLAlchemy +engine = create_engine(connection_string) + +# بررسی اتصال به پایگاه داده +try: + with engine.connect() as connection: + print("اتصال به سرور موفقیت‌آمیز بود.") +except OperationalError as e: + print(f"خطا در اتصال به سرور: {e}") + raise + +# اجرای کوئری SQL و خواندن داده‌ها به یک DataFrame +query = """ +SELECT + (select Top(1) gl5.SECTIONTEXT from lwSectionLog gl5 where gl1.[F_LWSECTIONLOGID_EFFECTED]=gl5.ID ) as effect_prev_text, + gl2.[SECTIONTEXT] as effected_text +FROM [Qavanin].[dbo].[lwSectionChange] gl1 +LEFT JOIN lwSection gl2 on gl2.ID=gl1.[F_LWSECTIONID_EFFECTED] +LEFT JOIN lwLaw gl4 on gl1.F_LWLAWID_EFFECTIVE=gl4.ID +WHERE + gl4.ISLAW = 1 + AND gl1.[F_LWSECTIONID_EFFECTED] is not null +ORDER BY gl1.[F_LWSECTIONID_EFFECTED], gl4.APPROVEDATE +""" + +df = pd.read_sql(query, engine) + +# تابع برای حذف کاراکترهای غیرمجاز +def remove_illegal_chars(value): + if isinstance(value, str): + return re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', '', value) + return value + +df = df.applymap(remove_illegal_chars) + +df = df[['effect_prev_text', 'effected_text']] + +num_rows = len(df) +num_chunks = 15 +chunk_size = num_rows // num_chunks + (1 if num_rows % num_chunks != 0 else 0) + +BASE_URL = "https://api.tavasi.ir/repo/dataset/multi/add/qasection/keyword" + +class JSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, Decimal): + return float(obj) + return json.JSONEncoder.default(self, obj) + +def isNeedHtml(html): + if "/)*(?P' + exceptTags + ')(?P[^>]+)*>' + html = re.sub(reg1, '', html) + return html + +def createIndex(content, values): + result_objects = [{ + "task": "diff", + "key": "qasection_ref_state_diff", + "label": "تغییرات در قانون", + "values": [{"text": values, "score": 4}] # مثال امتیاز تشابه + }] + output = { + "content": content, + "domain": "تغییرات قانون", + "ref_id": "", + "ref_url": "", + "task": "diff", + "result_objects": result_objects, + } + return output + +def send_data_to_api(data_list): + bulk_data = [] + for item in data_list: + content = item["effect_prev_text"] + values = item["effected_text"] + + # بررسی خالی نبودن content و values + if not content or not values: + continue + + # بررسی وجود تگ HTML خاص + if isNeedHtml(content) or isNeedHtml(values): + continue + + # حذف تگ‌های HTML از content و values + content = removeHtmlTags(content) + values = removeHtmlTags(values) + + data = createIndex(content, values) + bulk_data.append(data) + + if len(bulk_data) > 10: + payload = json.dumps(bulk_data, cls=JSONEncoder) + response = requests.post(BASE_URL, headers=HEADERS, data=payload) + bulk_data = [] + + if bulk_data: + payload = json.dumps(bulk_data, cls=JSONEncoder) + response = requests.post(BASE_URL, headers=HEADERS, data=payload) + +# for i, line in mainList : +for i in range(num_chunks): + start_row = i * chunk_size + end_row = start_row + chunk_size + chunk_df = df.iloc[start_row:end_row] + + data_list = [] + for index, row in chunk_df.iterrows(): + data_list.append({ + "effect_prev_text": row['effect_prev_text'], + "effected_text": row['effected_text'] + }) + + send_data_to_api(data_list) + +print("تمام داده‌ها با موفقیت ارسال شدند.") diff --git a/import_data/funcs.py b/import_data/funcs.py new file 
mode 100644 index 0000000..b54f891 --- /dev/null +++ b/import_data/funcs.py @@ -0,0 +1,112 @@ +import re +import os +import json +from pandas import read_excel +def remove_signs(): + str = read_file() + # lines = + pattern = r"\(|\)" + str = re.sub(pattern,'', str) + # str = re.sub(')','', str) + # str = re.sub('/','', str) + + return str + +def read_file(): + with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file: + text = '' + try: + text = str(file.read()) + except: + pass + return text + +def read_file_by_address(file_address): + with open(file_address, 'r', encoding='utf-8') as file: + text = '' + try: + text = str(file.read()) + except: + pass + return text + +def save_to_file(result): + with open('./data/DATASET_3.txt', 'a+', encoding='utf-8') as file: + previous_result = '' + try: + previous_result = file.read() + except: + pass + file.write(result) + file.close() + +def save_to_file_by_address(file_address, text): + with open(file_address, 'a+', encoding='utf-8') as file: + previous_result = '' + try: + previous_result = file.read() + except: + pass + file.write(text) + file.close() + + +def read_from_excel(file_address, column_name): + # خواندن فایل اکسل + data = read_excel(file_address) + + # استخراج محتوای ستون مورد نظر + column_data = data[column_name] + return column_data + +def add_columndata_to_excel(file_address, column_name, columndata): + + # خواندن فایل اکسل + data = read_excel(file_address) + + # اضافه کردن ستون جدید به داده‌ها + data[column_name] = columndata + + # ذخیره کردن داده‌ها در فایل اکسل + data.to_excel(file_address, index=False) + +def write_to_json(dict, file_address): + + # تبدیل دیکشنری به فرمت JSON + json_data = json.dumps(dict, indent=2, ensure_ascii=False) + + # ذخیره فایل + with open(file_address, 'w+', encoding='utf-8') as file: + file.write(json_data) + + return True + +def read_from_json(file_address): + data_dict = [] + # خواندن اطلاعات از فایل JSON + with open(file_address, 'r', encoding='utf-8') as file: + loaded_data = json.load(file) + + # نمایش اطلاعات خوانده شده + for item in loaded_data: + data_dict.append(item) + return data_dict + + +def separated_date_format_finder(date_ner): + result = False + date_ner = date_ner.replace('.','/') + date_ner = date_ner.replace('،','/') + date_ner = date_ner.replace('ر','/') + #date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}|\d{1,2}/\d{1,2}/\d{2,4}|\d{2,4} /\d{1,2} /\d{1,2}|\d{2,4}/\d{1,2}/\d{1,2}' + date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b' + regex = re.compile(date_pattern) + match_dates = regex.finditer(date_ner) + for date_item in match_dates: + result = True + break + return result + +if __name__ == "__main__": + + pass \ No newline at end of file diff --git a/import_data/general_functions.py b/import_data/general_functions.py new file mode 100644 index 0000000..821f73b --- /dev/null +++ b/import_data/general_functions.py @@ -0,0 +1,810 @@ +from normalizer import Normalizer +from tokenizer import * +import jalali +import re +from re import sub +import textwrap +from html import escape +from bs4 import BeautifulSoup + +# from lxml import etree +import datetime +#enumerate(token_list): +_normalizer = Normalizer(date_normalizing_needed=True) + +yeAr = r"ﻱ|ې|ێ|ے|ى|ي|ئ" +yeFr= r"ی" +keAr = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك" +keFr = r"ک" +mark1 = r'#\[#' +mark2 = r'#\]#' +hTag1 = r'<' +hTag2 = r'>' +tableTag=["table","tr", "th", "td", "TABLE", 
"TR", "TH", "TD"] +strTable = '' +for tag in tableTag: + if strTable != '': + strTable += '|' + strTable += '('+tag+')' +regTable = r'<(?P\/)*(?P'+strTable+')(?P[^>]+)*>' +regTableReplace = r'#[#\g\g\g#]#' + + +def isNeedHtml(html): + if "]+>' + + if exeptionTag.__len__ : + exceptTags = '' + for tag in exeptionTag: + if exceptTags != '': + exceptTags += '|' + exceptTags += '('+tag+')' + reg1 = r'<(?P/)*(?P'+exceptTags+')(?P[^>]+)*>' + html = sub(reg1, regTableReplace, html) + + + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text("\n", strip=True) + + if exeptionTag.__len__ : + text = sub(mark1, hTag1, text) + text = sub(mark2, hTag2, text) + + return text + + +def removeHtmlNoTableTag(html): + + # خطا داره و هنگ می کنه در test2.py + html = sub(regTable, regTableReplace, html) + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text("\n", strip=True) + + text = sub(mark1, hTag1, text) + text = sub(mark2, hTag2, text) + + return text + +def normalizerData(data): + global _normalizer + normalTitle, dates = _normalizer.normalize(data, return_dates=True) + tdates = [] + for d in dates: + if not d.startswith("num"): + try: + tsd = jdate2timestamp(d) + cdate = d + except: + try: + d = d.replace("y", "") + d = d.replace("m", "/") + d = d.replace("d", "/") + m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d) + if m: + [year, month, day] = [ + int(m.group(1)), + int(m.group(2)), + int(m.group(3)), + ] + if year > 1200 and year < 1550: + if month < 1 or month > 12: + month = 1 + if day < 1 or day > 31: + day = 1 + cdate = str(year) + "/" + str(month) + "/" + str(day) + tsd = jdate2timestamp(cdate) + else: + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + else: + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + except: + # print("Error in:"+ d +" for id: " + id) + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""}) + + return normalTitle,tdates + +def normalizerDate2(inputString): + global _normalizer + normalizedString, dates, recognized_dates, recognized_numbers = _normalizer.normalize(inputString, return_dates=True) + tdates = [] + for date_item in recognized_dates: + date_part = date_item['date'] + date_token_index = date_item['date_token_index'] + start_date_token_index = date_item['start_date_token_index'] + end_date_token_index = date_item['end_date_token_index'] + if not date_part.startswith("num"): + try: + cdate = date_part + tsd = jdate2timestamp(date_part) + except: + try: + date_part = date_part.replace("y", "") + date_part = date_part.replace("m", "/") + date_part = date_part.replace("d", "/") + m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", date_part) + if m: + [year, month, day] = [ + int(m.group(1)), + int(m.group(2)), + int(m.group(3)), + ] + if year > 1200 and year < 1550: + if month < 1 or month > 12: + month = 1 + if day < 1 or day > 31: + day = 1 + cdate = str(year) + "/" + str(month) + "/" + str(day) + tsd = jdate2timestamp(cdate) + else: + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + # else: + # # cdate = "1403/03/03" + # # tsd = jdate2timestamp(cdate) + # continue + except: + # print("Error in:"+ d +" for id: " + id) + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + import tokenizer as t + inputString_token = t.Tokenizer.tokenize_words(None,inputString) + # if start_date_token_index == end_date_token_index: + # end_date_token_index += 1 + # original_date_part = 
inputString_token[start_date_token_index:end_date_token_index] + # else: + original_date_part = inputString_token[start_date_token_index:end_date_token_index + 1] + original_date = '' + for part in original_date_part: + original_date = original_date + ' ' + part + original_date = original_date.strip() + tdates.append({"converted_date": date_item['date'], + "date": cdate , + "original_date" : original_date, + # "timestamp": tsd, + "date_token_index": date_token_index, + "start_date_token_index": start_date_token_index, + "end_date_token_index":end_date_token_index}) + ''' + for d in dates: + if not d.startswith("num"): + try: + tsd = jdate2timestamp(d) + cdate = d + except: + try: + d = d.replace("y", "") + d = d.replace("m", "/") + d = d.replace("d", "/") + m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d) + if m: + [year, month, day] = [ + int(m.group(1)), + int(m.group(2)), + int(m.group(3)), + ] + if year > 1200 and year < 1550: + if month < 1 or month > 12: + month = 1 + if day < 1 or day > 31: + day = 1 + cdate = str(year) + "/" + str(month) + "/" + str(day) + tsd = jdate2timestamp(cdate) + else: + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + else: + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + except: + # print("Error in:"+ d +" for id: " + id) + # cdate = "1403/03/03" + # tsd = jdate2timestamp(cdate) + continue + tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})''' + return normalizedString,tdates,recognized_numbers + +def OtherDateFormatNormalizer(inputString,pattern): + mainTextTemp = inputString + regex_pattern_Mah = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s" # y0m4d4 ماه 1000 و 300 و 50 و 4 + regex_pattern_MahSal = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s" # y0m4d4 ماه سال 1000 و 300 و 50 و 4 + regex_pattern_MahSal2 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s" # y0m4d4 ماه سال y1000m0d0 و 300 و 50 و 4 + regex_pattern_MahSal3 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})" # y0m3d1 ماه سال y1353m0d0 + + if(pattern==1): + regex = re.compile(regex_pattern_Mah) + elif(pattern==2): + regex = re.compile(regex_pattern_MahSal) + elif(pattern==3): + regex = re.compile(regex_pattern_MahSal2) + elif(pattern==4): + regex = re.compile(regex_pattern_MahSal3) + + matches = regex.finditer(inputString) + for match in matches: + foundedPattern = match.group() + foundedPatternTemp = match.group() + if(pattern==1): + foundedPattern = foundedPattern.replace('ماه','') + else: + foundedPattern = foundedPattern.replace('سال','') + foundedPattern = foundedPattern.replace('ماه','') + foundedPattern = foundedPattern.strip() + tempString = foundedPattern + standardDatePattern = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})" + #regex = re.compile(regex_pattern_Mah) + matchItems = re.finditer(standardDatePattern,tempString) + for item in matchItems: + tempPattern = item.group() + tempString = tempString.replace(tempPattern,'') + tempString = tempString.strip() + tempString = tempString.replace('و','') + tempString = tempString.strip() + tempArray = tempString.split() + year = 0 + for item in tempArray: + dateMatch = re.finditer(standardDatePattern,item) + regexFlag = True + for dateItem in dateMatch: + yearStr = dateItem.group()[1:5] + year += int(yearStr) + regexFlag = False + break + if(item.isalnum() and regexFlag): + year += int(item) + 
tempPattern = tempPattern.replace('y0','y'+str(year)) + mainTextTemp = mainTextTemp.replace(foundedPatternTemp,tempPattern+' ') + return mainTextTemp + + #foundedPattern = jdate2timestamp(foundedPattern) + #convertedText = regex.sub(foundedPattern,convertedText) + +def normalizerLongData(data): + dates = [] + if len(data) > 10000: + textParts = textwrap.wrap(data, 10000, break_long_words=False) + for part in textParts: + dates.extend(normalizerData(part)) + else: + dates = normalizerData(data) + return dates + +# ################## +# در ویندوز برای اعداد منفی که تاریخهای قبلی بود را خطا می داد +# rr = gdt.timestamp() +# ################# +def jdate2timestamp_old(dt): + ndt = dt.replace("y", "") + ndt = ndt.replace("m", "/") + ndt = ndt.replace("d", "/") + gd = jalali.Persian(ndt).gregorian_datetime() + # print(gd) + ztime = datetime.time(0, 0, 0, 0) + gdt = datetime.datetime.combine(gd, ztime) + # print(gdt) + rr = gdt.timestamp() + tst = int( round(rr) * 1000) + return tst + +def jdate2timestamp(dt): + ndt = dt.replace("y", "") + ndt = ndt.replace("m", "/") + ndt = ndt.replace("d", "/") + gd = jalali.Persian(ndt).gregorian_datetime() + base = datetime.date(1970, 1, 1) + rr = (gd-base).total_seconds() + tst = int( round(rr) * 1000) + return tst + + + +def getSortTimestamp(ts_date): + empty_date = -15000000000 + ts_ts = empty_date + try: + if ts_date != "": + ts_ts = jdate2timestamp(ts_date) + except: + ts_ts = empty_date + + return ts_ts + +def normalize_content(content): + text = normalYehKe(content) + text = _normalizer.sub_alphabets(text) + # کلماتی که با نیم فاصله از هم جدا شده اند، را به هم می چسباند + # در این صورت، اگر با یک اسپیس جایگزین شود، یک توکن به متن اصلی اضافه می کند + text = sub('\u200c','',text) + pattern = r',|٬|٫|‚|,|؟|ʕ|_|ـ' + text = sub(pattern,'', text) + + if text.__contains__('\u200c'): + print(text) + + return text + +def normalYehKe(text): + if(text == None) : + return '' + + c1 = sub(yeAr, yeFr, text) + c2 = sub(keAr, keFr, c1) + c2 = c2.replace('\u00A0', '') + return c2.strip() + +_term_list = [] +def setTermList(): + global _term_list + if(_term_list.__len__() > 0): + return + _term_list = [ + { + "begin": jdate2timestamp("1285/07/14"), + "end": jdate2timestamp("1287/04/2"), + "term": "مجلس شورای ملی-دوره1", + "term_number": 1, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1288/8/24"), + "end": jdate2timestamp("1290/10/3"), + "term": "مجلس شورای ملی-دوره2", + "term_number": 2, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1293/9/14"), + "end": jdate2timestamp("1294/8/21"), + "term": "مجلس شورای ملی-دوره3", + "term_number": 3, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1300/4/1"), + "end": jdate2timestamp("1302/3/30"), + "term": "مجلس شورای ملی-دوره4", + "term_number": 4, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1302/11/22"), + "end": jdate2timestamp("1304/11/22"), + "term": "مجلس شورای ملی-دوره5", + "term_number": 5, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1305/4/19"), + "end": jdate2timestamp("1307/5/22"), + "term": "مجلس شورای ملی-دوره6", + "term_number": 6, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1307/7/19"), + "end": jdate2timestamp("1309/8/14"), + "term": "مجلس شورای ملی-دوره7", + "term_number": 7, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1309/9/24"), + "end": jdate2timestamp("1311/10/24"), + "term": "مجلس شورای ملی-دوره8", + "term_number": 8, + "majles_name": "شورای ملی", + }, + { + 
"begin": jdate2timestamp("1311/12/24"), + "end": jdate2timestamp("1314/1/24"), + "term": "مجلس شورای ملی-دوره9", + "term_number": 9, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1314/3/15"), + "end": jdate2timestamp("1316/3/22"), + "term": "مجلس شورای ملی-دوره10", + "term_number": 10, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1316/6/20"), + "end": jdate2timestamp("1318/6/27"), + "term": "مجلس شورای ملی-دوره11", + "term_number": 11, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1318/8/3"), + "end": jdate2timestamp("1320/8/9"), + "term": "مجلس شورای ملی-دوره12", + "term_number": 12, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1320/8/22"), + "end": jdate2timestamp("1322/9/1"), + "term": "مجلس شورای ملی-دوره13", + "term_number": 13, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1322/12/16"), + "end": jdate2timestamp("1324/12/21"), + "term": "مجلس شورای ملی-دوره14", + "term_number": 14, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1326/4/25"), + "end": jdate2timestamp("1328/5/6"), + "term": "مجلس شورای ملی-دوره15", + "term_number": 15, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1328/11/20"), + "end": jdate2timestamp("1330/11/29"), + "term": "مجلس شورای ملی-دوره16", + "term_number": 16, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1331/2/7"), + "end": jdate2timestamp("1332/8/28"), + "term": "مجلس شورای ملی-دوره17", + "term_number": 17, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1332/12/27"), + "end": jdate2timestamp("1335/1/26"), + "term": "مجلس شورای ملی-دوره18", + "term_number": 18, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1335/3/10"), + "end": jdate2timestamp("1339/3/29"), + "term": "مجلس شورای ملی-دوره19", + "term_number": 19, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1339/12/2"), + "end": jdate2timestamp("1340/2/19"), + "term": "مجلس شورای ملی-دوره20", + "term_number": 20, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1342/7/14"), + "end": jdate2timestamp("1346/7/13"), + "term": "مجلس شورای ملی-دوره21", + "term_number": 21, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1346/7/14"), + "end": jdate2timestamp("1350/6/9"), + "term": "مجلس شورای ملی-دوره22", + "term_number": 22, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1350/6/9"), + "end": jdate2timestamp("1354/6/16"), + "term": "مجلس شورای ملی-دوره23", + "term_number": 23, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1354/6/17"), + "end": jdate2timestamp("1357/11/20"), + "term": "مجلس شورای ملی-دوره24", + "term_number": 24, + "majles_name": "شورای ملی", + }, + { + "begin": jdate2timestamp("1359/3/7"), + "end": jdate2timestamp("1363/3/6"), + "term": "مجلس شورای اسلامی-دوره1", + "term_number": 1, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1363/3/7"), + "end": jdate2timestamp("1367/3/6"), + "term": "مجلس شورای اسلامی-دوره2", + "term_number": 2, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1367/3/7"), + "end": jdate2timestamp("1371/3/6"), + "term": "مجلس شورای اسلامی-دوره3", + "term_number": 3, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1371/3/7"), + "end": jdate2timestamp("1375/3/11"), + "term": "مجلس شورای اسلامی-دوره4", + "term_number": 4, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1375/3/12"), + "end": 
jdate2timestamp("1379/3/6"), + "term": "مجلس شورای اسلامی-دوره5", + "term_number": 5, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1379/3/7"), + "end": jdate2timestamp("1383/3/6"), + "term": "مجلس شورای اسلامی-دوره6", + "term_number": 6, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1383/3/7"), + "end": jdate2timestamp("1387/3/6"), + "term": "مجلس شورای اسلامی-دوره7", + "term_number": 7, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1387/3/7"), + "end": jdate2timestamp("1391/3/6"), + "term": "مجلس شورای اسلامی-دوره8", + "term_number": 8, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1391/3/7"), + "end": jdate2timestamp("1395/3/7"), + "term": "مجلس شورای اسلامی-دوره9", + "term_number": 9, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1395/3/8"), + "end": jdate2timestamp("1399/3/6"), + "term": "مجلس شورای اسلامی-دوره10", + "term_number": 10, + "majles_name": "شورای اسلامی", + }, + { + "begin": jdate2timestamp("1399/3/7"), + "end": jdate2timestamp("1403/3/6"), + "term": "مجلس شورای اسلامی-دوره11", + "term_number": 11, + "majles_name": "شورای اسلامی", + }, + ] + + +def getTermQanon(ts_date_timestamp, ts_ref): + setTermList() + global _term_list + term = "" + term_number = 0 + majles_name = "" + + if ts_ref == "هيات وزيران (دوره فترت)": + term = ts_ref + if ts_ref == "نخست وزير (مصدق)": + term = ts_ref + if ts_ref == "وزير عدليه (داور)": + term = ts_ref + if ts_ref == "شوراي انقلاب جمهوري اسلامي ايران": + term = ts_ref + + majles_name = term + if term == "": + for i in range(len(_term_list) - 1, -1, -1): + begin = _term_list[i]["begin"] + end = _term_list[i]["end"] + if ts_date_timestamp >= begin and ts_date_timestamp <= end: + term = _term_list[i]["term"] + term_number = _term_list[i]["term_number"] + majles_name = _term_list[i]["majles_name"] + break + + error = "" + if term == "": + # if ts_date_timestamp >= _term_list[0]["begin"] and ts_date_timestamp <= _term_list[len(_term_list)-1]["end"] : + if ts_date_timestamp <= _term_list[len(_term_list) - 1]["end"]: + for i in range(0, len(_term_list) - 1, 1): + end = _term_list[i]["end"] + if ts_date_timestamp <= end: + term = _term_list[i]["term"] + term_number = _term_list[i]["term_number"] + majles_name = _term_list[i]["majles_name"] + error = "تاریخ بین دو دوره" + break + else: + term_number = -1 + error = "تاریخ خارج از محدوده" + + return term, term_number, majles_name, error + +# این متد یک متن و ایندکس آغاز و پایان یک عبارت درون آن متن را دریافت می کند +# و شماره توکن آغازین و توکن پایانی مربوط به عبارت در متن را بر می گرداند +def token_state_finder(normalized_section_content, start_index, end_index): + before_substring = normalized_section_content[0:start_index-1].strip() + pattern_substring = normalized_section_content[start_index-1:end_index+1].strip() + before_substring_token_list = before_substring.strip().split() + pattern_token_list = pattern_substring.strip().split() + start_token_state = len(before_substring_token_list) + end_token_state = len(before_substring_token_list) + (len(pattern_token_list)-1) + pattern_tokens_state ={ + "start_token_state": start_token_state, + "end_token_state" : end_token_state + } + return pattern_tokens_state + +def find_number_indexes_in_string(normalized_string,recognized_numbers): + complete_recognized_numbers = [] + for item in recognized_numbers: + number_start_index, number_end_index = 
find_token_indexes_in_string(normalized_string,item['start_token_index'],item['end_token_index'])
+        content = normalized_string.split()
+        # if item['start_token_index']==item['end_token_index']:
+        #     # check whether dropping this branch and keeping only the statement below is enough for the method to still work correctly
+
+        #     number_token_list = content[item['start_token_index']]
+        # else:
+        number_token_list = content[item['start_token_index']:item['end_token_index']+1]
+        complete_recognized_numbers.append(
+            {
+                'number_value' : item['number_value'],
+                'number_token_list' : number_token_list,
+                'start_token_index' : item['start_token_index'],
+                'end_token_index' : item['end_token_index'],
+                "start_number_state": number_start_index,
+                "end_number_state" : number_end_index
+            }
+        )
+    return complete_recognized_numbers
+
+# This method takes a normalized text plus the start token and end token of a phrase,
+# and returns the start and end character indexes of that phrase inside the text
+def find_token_indexes_in_string(normalized_string,start_token_state,end_token_state):
+    before_tokens = normalized_string.split()[0:start_token_state]
+    content_tokens = normalized_string.split()[start_token_state:end_token_state + 1]
+    content_start_index = 0
+    content_end_index = 0
+    # count the characters of every token that comes before the number
+    for token in before_tokens:
+        content_start_index += len(token)
+    # add the number of blank spaces to the start index of the number
+    content_start_index += len(before_tokens) + 1
+
+    # count the characters of every token that belongs to the number itself
+    for token in content_tokens:
+        content_end_index += len(token)
+    # add the number of blank spaces to the end index of the number
+    content_end_index += (content_start_index - 1) + (len(content_tokens) - 1)
+
+    return content_start_index, content_end_index
+
+# This method takes a text, searches it for the patterns defined below, and returns an array of
+# the phrases matching each pattern, the start and end index of every phrase, the title and
+# content of the pattern, and the start and end token numbers of every phrase that was found
+def regex_patterns_finder(sectoin_content):
+    regex_patterns = {
+        "asle N asasi": r"اصل\s*شماره\s*(\d+)\s*قانون\s*اساسی\s*جمهوری\s*اسلامی\s*ایران", # Article N of the Constitution of the Islamic Republic of Iran
+        "qanone asasi": r"(?
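As a minimal usage sketch of the Jalali-date and Majles-term helpers defined above in general_functions.py (assuming the sibling modules normalizer, tokenizer and jalali that it imports are available on the path; the approval date used here is a hypothetical example):

    from general_functions import jdate2timestamp, getTermQanon

    # Convert a Jalali (Shamsi) date string to epoch milliseconds, then look up
    # which Majles term that date falls into.
    approve_ts = jdate2timestamp("1375/06/01")   # hypothetical approval date
    term, term_number, majles_name, error = getTermQanon(approve_ts, "")

    print(term)          # "مجلس شورای اسلامی-دوره5" for this date, per the term table above
    print(term_number)   # 5
    print(majles_name)   # "شورای اسلامی"
    print(error)         # "" when the date falls inside a known term

When the date falls in the gap between two terms, getTermQanon returns the following term together with the error string "تاریخ بین دو دوره"; a date after the last listed term yields term_number -1 and "تاریخ خارج از محدوده".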