Upload files to "import_data"
This commit is contained in:
parent 6f7b04efdc
commit faadbda008

78  import_data/convert_ner_to_dataset.py  (Normal file)
@@ -0,0 +1,78 @@
"""
This file takes the text of a law section, together with an object of named
entities that the recognition model has already detected, as input; its output
is a dataset in NER (BIO) format.
"""

import numpy as np
from funcs import save_to_file_by_address, read_from_json


def create_dataset_part(content, ners):
    tokens = content.split()
    token_list = [[item, "O"] for item in tokens]
    # Note: np.array of strings has a fixed item size, so a label longer than the
    # widest token would be truncated.
    np_tokens = np.array(token_list)

    for ner in ners:
        begin = ner["begin"]
        end = ner["end"]

        # normalize the entity type labels
        key = ner["key"]
        if key == "H_REF":
            key = "Href"
        elif key == "REF":
            key = "Ref"
        elif key == "ORG":
            key = "Org"
        elif key in ("DATE", "DATE2", "DATE3"):
            key = "Date"

        np_tokens[begin][1] = f"B-{key}"
        for index in range(begin + 1, end):
            np_tokens[index][1] = f"I-{key}"

    return np_tokens


def create_dataset(all_dataset_parts):
    final_dataset_text = ""
    for sentence_tokens in all_dataset_parts:
        for token in sentence_tokens:
            final_dataset_text += f"{token[0]} {token[1]}\n"
        final_dataset_text += "\n"

    return final_dataset_text


sections_110_address = "./data/sections_110_ner.json"
sections_list = read_from_json(sections_110_address)

ners = []
content = ''
token_list = []

all_sections = []
for section in sections_list:
    section_id = section["id"]
    content = section["content"]
    ners = section["ners_v1"]
    np_tokens = create_dataset_part(content=content, ners=ners)
    all_sections.append(np_tokens)

final_dataset = create_dataset(all_sections).strip()

path = "./data/ner_dataset_110.txt"

save_to_file_by_address(path, final_dataset)

print(' operation finished! ')
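
A minimal sketch of the output format, assuming a section shaped like the entries read from sections_110_ner.json (the sample content and entity span are made up, and "end" is treated as exclusive, matching the range(begin+1, end) loop above):

# Hypothetical section in the same shape as the entries of sections_110_ner.json.
section = {
    "id": 1,
    "content": "ماده 1 قانون مدنی مصوب 1307",
    "ners_v1": [{"key": "REF", "begin": 1, "end": 4}],
}
part = create_dataset_part(content=section["content"], ners=section["ners_v1"])
print(create_dataset([part]))
# ماده O
# 1 B-Ref
# قانون I-Ref
# مدنی I-Ref
# مصوب O
# 1307 O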
152  import_data/data_helper.py  (Normal file)
@@ -0,0 +1,152 @@
import pickle
import re
import string
import os


class DataHelper():
    def __init__(self):
        pass

    def clean_text(self, text_doc, new_line_elimination):
        punctuations = r')(}{:؟!،؛»«.' + r"/<>?.,:;"
        punctuations = '[' + punctuations + string.punctuation + ']'
        punctuations = punctuations.replace("@", "")

        text_doc = text_doc.strip()

        # pattern = ur'\s*@[a-zA-Z0-9]*\s*'
        # tmp = re.findall(pattern, text_doc)
        # newstring = re.sub(pattern, eliminate_pattern, text_doc)

        # finding the numbers
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        #pattern = '\s*' + punctuations + '+' + '\s*'
        #tmp = re.findall(pattern, newstring)
        #newstring = re.sub(pattern, self.add_space, newstring)

        # pattern = u'([a-zA-Z0-9]+)(\s*)(' + punctuations + u')(\s*)([a-zA-Z0-9]+)'
        # rep = ur'\1\3\5'
        # tmp = re.findall(pattern, newstring)
        # newstring = re.sub(pattern, rep, newstring)

        pattern = r'[\n]+'
        tmp = re.findall(pattern, newstring)
        if new_line_elimination:
            newstring = re.sub(pattern, " ", newstring)
        else:
            # newstring = re.sub(pattern, "\n", newstring)
            pass

        punctuations = r")(}{:؟!-،؛»«.@$&%" + r"/<>?.,:;"
        latinLettersDigits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latinLettersDigits + 'آ-ی' + '' + r'\d\s:]'
        tmp = re.findall(pattern, newstring)
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        pattern = r'[ ]+'
        tmp = re.findall(pattern, newstring)
        newstring = re.sub(pattern, ' ', newstring)

        for number in nums_list:
            pattern = 'floatingpointnumber'
            newstring = re.sub(pattern, number, newstring, 1)

        return newstring

    def add_space(self, mystring):
        mystring = mystring.group()  # this method returns the string matched by re
        mystring = mystring.strip(' ')  # omitting the whitespace around the punctuation
        mystring = " " + mystring + " "  # adding a space after and before punctuation
        return mystring

    def replace_newline_with_dot(self, mystring):
        return ' . '

    def eliminate_pattern(self, mystring):
        return ""

    def load_var(self, load_path):
        file = open(load_path, 'rb')
        variable = pickle.load(file)
        file.close()
        return variable

    def save_var(self, save_path, variable):
        print("saving vars ...")
        file = open(save_path, 'wb')
        pickle.dump(variable, file)
        print("variable saved.")
        file.close()

    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        #verb_tense_map = {}
        verb_p2f_map = {}
        verb_f2p_map = {}
        for fileName in os.listdir(path_dir):
            file_path = path_dir + "/" + fileName
            with open(file_path, "r") as input:
                input_content = input.readlines()
                for el in input_content:
                    el = normalizer.sub_alphabets(el)
                    el = el.split("\t")
                    if (len(el) > 2):
                        if (el[3] == 'V'):
                            tmp_pos = "V"
                        else:
                            tmp_pos = "N"
                        stem_word = el[2]
                        stem_word = stem_word.split("#")
                        stem_word = [x.strip('\u200c') for x in stem_word]
                        if (tmp_pos == "V" and len(stem_word) == 2):
                            if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0):
                                verb_p2f_map[stem_word[0]] = stem_word[1]
                                verb_f2p_map[stem_word[1]] = stem_word[0]
                                verb_stem.add(stem_word[0])
                                verb_stem.add(stem_word[1])
                        if (tmp_pos == 'V' and len(stem_word) == 3):
                            if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0):
                                #verb_prifix.add(stem_word[0])
                                verb_p2f_map[stem_word[1]] = stem_word[2]
                                verb_f2p_map[stem_word[2]] = stem_word[1]
                                verb_stem.add(stem_word[1])
                                verb_stem.add(stem_word[2])
                        for t in stem_word:
                            if len(t) > 1:
                                if (tmp_pos == 'N'):
                                    lexicon_stem.add(t)

        with open(verb_tense_path, "r") as bon_file:
            bon_file_content = bon_file.readlines()
            for el in bon_file_content:
                el = el.strip()
                el = normalizer.sub_alphabets(el)
                el = el.split()
                el = [x.strip('\u200c') for x in el]

                verb_p2f_map[el[0]] = el[1]
                verb_f2p_map[el[1]] = el[0]
                verb_stem.add(el[0])
                verb_stem.add(el[1])

        irregular_noun = {}
        with open(mokasar_noun_path, "r") as input:
            input_content = input.readlines()
            for el in input_content:
                el = normalizer.sub_alphabets(el)
                el = el.replace("\t\t", "\t")
                el = el.strip().split("\t")
                el = [x.strip('\u200c') for x in el]
                irregular_noun[el[0]] = el[1]
                lexicon_stem.add(el[0])

        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
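
A short usage sketch of the class above (the import path and the sample string are assumptions):

from data_helper import DataHelper

helper = DataHelper()
raw = "متن  نمونه با عدد 3.14 و\n\nخط جدید"
cleaned = helper.clean_text(raw, new_line_elimination=True)
# Numbers are temporarily replaced with the 'floatingpointnumber' placeholder, newlines are
# collapsed to spaces, characters outside the allowed Persian/Latin/punctuation set are
# removed, repeated spaces are squeezed, and the original numbers are substituted back.
print(cleaned)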
150  import_data/diff_result_objects.py  (Normal file)
@@ -0,0 +1,150 @@
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError
import re
import requests
import json
from decimal import Decimal

# Token and headers for the requests sent to the API
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg3MTY3MTMsImp0aSI6Im1aY0MwSEdIV3dxb1ppWVwvb2VqMlwvT2FWc3FTOFIwSTkiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIwMDE2NzEyLCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6NywiZmlyc3RfbmFtZSI6Ilx1MDY0NVx1MDYzNVx1MDYzN1x1MDY0MVx1MDZjYyIsImxhc3RfbmFtZSI6Ilx1MDYyOFx1MDY0N1x1MDYyZlx1MDYyN1x1MDY0Nlx1MDZjYyIsImVtYWlsIjoiamFtdXNiMjc0NzRAZ21haWwuY29tIiwidXNlcm5hbWUiOiJtYmVoZGFuaSIsInVzZXJfbGV2ZWwiOjF9fQ.NhxbjdXMCEb_ninOBKpzbUsaAmxva1zpShuesXrVpEQ'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

# Database connection details
db_host = 'DESKTOP-0STSURA\\MUSTAFA'
db_name = 'Qavanin'
db_driver = 'ODBC Driver 17 for SQL Server'
db_trusted_connection = 'yes'

# Build the connection string
connection_string = f"mssql+pyodbc://@{db_host}/{db_name}?driver={db_driver}&Trusted_Connection={db_trusted_connection}"

# Create an engine using SQLAlchemy
engine = create_engine(connection_string)

# Check the database connection
try:
    with engine.connect() as connection:
        print("اتصال به سرور موفقیتآمیز بود.")  # "Connection to the server succeeded."
except OperationalError as e:
    print(f"خطا در اتصال به سرور: {e}")  # "Error while connecting to the server"
    raise

# Run the SQL query and read the data into a DataFrame
query = """
SELECT
    (select Top(1) gl5.SECTIONTEXT from lwSectionLog gl5 where gl1.[F_LWSECTIONLOGID_EFFECTED]=gl5.ID ) as effect_prev_text,
    gl2.[SECTIONTEXT] as effected_text
FROM [Qavanin].[dbo].[lwSectionChange] gl1
    LEFT JOIN lwSection gl2 on gl2.ID=gl1.[F_LWSECTIONID_EFFECTED]
    LEFT JOIN lwLaw gl4 on gl1.F_LWLAWID_EFFECTIVE=gl4.ID
WHERE
    gl4.ISLAW = 1
    AND gl1.[F_LWSECTIONID_EFFECTED] is not null
ORDER BY gl1.[F_LWSECTIONID_EFFECTED], gl4.APPROVEDATE
"""

df = pd.read_sql(query, engine)

# Helper that strips characters that are not allowed
def remove_illegal_chars(value):
    if isinstance(value, str):
        return re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', '', value)
    return value

df = df.applymap(remove_illegal_chars)

df = df[['effect_prev_text', 'effected_text']]

num_rows = len(df)
num_chunks = 15
chunk_size = num_rows // num_chunks + (1 if num_rows % num_chunks != 0 else 0)

BASE_URL = "https://api.tavasi.ir/repo/dataset/multi/add/qasection/keyword"


class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return json.JSONEncoder.default(self, obj)


def isNeedHtml(html):
    if "<TABLE" in html or "<table" in html or "</TR" in html or "</tr" in html:
        return True
    return False


def removeHtmlTags(html, exeptionTag=[]):
    if len(exeptionTag):
        exceptTags = ''
        for tag in exeptionTag:
            if exceptTags != '':
                exceptTags += '|'
            exceptTags += '(' + tag + ')'
        reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + ')(?P<class>[^>]+)*>'
        html = re.sub(reg1, '', html)
    return html


def createIndex(content, values):
    result_objects = [{
        "task": "diff",
        "key": "qasection_ref_state_diff",
        "label": "تغییرات در قانون",  # "changes in the law"
        "values": [{"text": values, "score": 4}]  # example similarity score
    }]
    output = {
        "content": content,
        "domain": "تغییرات قانون",  # "law changes"
        "ref_id": "",
        "ref_url": "",
        "task": "diff",
        "result_objects": result_objects,
    }
    return output


def send_data_to_api(data_list):
    bulk_data = []
    for item in data_list:
        content = item["effect_prev_text"]
        values = item["effected_text"]

        # skip rows where content or values is empty
        if not content or not values:
            continue

        # skip rows that contain table-related HTML tags
        if isNeedHtml(content) or isNeedHtml(values):
            continue

        # strip HTML tags from content and values
        content = removeHtmlTags(content)
        values = removeHtmlTags(values)

        data = createIndex(content, values)
        bulk_data.append(data)

        if len(bulk_data) > 10:
            payload = json.dumps(bulk_data, cls=JSONEncoder)
            response = requests.post(BASE_URL, headers=HEADERS, data=payload)
            bulk_data = []

    if bulk_data:
        payload = json.dumps(bulk_data, cls=JSONEncoder)
        response = requests.post(BASE_URL, headers=HEADERS, data=payload)


# for i, line in mainList :
for i in range(num_chunks):
    start_row = i * chunk_size
    end_row = start_row + chunk_size
    chunk_df = df.iloc[start_row:end_row]

    data_list = []
    for index, row in chunk_df.iterrows():
        data_list.append({
            "effect_prev_text": row['effect_prev_text'],
            "effected_text": row['effected_text']
        })

    send_data_to_api(data_list)

print("تمام دادهها با موفقیت ارسال شدند.")  # "All data was sent successfully."
112  import_data/funcs.py  (Normal file)
@@ -0,0 +1,112 @@
import re
import os
import json
from pandas import read_excel


def remove_signs():
    text = read_file()
    # lines =
    pattern = r"\(|\)"
    text = re.sub(pattern, '', text)
    # text = re.sub(')', '', text)
    # text = re.sub('/', '', text)

    return text


def read_file():
    with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except:
            pass
        return text


def read_file_by_address(file_address):
    with open(file_address, 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except:
            pass
        return text


def save_to_file(result):
    with open('./data/DATASET_3.txt', 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except:
            pass
        file.write(result)


def save_to_file_by_address(file_address, text):
    with open(file_address, 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except:
            pass
        file.write(text)


def read_from_excel(file_address, column_name):
    # read the Excel file
    data = read_excel(file_address)

    # extract the contents of the requested column
    column_data = data[column_name]
    return column_data


def add_columndata_to_excel(file_address, column_name, columndata):
    # read the Excel file
    data = read_excel(file_address)

    # add the new column to the data
    data[column_name] = columndata

    # save the data back to the Excel file
    data.to_excel(file_address, index=False)


def write_to_json(dict, file_address):
    # convert the dictionary to JSON
    json_data = json.dumps(dict, indent=2, ensure_ascii=False)

    # save the file
    with open(file_address, 'w+', encoding='utf-8') as file:
        file.write(json_data)

    return True


def read_from_json(file_address):
    data_dict = []
    # read the data from the JSON file
    with open(file_address, 'r', encoding='utf-8') as file:
        loaded_data = json.load(file)

    # collect the loaded items
    for item in loaded_data:
        data_dict.append(item)
    return data_dict


def separated_date_format_finder(date_ner):
    result = False
    date_ner = date_ner.replace('.', '/')
    date_ner = date_ner.replace('،', '/')
    date_ner = date_ner.replace('ر', '/')
    #date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}|\d{1,2}/\d{1,2}/\d{2,4}|\d{2,4} /\d{1,2} /\d{1,2}|\d{2,4}/\d{1,2}/\d{1,2}'
    date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
    regex = re.compile(date_pattern)
    match_dates = regex.finditer(date_ner)
    for date_item in match_dates:
        result = True
        break
    return result


if __name__ == "__main__":
    pass
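
A quick sketch of how separated_date_format_finder behaves (the sample strings are made up):

from funcs import separated_date_format_finder

print(separated_date_format_finder("12/9/1375"))   # True: a slash-separated date with a 13xx year
print(separated_date_format_finder("بدون تاریخ"))   # False: nothing date-like in the string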
810  import_data/general_functions.py  (Normal file)
@@ -0,0 +1,810 @@
from normalizer import Normalizer
from tokenizer import *
import jalali
import re
from re import sub
import textwrap
from html import escape
from bs4 import BeautifulSoup

# from lxml import etree
import datetime
#enumerate(token_list):
_normalizer = Normalizer(date_normalizing_needed=True)

# Arabic-presentation variants of Yeh and Kaf, mapped to their Persian forms
yeAr = r"ﻱ|ې|ێ|ے|ى|ي|ئ"
yeFr = r"ی"
keAr = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك"
keFr = r"ک"
# Markers used to protect table tags while BeautifulSoup strips the rest of the HTML
mark1 = r'#\[#'
mark2 = r'#\]#'
hTag1 = r'<'
hTag2 = r'>'
tableTag = ["table", "tr", "th", "td", "TABLE", "TR", "TH", "TD"]
strTable = ''
for tag in tableTag:
    if strTable != '':
        strTable += '|'
    strTable += '(' + tag + ')'
regTable = r'<(?P<slash>\/)*(?P<tag>' + strTable + ')(?P<class>[^>]+)*>'
regTableReplace = r'#[#\g<slash>\g<tag>\g<class>#]#'


def isNeedHtml(html):
    if "<TABLE" in html or "<table" in html or "</TR" in html or "</tr" in html:
        return True
    else:
        return False


def removeHtmlTags(html, exeptionTag=[]):
    #reg1 = r'<[^>]+>'

    # Tags listed in exeptionTag are first rewritten to placeholder markers so that
    # BeautifulSoup does not strip them; the markers are turned back into tags at the end.
    if exeptionTag:
        exceptTags = ''
        for tag in exeptionTag:
            if exceptTags != '':
                exceptTags += '|'
            exceptTags += '(' + tag + ')'
        reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + ')(?P<class>[^>]+)*>'
        html = sub(reg1, regTableReplace, html)

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)

    if exeptionTag:
        text = sub(mark1, hTag1, text)
        text = sub(mark2, hTag2, text)

    return text


def removeHtmlNoTableTag(html):
    # has a bug and hangs in test2.py
    html = sub(regTable, regTableReplace, html)
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)

    text = sub(mark1, hTag1, text)
    text = sub(mark2, hTag2, text)

    return text
def normalizerData(data):
    global _normalizer
    normalTitle, dates = _normalizer.normalize(data, return_dates=True)
    tdates = []
    for d in dates:
        if not d.startswith("num"):
            try:
                tsd = jdate2timestamp(d)
                cdate = d
            except:
                try:
                    d = d.replace("y", "")
                    d = d.replace("m", "/")
                    d = d.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d)
                    if m:
                        [year, month, day] = [
                            int(m.group(1)),
                            int(m.group(2)),
                            int(m.group(3)),
                        ]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    else:
                        # cdate = "1403/03/03"
                        # tsd = jdate2timestamp(cdate)
                        continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})

    return normalTitle, tdates
def normalizerDate2(inputString):
    global _normalizer
    normalizedString, dates, recognized_dates, recognized_numbers = _normalizer.normalize(inputString, return_dates=True)
    tdates = []
    for date_item in recognized_dates:
        date_part = date_item['date']
        date_token_index = date_item['date_token_index']
        start_date_token_index = date_item['start_date_token_index']
        end_date_token_index = date_item['end_date_token_index']
        if not date_part.startswith("num"):
            try:
                cdate = date_part
                tsd = jdate2timestamp(date_part)
            except:
                try:
                    date_part = date_part.replace("y", "")
                    date_part = date_part.replace("m", "/")
                    date_part = date_part.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", date_part)
                    if m:
                        [year, month, day] = [
                            int(m.group(1)),
                            int(m.group(2)),
                            int(m.group(3)),
                        ]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    # else:
                    #     # cdate = "1403/03/03"
                    #     # tsd = jdate2timestamp(cdate)
                    #     continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            import tokenizer as t
            inputString_token = t.Tokenizer.tokenize_words(None, inputString)
            # if start_date_token_index == end_date_token_index:
            #     end_date_token_index += 1
            #     original_date_part = inputString_token[start_date_token_index:end_date_token_index]
            # else:
            original_date_part = inputString_token[start_date_token_index:end_date_token_index + 1]
            original_date = ''
            for part in original_date_part:
                original_date = original_date + ' ' + part
            original_date = original_date.strip()
            tdates.append({"converted_date": date_item['date'],
                           "date": cdate,
                           "original_date": original_date,
                           # "timestamp": tsd,
                           "date_token_index": date_token_index,
                           "start_date_token_index": start_date_token_index,
                           "end_date_token_index": end_date_token_index})
    # An earlier version of this loop (the normalizerData variant that also stored a
    # "timestamp" field) was kept here inside a disabled triple-quoted block; it is
    # identical to the body of normalizerData above.
    return normalizedString, tdates, recognized_numbers
def OtherDateFormatNormalizer(inputString, pattern):
    mainTextTemp = inputString
    regex_pattern_Mah = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه 1000 و 300 و 50 و 4
    regex_pattern_MahSal = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه سال 1000 و 300 و 50 و 4
    regex_pattern_MahSal2 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه سال y1000m0d0 و 300 و 50 و 4
    regex_pattern_MahSal3 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})"  # y0m3d1 ماه سال y1353m0d0

    if (pattern == 1):
        regex = re.compile(regex_pattern_Mah)
    elif (pattern == 2):
        regex = re.compile(regex_pattern_MahSal)
    elif (pattern == 3):
        regex = re.compile(regex_pattern_MahSal2)
    elif (pattern == 4):
        regex = re.compile(regex_pattern_MahSal3)

    matches = regex.finditer(inputString)
    for match in matches:
        foundedPattern = match.group()
        foundedPatternTemp = match.group()
        if (pattern == 1):
            foundedPattern = foundedPattern.replace('ماه', '')
        else:
            foundedPattern = foundedPattern.replace('سال', '')
            foundedPattern = foundedPattern.replace('ماه', '')
        foundedPattern = foundedPattern.strip()
        tempString = foundedPattern
        standardDatePattern = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})"
        #regex = re.compile(regex_pattern_Mah)
        matchItems = re.finditer(standardDatePattern, tempString)
        for item in matchItems:
            tempPattern = item.group()
            tempString = tempString.replace(tempPattern, '')
        tempString = tempString.strip()
        tempString = tempString.replace('و', '')
        tempString = tempString.strip()
        tempArray = tempString.split()
        year = 0
        for item in tempArray:
            dateMatch = re.finditer(standardDatePattern, item)
            regexFlag = True
            for dateItem in dateMatch:
                yearStr = dateItem.group()[1:5]
                year += int(yearStr)
                regexFlag = False
                break
            if (item.isalnum() and regexFlag):
                year += int(item)
        tempPattern = tempPattern.replace('y0', 'y' + str(year))
        mainTextTemp = mainTextTemp.replace(foundedPatternTemp, tempPattern + ' ')
    return mainTextTemp

    #foundedPattern = jdate2timestamp(foundedPattern)
    #convertedText = regex.sub(foundedPattern,convertedText)
def normalizerLongData(data):
    dates = []
    if len(data) > 10000:
        textParts = textwrap.wrap(data, 10000, break_long_words=False)
        for part in textParts:
            dates.extend(normalizerData(part))
    else:
        dates = normalizerData(data)
    return dates


# ##################
# On Windows, gdt.timestamp() raised an error for negative values (dates before the epoch)
# rr = gdt.timestamp()
# #################
def jdate2timestamp_old(dt):
    ndt = dt.replace("y", "")
    ndt = ndt.replace("m", "/")
    ndt = ndt.replace("d", "/")
    gd = jalali.Persian(ndt).gregorian_datetime()
    # print(gd)
    ztime = datetime.time(0, 0, 0, 0)
    gdt = datetime.datetime.combine(gd, ztime)
    # print(gdt)
    rr = gdt.timestamp()
    tst = int(round(rr) * 1000)
    return tst
def jdate2timestamp(dt):
    ndt = dt.replace("y", "")
    ndt = ndt.replace("m", "/")
    ndt = ndt.replace("d", "/")
    gd = jalali.Persian(ndt).gregorian_datetime()
    # Compute the offset from the Unix epoch directly, so dates before 1970 do not
    # raise (see the note above jdate2timestamp_old); the result is in milliseconds.
    base = datetime.date(1970, 1, 1)
    rr = (gd - base).total_seconds()
    tst = int(round(rr) * 1000)
    return tst
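
A quick sanity check of the epoch arithmetic above, assuming the jalali module maps 1348/10/11 to the Gregorian date 1970-01-01 (the usual correspondence):

print(jdate2timestamp("y1348m10d11"))   # expected: 0 (the Unix epoch itself)
print(jdate2timestamp("1348/10/12"))    # expected: 86400000 (one day later, in milliseconds)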

def getSortTimestamp(ts_date):
    empty_date = -15000000000
    ts_ts = empty_date
    try:
        if ts_date != "":
            ts_ts = jdate2timestamp(ts_date)
    except:
        ts_ts = empty_date

    return ts_ts


def normalize_content(content):
    text = normalYehKe(content)
    text = _normalizer.sub_alphabets(text)
    # Words separated by a half-space (ZWNJ) are joined together here; if the ZWNJ were
    # replaced with a normal space instead, an extra token would be added to the text.
    text = sub('\u200c', '', text)
    pattern = r',|٬|٫|‚|,|؟|ʕ|_|ـ'
    text = sub(pattern, '', text)

    if text.__contains__('\u200c'):
        print(text)

    return text


def normalYehKe(text):
    if (text == None):
        return ''

    c1 = sub(yeAr, yeFr, text)
    c2 = sub(keAr, keFr, c1)
    c2 = c2.replace('\u00A0', '')
    return c2.strip()
_term_list = []

def setTermList():
    # Populate the list of Majles terms once; begin/end are epoch-millisecond timestamps.
    global _term_list
    if (_term_list.__len__() > 0):
        return
    _term_list = [
        {"begin": jdate2timestamp("1285/07/14"), "end": jdate2timestamp("1287/04/2"), "term": "مجلس شورای ملی-دوره1", "term_number": 1, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1288/8/24"), "end": jdate2timestamp("1290/10/3"), "term": "مجلس شورای ملی-دوره2", "term_number": 2, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1293/9/14"), "end": jdate2timestamp("1294/8/21"), "term": "مجلس شورای ملی-دوره3", "term_number": 3, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1300/4/1"), "end": jdate2timestamp("1302/3/30"), "term": "مجلس شورای ملی-دوره4", "term_number": 4, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1302/11/22"), "end": jdate2timestamp("1304/11/22"), "term": "مجلس شورای ملی-دوره5", "term_number": 5, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1305/4/19"), "end": jdate2timestamp("1307/5/22"), "term": "مجلس شورای ملی-دوره6", "term_number": 6, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1307/7/19"), "end": jdate2timestamp("1309/8/14"), "term": "مجلس شورای ملی-دوره7", "term_number": 7, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1309/9/24"), "end": jdate2timestamp("1311/10/24"), "term": "مجلس شورای ملی-دوره8", "term_number": 8, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1311/12/24"), "end": jdate2timestamp("1314/1/24"), "term": "مجلس شورای ملی-دوره9", "term_number": 9, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1314/3/15"), "end": jdate2timestamp("1316/3/22"), "term": "مجلس شورای ملی-دوره10", "term_number": 10, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1316/6/20"), "end": jdate2timestamp("1318/6/27"), "term": "مجلس شورای ملی-دوره11", "term_number": 11, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1318/8/3"), "end": jdate2timestamp("1320/8/9"), "term": "مجلس شورای ملی-دوره12", "term_number": 12, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1320/8/22"), "end": jdate2timestamp("1322/9/1"), "term": "مجلس شورای ملی-دوره13", "term_number": 13, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1322/12/16"), "end": jdate2timestamp("1324/12/21"), "term": "مجلس شورای ملی-دوره14", "term_number": 14, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1326/4/25"), "end": jdate2timestamp("1328/5/6"), "term": "مجلس شورای ملی-دوره15", "term_number": 15, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1328/11/20"), "end": jdate2timestamp("1330/11/29"), "term": "مجلس شورای ملی-دوره16", "term_number": 16, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1331/2/7"), "end": jdate2timestamp("1332/8/28"), "term": "مجلس شورای ملی-دوره17", "term_number": 17, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1332/12/27"), "end": jdate2timestamp("1335/1/26"), "term": "مجلس شورای ملی-دوره18", "term_number": 18, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1335/3/10"), "end": jdate2timestamp("1339/3/29"), "term": "مجلس شورای ملی-دوره19", "term_number": 19, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1339/12/2"), "end": jdate2timestamp("1340/2/19"), "term": "مجلس شورای ملی-دوره20", "term_number": 20, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1342/7/14"), "end": jdate2timestamp("1346/7/13"), "term": "مجلس شورای ملی-دوره21", "term_number": 21, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1346/7/14"), "end": jdate2timestamp("1350/6/9"), "term": "مجلس شورای ملی-دوره22", "term_number": 22, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1350/6/9"), "end": jdate2timestamp("1354/6/16"), "term": "مجلس شورای ملی-دوره23", "term_number": 23, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1354/6/17"), "end": jdate2timestamp("1357/11/20"), "term": "مجلس شورای ملی-دوره24", "term_number": 24, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1359/3/7"), "end": jdate2timestamp("1363/3/6"), "term": "مجلس شورای اسلامی-دوره1", "term_number": 1, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1363/3/7"), "end": jdate2timestamp("1367/3/6"), "term": "مجلس شورای اسلامی-دوره2", "term_number": 2, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1367/3/7"), "end": jdate2timestamp("1371/3/6"), "term": "مجلس شورای اسلامی-دوره3", "term_number": 3, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1371/3/7"), "end": jdate2timestamp("1375/3/11"), "term": "مجلس شورای اسلامی-دوره4", "term_number": 4, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1375/3/12"), "end": jdate2timestamp("1379/3/6"), "term": "مجلس شورای اسلامی-دوره5", "term_number": 5, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1379/3/7"), "end": jdate2timestamp("1383/3/6"), "term": "مجلس شورای اسلامی-دوره6", "term_number": 6, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1383/3/7"), "end": jdate2timestamp("1387/3/6"), "term": "مجلس شورای اسلامی-دوره7", "term_number": 7, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1387/3/7"), "end": jdate2timestamp("1391/3/6"), "term": "مجلس شورای اسلامی-دوره8", "term_number": 8, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1391/3/7"), "end": jdate2timestamp("1395/3/7"), "term": "مجلس شورای اسلامی-دوره9", "term_number": 9, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1395/3/8"), "end": jdate2timestamp("1399/3/6"), "term": "مجلس شورای اسلامی-دوره10", "term_number": 10, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1399/3/7"), "end": jdate2timestamp("1403/3/6"), "term": "مجلس شورای اسلامی-دوره11", "term_number": 11, "majles_name": "شورای اسلامی"},
    ]

def getTermQanon(ts_date_timestamp, ts_ref):
    setTermList()
    global _term_list
    term = ""
    term_number = 0
    majles_name = ""

    # Approving bodies other than the two Majles (names kept as they appear in the source data)
    if ts_ref == "هيات وزيران (دوره فترت)":  # Council of Ministers (interregnum)
        term = ts_ref
    if ts_ref == "نخست وزير (مصدق)":  # Prime Minister (Mosaddegh)
        term = ts_ref
    if ts_ref == "وزير عدليه (داور)":  # Minister of Justice (Davar)
        term = ts_ref
    if ts_ref == "شوراي انقلاب جمهوري اسلامي ايران":  # Revolutionary Council of the Islamic Republic of Iran
        term = ts_ref

    majles_name = term
    if term == "":
        for i in range(len(_term_list) - 1, -1, -1):
            begin = _term_list[i]["begin"]
            end = _term_list[i]["end"]
            if ts_date_timestamp >= begin and ts_date_timestamp <= end:
                term = _term_list[i]["term"]
                term_number = _term_list[i]["term_number"]
                majles_name = _term_list[i]["majles_name"]
                break

    error = ""
    if term == "":
        # if ts_date_timestamp >= _term_list[0]["begin"] and ts_date_timestamp <= _term_list[len(_term_list)-1]["end"] :
        if ts_date_timestamp <= _term_list[len(_term_list) - 1]["end"]:
            for i in range(0, len(_term_list) - 1, 1):
                end = _term_list[i]["end"]
                if ts_date_timestamp <= end:
                    term = _term_list[i]["term"]
                    term_number = _term_list[i]["term_number"]
                    majles_name = _term_list[i]["majles_name"]
                    error = "تاریخ بین دو دوره"  # "the date falls between two terms"
                    break
        else:
            term_number = -1
            error = "تاریخ خارج از محدوده"  # "the date is out of range"

    return term, term_number, majles_name, error

# This method takes a text plus the start and end character index of a phrase inside
# that text, and returns the number of the first and last token of that phrase.
def token_state_finder(normalized_section_content, start_index, end_index):
    before_substring = normalized_section_content[0:start_index-1].strip()
    pattern_substring = normalized_section_content[start_index-1:end_index+1].strip()
    before_substring_token_list = before_substring.strip().split()
    pattern_token_list = pattern_substring.strip().split()
    start_token_state = len(before_substring_token_list)
    end_token_state = len(before_substring_token_list) + (len(pattern_token_list)-1)
    pattern_tokens_state = {
        "start_token_state": start_token_state,
        "end_token_state": end_token_state
    }
    return pattern_tokens_state
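
A worked example of the token bookkeeping above, using a plain-ASCII string so the character positions are easy to follow (the indexes are 1-based, as produced by regex_patterns_finder below via match.start() + 1 and match.end() - 1):

text = "aaa bbb ccc ddd"
state = token_state_finder(text, 5, 10)   # indexes as regex_patterns_finder would pass for the match "bbb ccc"
# text[0:4] == "aaa " -> one token before the phrase, so the phrase covers tokens 1..2.
print(state)   # {'start_token_state': 1, 'end_token_state': 2}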
def find_number_indexes_in_string(normalized_string, recognized_numbers):
    complete_recognized_numbers = []
    for item in recognized_numbers:
        number_start_index, number_end_index = find_token_indexes_in_string(normalized_string, item['start_token_index'], item['end_token_index'])
        content = normalized_string.split()
        # if item['start_token_index'] == item['end_token_index']:
        #     # check whether dropping this branch and keeping only the slice below is enough
        #     number_token_list = content[item['start_token_index']]
        # else:
        number_token_list = content[item['start_token_index']:item['end_token_index']+1]
        complete_recognized_numbers.append(
            {
                'number_value': item['number_value'],
                'number_token_list': number_token_list,
                'start_token_index': item['start_token_index'],
                'end_token_index': item['end_token_index'],
                "start_number_state": number_start_index,
                "end_number_state": number_end_index
            }
        )
    return complete_recognized_numbers

# This method takes the original text plus the start token and end token of a phrase,
# and returns the start and end character index of that phrase in the text.
def find_token_indexes_in_string(normalized_string, start_token_state, end_token_state):
    before_tokens = normalized_string.split()[0:start_token_state]
    content_tokens = normalized_string.split()[start_token_state:end_token_state + 1]
    content_start_index = 0
    content_end_index = 0
    # count the characters of every token that comes before the number
    for token in before_tokens:
        content_start_index += len(token)
    # add the separating spaces to the start index of the number
    content_start_index += len(before_tokens) + 1

    # count the characters of every token that belongs to the number
    for token in content_tokens:
        content_end_index += len(token)
    # add the separating spaces to the end index of the number
    content_end_index += (content_start_index - 1) + (len(content_tokens) - 1)

    return content_start_index, content_end_index
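
And the inverse direction, with the same sample text as before: tokens 1..2 of "aaa bbb ccc ddd" are "bbb ccc", and the character counting above yields 1-based character positions:

start, end = find_token_indexes_in_string("aaa bbb ccc ddd", 1, 2)
print(start, end)   # 5 11 -> "bbb ccc" occupies 1-based character positions 5 through 11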

# This method takes a text, searches it for the patterns defined below, and returns an
# array with, for every match: the matched phrase, its start and end character index,
# the key and value of the pattern, and the start and end token number of the phrase.
def regex_patterns_finder(sectoin_content):
    regex_patterns = {
        # Note: dict keys must be unique; the two patterns that originally shared the keys
        # "qanone asasi" and "tabsare N" have been given distinct names ("qanone asasi jomhori",
        # "tabsare N aval") so that neither pattern is silently dropped.
        "asle N asasi": r"اصل\s*شماره\s*(\d+)\s*قانون\s*اساسی\s*جمهوری\s*اسلامی\s*ایران",  # article no. N of the Constitution of the Islamic Republic of Iran
        "qanone asasi jomhori": r"(?<!^)\b\sقانون\sاساسی\sجمهوری\sاسلامی\sایران",  # "Constitution of the Islamic Republic of Iran", not at the start of the paragraph
        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی",  # "the Constitution", not at the start of the paragraph
        "qanon * mosavvab tarikh": r"\bقانون[\s\w/]*مصوب\s((y\d{2,4}m\d{1,2}d\d{1,2})|\d{2,4})",  # "the law ... approved on <date>"
        "in qanon": r"این\sقانون",  # "this law"
        "qanone foq": r"قانون\sفوق",  # "the above law"
        "eslahe qanon": r"قانون\sاصلاح",  # "the law amending ..."
        "tabsare foq": r"تبصره\sفوق",  # "the above note (tabsareh)"
        "made foq": r"ماده\sفوق",  # "the above article (madeh)"
        "made vahede": r"ماده\sواحده",  # "single article"
        "made vahed": r"ماده\sواحد",  # "single article" (variant spelling)
        "tabsare N aval": r"^\bتبصره\s*شماره\s*(\d+)\s*",  # note no. N, only at the start of the paragraph
        "f tabsare N": r"(?<!^)\bتبصره\sشماره\s(\d+)\s",  # note no. N, anywhere except the start of the paragraph
        "tabsare N": r"(?<!^)\bتبصره ?\(? ?\d+? ?\)?[ :.]",  # note (N), anywhere except the start of the paragraph
        "made N": r"(?<!^)\bماده ?\(? ?\d+? ?\)?[ :.]",  # *** article (N), anywhere in the paragraph except its start
        "f made N": r"^\bماده\s*[(]?\s*(\d+)\s*[)]?\s*"  # article N, only at the start of the paragraph, with or without parentheses around the number
    }

    matched_array = []
    for pattern_key, pattern_value in regex_patterns.items():
        regex = re.compile(pattern_value)
        matches = regex.finditer(sectoin_content)

        for match in matches:
            # handle each matched pattern here
            founded_item = match.group()
            start_index = match.start() + 1
            end_index = match.end() - 1
            pattern_tokens_state = token_state_finder(sectoin_content, start_index, end_index)
            matched_array.append(
                {
                    "founded_item": founded_item,
                    "start_index": start_index,
                    "end_index": end_index,
                    "pattern_key": pattern_key,
                    "pattern_value": pattern_value,
                    "start_token_state": pattern_tokens_state["start_token_state"],
                    "end_token_state": pattern_tokens_state["end_token_state"],
                }
            )
        # convertedText = regex.sub(' wwwwwwwww ',convertedText)
    # sort the array by the start token of each matched phrase
    matched_array.sort(key=lambda x: int(x['start_token_state']), reverse=False)
    return matched_array
def change_refrece_tokens(normalized_section_content, recognized_patterns_array):
    token_list = normalized_section_content.strip().split()
    for ref_item in recognized_patterns_array:
        start_token_state = ref_item.get('start_token_state')
        end_token_state = ref_item.get('end_token_state')
        # overwrite every token of the recognized reference with a placeholder token
        for i in range(start_token_state, end_token_state+1):
            token_list[i] = 'eeee'
    normalized_section_content = ''
    for token in token_list:
        normalized_section_content = ''.join([normalized_section_content, (' ' + token)])
    return normalized_section_content.strip()

def getMetaData(text):
    normalized_section_content, recognized_dates, recognized_numbers = normalizerDate2(text.strip())
    recognized_numbers = find_number_indexes_in_string(text, recognized_numbers)
    normalized_section_content = normalized_section_content.strip()
    recognized_patterns_array = regex_patterns_finder(normalized_section_content)
    normalized_section_content = change_refrece_tokens(normalized_section_content, recognized_patterns_array)
    nlp_parser = []
    date_list = recognized_dates
    ref_list = recognized_patterns_array
    for date_item in date_list:
        nlp_parser.append({
            "properties": {
                "type": "date",
                "index_start": int(date_item['start_date_token_index']),
                "index_end": int(date_item['end_date_token_index']),
                "text": date_item['original_date'],
                "result": date_item['converted_date'],
                #"timestamp": date_item['timestamp'],
                "ref_link": ''
            }
        })
    for ref_item in ref_list:
        nlp_parser.append({
            "properties": {
                "type": "reference",
                "index_start": int(ref_item['start_token_state']),
                "index_end": int(ref_item['end_token_state']),
                "text": ref_item['founded_item'],
                "result": ref_item['pattern_value'],
                "ref_link": ''
            }
        })
    return nlp_parser, normalized_section_content

def save_error(error_text, filename):
    with open(filename, 'a+', encoding='utf-8') as file:
        # write the error to the file
        file.write(error_text + '\n' + 50*'*' + '\n')
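
Finally, a minimal sketch of how this module is typically driven, assuming the sibling normalizer, tokenizer and jalali modules it imports are available on the path (the sample sentence is made up):

from general_functions import getMetaData

text = "ماده 5 قانون فوق مصوب 1375/3/12 اصلاح می شود"
nlp_parser, normalized = getMetaData(text)
for item in nlp_parser:
    props = item["properties"]
    # each entry is either a recognized date or a recognized reference pattern,
    # with its start/end token index in the normalized text
    print(props["type"], props["index_start"], props["index_end"], props["text"])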