first push

commit e73258bff3

3  .gitignore  vendored  Normal file
@@ -0,0 +1,3 @@
*.log
*.pyc
*.json

94  conflict.py  Normal file
@@ -0,0 +1,94 @@
import json
from tqdm import tqdm
import time

import torch
import os
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ['HF_HOME'] = "/home/admin/HFHOME"

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "meta-llama/Llama-3.1-70B-Instruct"

# use quantization to lower GPU usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit (note: the bnb_8bit_* keyword arguments are not standard BitsAndBytesConfig options
# and are most likely ignored; load_in_8bit=True is what takes effect):
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id  # tokenizer.pad_token_id

SYS_PROMPT = """
You receive two Persian legal rule texts and analyze them carefully, explain your answer step by step, to see whether these two rules logically conflict with each other.
Finally, state the final conclusion for the presence or absence of conflict with the words "yes" or "no".
"""  # Explain your answer step by step.


def format_prompt(SENTENCE1, SENTENCE2):
    PROMPT = f"Rule 1: {SENTENCE1}. Rule 2: {SENTENCE2}."
    return PROMPT


def generate(formatted_prompt):
    formatted_prompt = formatted_prompt[:50000]  # to avoid GPU OOM
    messages = [{"role": "system", "content": SYS_PROMPT}, {"role": "user", "content": formatted_prompt}]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def conflict(sentence1, sentence2):
    formatted_prompt = format_prompt(sentence1, sentence2)
    return generate(formatted_prompt)


if __name__ == "__main__":
    print('start')
    start_time = time.time()
    # inputfile = open('./main_rules_lama70B_dataset_02.json', "r", encoding='utf-8')
    result = conflict("حسین در حال حاضر در وزارت نفت و انرژی به استخدام دولت در آمده است.",
                      "حسین الان در دانشگاه دولتی مشغول به تحصیل است و هرکسی که در حال تحصیل باشد، از نظر قانون نمی تواند در دولت استخدام شود")
    end_time = time.time()

    print("*********************************************************")
    print("*********************************************************")
    print()
    print(result)
    print()
    print("*********************************************************")
    print("*********************************************************")

    print(f"elapsed time: {end_time-start_time}")
    print("end")

153  data_helper.py  Normal file
@@ -0,0 +1,153 @@
import pickle
import re
import string
import os


class DataHelper():
    def __init__(self):
        pass

    def clean_text(self, text_doc, new_line_elimination):
        punctuations = r')(}{:؟!،؛»«.' + r"/<>?.,:;"
        punctuations = '[' + punctuations + string.punctuation + ']'
        punctuations = punctuations.replace("@", "")

        text_doc.strip()  # note: str.strip() returns a new string, so this call has no effect as written

        # pattern = ur'\s*@[a-zA-Z0-9]*\s*'
        # tmp = re.findall(pattern, text_doc)
        # newstring = re.sub(pattern, eliminate_pattern, text_doc)

        # finding the numbers
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        # pattern = '\s*' + punctuations + '+' + '\s*'
        # tmp = re.findall(pattern, newstring)
        # newstring = re.sub(pattern, self.add_space, newstring)

        # pattern = u'([a-zA-Z0-9]+)(\s*)(' + punctuations + u')(\s*)([a-zA-Z0-9]+)'
        # rep = ur'\1\3\5'
        # tmp = re.findall(pattern, newstring)
        # newstring = re.sub(pattern, rep, newstring)

        pattern = r'[\n]+'
        tmp = re.findall(pattern, newstring)
        if new_line_elimination:
            newstring = re.sub(pattern, " ", newstring)
        else:
            # newstring = re.sub(pattern, "\n", newstring)
            pass

        punctuations = r")(}{:؟!-،؛»«.@$&%" + r"/<>?.,:;"
        latinLettersDigits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latinLettersDigits + 'آ-ی' + '' + '\d\s:]'
        tmp = re.findall(pattern, newstring)
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        pattern = r'[ ]+'
        tmp = re.findall(pattern, newstring)
        newstring = re.sub(pattern, ' ', newstring)

        for number in nums_list:
            pattern = 'floatingpointnumber'
            newstring = re.sub(pattern, number, newstring, 1)

        return newstring

    def add_space(self, mystring):
        mystring = mystring.group()  # this method returns the string matched by re
        mystring = mystring.strip(' ')  # omitting the whitespace around the punctuation
        mystring = " " + mystring + " "  # adding a space before and after the punctuation
        return mystring

    def replace_newline_with_dot(self, mystring):
        return ' . '

    def eliminate_pattern(self, mystring):
        return ""

    def load_var(self, load_path):
        file = open(load_path, 'rb')
        variable = pickle.load(file)
        file.close()
        return variable

    def save_var(self, save_path, variable):
        print("saving vars ...")
        file = open(save_path, 'wb')
        pickle.dump(variable, file)
        print("variable saved.")
        file.close()

    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        # verb_tense_map = {}
        verb_p2f_map = {}
        verb_f2p_map = {}
        for fileName in os.listdir(path_dir):
            file_path = path_dir + "/" + fileName
            with open(file_path, "r") as input:
                input_content = input.readlines()
                for el in input_content:
                    el = normalizer.sub_alphabets(el)
                    el = el.split("\t")
                    if (len(el) > 2):
                        if (el[3] == 'V'):
                            tmp_pos = "V"
                        else:
                            tmp_pos = "N"
                        stem_word = el[2]
                        stem_word = stem_word.split("#")
                        stem_word = [x.strip('\u200c') for x in stem_word]
                        if (tmp_pos == "V" and len(stem_word) == 2):
                            if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0):
                                verb_p2f_map[stem_word[0]] = stem_word[1]
                                verb_f2p_map[stem_word[1]] = stem_word[0]
                                verb_stem.add(stem_word[0])
                                verb_stem.add(stem_word[1])
                        if (tmp_pos == 'V' and len(stem_word) == 3):
                            if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0):
                                # verb_prifix.add(stem_word[0])
                                verb_p2f_map[stem_word[1]] = stem_word[2]
                                verb_f2p_map[stem_word[2]] = stem_word[1]
                                verb_stem.add(stem_word[1])
                                verb_stem.add(stem_word[2])
                        for t in stem_word:
                            if len(t) > 1:
                                if (tmp_pos == 'N'):
                                    lexicon_stem.add(t)

        with open(verb_tense_path, "r") as bon_file:
            bon_file_content = bon_file.readlines()
            for el in bon_file_content:
                el = el.strip()
                el = normalizer.sub_alphabets(el)
                el = el.split()
                el = [x.strip('\u200c') for x in el]

                verb_p2f_map[el[0]] = el[1]
                verb_f2p_map[el[1]] = el[0]
                verb_stem.add(el[0])
                verb_stem.add(el[1])

        irregular_noun = {}
        with open(mokasar_noun_path, "r") as input:
            input_content = input.readlines()
            for el in input_content:
                el = normalizer.sub_alphabets(el)
                el = el.replace("\t\t", "\t")
                el = el.strip().split("\t")
                el = [x.strip('\u200c') for x in el]
                irregular_noun[el[0]] = el[1]
                lexicon_stem.add(el[0])

        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
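
A minimal usage sketch for the cleaner above (not part of the commit; the sample string and flag value are hypothetical):

    from data_helper import DataHelper

    helper = DataHelper()
    # collapse newlines into spaces, strip unexpected characters, and restore the original numbers
    cleaned = helper.clean_text("ماده 12 -  متن   نمونه\n\nبا 3.5 درصد", new_line_elimination=True)
    print(cleaned)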

200  do_extractor.py  Normal file
@@ -0,0 +1,200 @@
"""
This file works with the Parsivar normalizer.
"""
from datetime import datetime
# from elasticsearch import Elasticsearch
from threading import Thread
import torch
import time
import json
import os.path
import os
os.environ['HF_HOME'] = "/home/admin/HFHOME"
# from general_functions import normalize_content
from funcs import write_to_json, read_from_json

from normalizer import Normalizer
from tokenizer import *
_normalizer = Normalizer(date_normalizing_needed=True)
address = os.getcwd()
# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File

# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
not_have_two_token_kw_list = []

import json
from tqdm import tqdm
import time

from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
os.environ['HF_HOME'] = "/home/admin/HFHOME"

# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"

model_id = "meta-llama/Llama-3.1-70B-Instruct"

# use quantization to lower GPU usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)
print("Model Loading START:")
print(str(datetime.now()))
print()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id  # tokenizer.pad_token_id
print("Model Loading END:")
print(str(datetime.now()))
print()

sections_list = read_from_json('./data/sections_110.json')  # Main File
# if torch.cuda.is_available():
#     # model_id = "PartAI/Dorna-Llama3-8B-Instruct"
#     # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_id = "meta-llama/Llama-3.1-70B-Instruct"
#     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
#     tokenizer = AutoTokenizer.from_pretrained(model_id)

# index_name_i = 'semantic_search-v10'

# es = Elasticsearch(
#     "http://127.0.0.1:6900",
#     # ca_certs="/path/to/http_ca.crt",
#     basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
# )

counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15

def generateKeywords(text):
    global remained
    try:
        keywords_count = (len(text) / 1000) * 15
        keywords_count = int(keywords_count)
        if keywords_count == 0:
            keywords_count = 1

        # messages = [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
        #             {"role": "user", "content":
        #              '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
        #              هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.'''
        #              .format(keywords_count)
        #             },
        #             {"role": "user", "content":
        #              '''"متن": {}'''.format(text)
        #             },]
        messages = [{"role": "system", "content": "You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text."},
                    {"role": "user", "content":
                     '''Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
                     Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".'''
                     .format(keywords_count)
                    },
                    {"role": "user", "content":
                     '''"متن": {}'''.format(text)
                    },]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        model.generation_config.pad_token_id = tokenizer.pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )
        # lock0.release()
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        # lock1.acquire()
        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})

        return keywords

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print("Exception: " + str(inst))


if __name__ == "__main__":
    start_time = time.time()
    print("start_time: " + str(datetime.now()))

    try:
        keywords_dict = []

        count = 1
        for content_item in sections_list:
            id = content_item['id']
            # if not id in not_have_two_token_kw_list:
            #     continue
            content = content_item['content']
            content_len = len(content.split())
            # set aside overly long contents
            if content_len > 2000:
                print("too long content " + str(id))
                continue
            content = _normalizer.sub_alphabets(content)
            keywords = generateKeywords(content)
            print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " keyword extracting ... ")
            keywords_dict.append({
                'id': id,
                'keywords': keywords
            })
            if count % 500 == 0:
                write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
                keywords_dict = []
            count += 1

        write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args

    end_time = time.time()
    print("end_time: " + str(datetime.now()))
    operation_time = (int(end_time - start_time) / 60) / 60
    print(f"elapsed time: {operation_time} hours")
    print(f"Finished!!!")
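
The number of requested key phrases in generateKeywords scales with the input length: roughly 15 per 1,000 characters, with a floor of one. A small standalone sketch of that calculation (illustrative values only):

    def requested_keywords(text: str) -> int:
        # 15 key phrases per 1000 characters, at least 1
        count = int((len(text) / 1000) * 15)
        return max(count, 1)

    print(requested_keywords("a" * 2500))  # -> 37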

119  funcs.py  Normal file
@@ -0,0 +1,119 @@
import re
import os
import json
# from pandas import read_excel, DataFrame   # note: the Excel helpers below need this import

def remove_signs():
    str = read_file()
    # lines =
    pattern = r"\(|\)"
    str = re.sub(pattern, '', str)
    # str = re.sub(')','', str)
    # str = re.sub('/','', str)

    return str

def read_file():
    with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except:
            pass
    return text

def read_file_by_address(file_address):
    with open(file_address, 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except:
            pass
    return text

def save_to_file(result):
    with open('./data/DATASET_3.txt', 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except:
            pass
        file.write(result)
        file.close()

def save_to_file_by_address(file_address, text):
    with open(file_address, 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except:
            pass
        file.write(text)
        file.close()


def read_from_excel(file_address, column_name):
    # read the Excel file
    data = read_excel(file_address)

    # extract the content of the requested column
    column_data = data[column_name]
    return column_data

def add_columndata_to_excel(file_address, column_name, columndata):

    # read the Excel file
    data = read_excel(file_address)

    # add the new column to the data
    data[column_name] = columndata

    # save the data back to the Excel file
    data.to_excel(file_address, index=False)

def write_to_excel(data_dict, file_name_and_address):
    df = DataFrame(data_dict)

    # save the DataFrame as an Excel file
    df.to_excel(file_name_and_address, index=False)
    return True

def write_to_json(dict, file_address):

    # convert the dictionary to JSON format
    json_data = json.dumps(dict, indent=2, ensure_ascii=False)

    # save the file
    with open(file_address, 'w+', encoding='utf-8') as file:
        file.write(json_data)

    return True

def read_from_json(file_address):
    data_dict = []
    # read the data from the JSON file
    with open(file_address, 'r', encoding='utf-8') as file:
        loaded_data = json.load(file)

    # collect the items that were read
    for item in loaded_data:
        data_dict.append(item)
    return data_dict


def separated_date_format_finder(date_ner):
    result = False
    date_ner = date_ner.replace('.', '/')
    date_ner = date_ner.replace('،', '/')
    date_ner = date_ner.replace('ر', '/')
    # date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}|\d{1,2}/\d{1,2}/\d{2,4}|\d{2,4} /\d{1,2} /\d{1,2}|\d{2,4}/\d{1,2}/\d{1,2}'
    date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
    regex = re.compile(date_pattern)
    match_dates = regex.finditer(date_ner)
    for date_item in match_dates:
        result = True
        break
    return result

if __name__ == "__main__":

    pass
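
A small round-trip sketch for the JSON helpers above (not part of the commit; the path is hypothetical):

    from funcs import write_to_json, read_from_json

    records = [{"id": 1, "content": "متن نمونه"}]
    write_to_json(records, "./data/example.json")   # pretty-printed, ensure_ascii=False
    assert read_from_json("./data/example.json") == records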

800  general_functions.py  Normal file
@@ -0,0 +1,800 @@
from normalizer import Normalizer
from tokenizer import *
# import jalali   # note: jdate2timestamp() below calls jalali.Persian, so this import is needed for date conversion
import re
from re import sub
import textwrap
from html import escape
from bs4 import BeautifulSoup

# from lxml import etree
import datetime
# enumerate(token_list):
_normalizer = Normalizer(date_normalizing_needed=True)

yeAr = r"ﻱ|ې|ێ|ے|ى|ي|ئ"
yeFr = r"ی"
keAr = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك"
keFr = r"ک"
mark1 = r'#\[#'
mark2 = r'#\]#'
hTag1 = r'<'
hTag2 = r'>'
tableTag = ["table", "tr", "th", "td", "TABLE", "TR", "TH", "TD"]
strTable = ''
for tag in tableTag:
    if strTable != '':
        strTable += '|'
    strTable += '(' + tag + ')'
regTable = r'<(?P<slash>\/)*(?P<tag>' + strTable + ')(?P<class>[^>]+)*>'
regTableReplace = r'#[#\g<slash>\g<tag>\g<class>#]#'


def isNeedHtml(html):
    if "<TABLE" in html or "<table" in html or "</TR" in html or "</tr" in html:
        return True
    else:
        return False


def removeHtmlTags(html, exeptionTag=[]):
    # reg1 = r'<[^>]+>'

    if exeptionTag.__len__:
        exceptTags = ''
        for tag in exeptionTag:
            if exceptTags != '':
                exceptTags += '|'
            exceptTags += '(' + tag + ')'
        reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + ')(?P<class>[^>]+)*>'
        html = sub(reg1, regTableReplace, html)

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)

    if exeptionTag.__len__:
        text = sub(mark1, hTag1, text)
        text = sub(mark2, hTag2, text)

    return text


def removeHtmlNoTableTag(html):

    # known issue: this has errors and hangs in test2.py
    html = sub(regTable, regTableReplace, html)
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)

    text = sub(mark1, hTag1, text)
    text = sub(mark2, hTag2, text)

    return text

def normalizerData(data):
    global _normalizer
    normalTitle, dates = _normalizer.normalize(data, return_dates=True)
    tdates = []
    for d in dates:
        if not d.startswith("num"):
            try:
                tsd = jdate2timestamp(d)
                cdate = d
            except:
                try:
                    d = d.replace("y", "")
                    d = d.replace("m", "/")
                    d = d.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d)
                    if m:
                        [year, month, day] = [int(m.group(1)), int(m.group(2)), int(m.group(3))]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    else:
                        # cdate = "1403/03/03"
                        # tsd = jdate2timestamp(cdate)
                        continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})

    return normalTitle, tdates

def normalizerDate2(inputString):
    global _normalizer
    normalizedString, dates, recognized_dates, recognized_numbers = _normalizer.normalize(inputString, return_dates=True)
    tdates = []
    for date_item in recognized_dates:
        date_part = date_item['date']
        date_token_index = date_item['date_token_index']
        start_date_token_index = date_item['start_date_token_index']
        end_date_token_index = date_item['end_date_token_index']
        if not date_part.startswith("num"):
            try:
                cdate = date_part
                tsd = jdate2timestamp(date_part)
            except:
                try:
                    date_part = date_part.replace("y", "")
                    date_part = date_part.replace("m", "/")
                    date_part = date_part.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", date_part)
                    if m:
                        [year, month, day] = [int(m.group(1)), int(m.group(2)), int(m.group(3))]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    # else:
                    #     # cdate = "1403/03/03"
                    #     # tsd = jdate2timestamp(cdate)
                    #     continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            import tokenizer as t
            inputString_token = t.Tokenizer.tokenize_words(None, inputString)
            # if start_date_token_index == end_date_token_index:
            #     end_date_token_index += 1
            #     original_date_part = inputString_token[start_date_token_index:end_date_token_index]
            # else:
            original_date_part = inputString_token[start_date_token_index:end_date_token_index + 1]
            original_date = ''
            for part in original_date_part:
                original_date = original_date + ' ' + part
            original_date = original_date.strip()
            tdates.append({"converted_date": date_item['date'],
                           "date": cdate,
                           "original_date": original_date,
                           # "timestamp": tsd,
                           "date_token_index": date_token_index,
                           "start_date_token_index": start_date_token_index,
                           "end_date_token_index": end_date_token_index})
    '''
    for d in dates:
        if not d.startswith("num"):
            try:
                tsd = jdate2timestamp(d)
                cdate = d
            except:
                try:
                    d = d.replace("y", "")
                    d = d.replace("m", "/")
                    d = d.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d)
                    if m:
                        [year, month, day] = [int(m.group(1)), int(m.group(2)), int(m.group(3))]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    else:
                        # cdate = "1403/03/03"
                        # tsd = jdate2timestamp(cdate)
                        continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})'''
    return normalizedString, tdates, recognized_numbers

def OtherDateFormatNormalizer(inputString, pattern):
    mainTextTemp = inputString
    regex_pattern_Mah = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه 1000 و 300 و 50 و 4
    regex_pattern_MahSal = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه سال 1000 و 300 و 50 و 4
    regex_pattern_MahSal2 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه سال y1000m0d0 و 300 و 50 و 4
    regex_pattern_MahSal3 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})"  # y0m3d1 ماه سال y1353m0d0

    if (pattern == 1):
        regex = re.compile(regex_pattern_Mah)
    elif (pattern == 2):
        regex = re.compile(regex_pattern_MahSal)
    elif (pattern == 3):
        regex = re.compile(regex_pattern_MahSal2)
    elif (pattern == 4):
        regex = re.compile(regex_pattern_MahSal3)

    matches = regex.finditer(inputString)
    for match in matches:
        foundedPattern = match.group()
        foundedPatternTemp = match.group()
        if (pattern == 1):
            foundedPattern = foundedPattern.replace('ماه', '')
        else:
            foundedPattern = foundedPattern.replace('سال', '')
            foundedPattern = foundedPattern.replace('ماه', '')
        foundedPattern = foundedPattern.strip()
        tempString = foundedPattern
        standardDatePattern = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})"
        # regex = re.compile(regex_pattern_Mah)
        matchItems = re.finditer(standardDatePattern, tempString)
        for item in matchItems:
            tempPattern = item.group()
            tempString = tempString.replace(tempPattern, '')
        tempString = tempString.strip()
        tempString = tempString.replace('و', '')
        tempString = tempString.strip()
        tempArray = tempString.split()
        year = 0
        for item in tempArray:
            dateMatch = re.finditer(standardDatePattern, item)
            regexFlag = True
            for dateItem in dateMatch:
                yearStr = dateItem.group()[1:5]
                year += int(yearStr)
                regexFlag = False
                break
            if (item.isalnum() and regexFlag):
                year += int(item)
        tempPattern = tempPattern.replace('y0', 'y' + str(year))
        mainTextTemp = mainTextTemp.replace(foundedPatternTemp, tempPattern + ' ')
    return mainTextTemp

    # foundedPattern = jdate2timestamp(foundedPattern)
    # convertedText = regex.sub(foundedPattern,convertedText)

def normalizerLongData(data):
    dates = []
    if len(data) > 10000:
        textParts = textwrap.wrap(data, 10000, break_long_words=False)
        for part in textParts:
            dates.extend(normalizerData(part))
    else:
        dates = normalizerData(data)
    return dates

# ##################
# On Windows, rr = gdt.timestamp() raised errors for negative values (dates before 1970)
# #################
def jdate2timestamp_old(dt):
    ndt = dt.replace("y", "")
    ndt = ndt.replace("m", "/")
    ndt = ndt.replace("d", "/")
    gd = jalali.Persian(ndt).gregorian_datetime()
    # print(gd)
    ztime = datetime.time(0, 0, 0, 0)
    gdt = datetime.datetime.combine(gd, ztime)
    # print(gdt)
    rr = gdt.timestamp()
    tst = int(round(rr) * 1000)
    return tst

def jdate2timestamp(dt):
    ndt = dt.replace("y", "")
    ndt = ndt.replace("m", "/")
    ndt = ndt.replace("d", "/")
    gd = jalali.Persian(ndt).gregorian_datetime()
    base = datetime.date(1970, 1, 1)
    rr = (gd - base).total_seconds()
    tst = int(round(rr) * 1000)
    return tst


def getSortTimestamp(ts_date):
    empty_date = -15000000000
    ts_ts = empty_date
    try:
        if ts_date != "":
            ts_ts = jdate2timestamp(ts_date)
    except:
        ts_ts = empty_date

    return ts_ts

def normalize_content(content):
    text = _normalizer.sub_alphabets(content)
    return text

def normalYehKe(text):
    if (text == None):
        return ''

    c1 = sub(yeAr, yeFr, text)
    c2 = sub(keAr, keFr, c1)
    c2 = c2.replace('\u00A0', '')
    return c2.strip()

_term_list = []
def setTermList():
    global _term_list
    if (_term_list.__len__() > 0):
        return
    _term_list = [
        {"begin": jdate2timestamp("1285/07/14"), "end": jdate2timestamp("1287/04/2"), "term": "مجلس شورای ملی-دوره1", "term_number": 1, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1288/8/24"), "end": jdate2timestamp("1290/10/3"), "term": "مجلس شورای ملی-دوره2", "term_number": 2, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1293/9/14"), "end": jdate2timestamp("1294/8/21"), "term": "مجلس شورای ملی-دوره3", "term_number": 3, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1300/4/1"), "end": jdate2timestamp("1302/3/30"), "term": "مجلس شورای ملی-دوره4", "term_number": 4, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1302/11/22"), "end": jdate2timestamp("1304/11/22"), "term": "مجلس شورای ملی-دوره5", "term_number": 5, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1305/4/19"), "end": jdate2timestamp("1307/5/22"), "term": "مجلس شورای ملی-دوره6", "term_number": 6, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1307/7/19"), "end": jdate2timestamp("1309/8/14"), "term": "مجلس شورای ملی-دوره7", "term_number": 7, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1309/9/24"), "end": jdate2timestamp("1311/10/24"), "term": "مجلس شورای ملی-دوره8", "term_number": 8, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1311/12/24"), "end": jdate2timestamp("1314/1/24"), "term": "مجلس شورای ملی-دوره9", "term_number": 9, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1314/3/15"), "end": jdate2timestamp("1316/3/22"), "term": "مجلس شورای ملی-دوره10", "term_number": 10, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1316/6/20"), "end": jdate2timestamp("1318/6/27"), "term": "مجلس شورای ملی-دوره11", "term_number": 11, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1318/8/3"), "end": jdate2timestamp("1320/8/9"), "term": "مجلس شورای ملی-دوره12", "term_number": 12, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1320/8/22"), "end": jdate2timestamp("1322/9/1"), "term": "مجلس شورای ملی-دوره13", "term_number": 13, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1322/12/16"), "end": jdate2timestamp("1324/12/21"), "term": "مجلس شورای ملی-دوره14", "term_number": 14, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1326/4/25"), "end": jdate2timestamp("1328/5/6"), "term": "مجلس شورای ملی-دوره15", "term_number": 15, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1328/11/20"), "end": jdate2timestamp("1330/11/29"), "term": "مجلس شورای ملی-دوره16", "term_number": 16, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1331/2/7"), "end": jdate2timestamp("1332/8/28"), "term": "مجلس شورای ملی-دوره17", "term_number": 17, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1332/12/27"), "end": jdate2timestamp("1335/1/26"), "term": "مجلس شورای ملی-دوره18", "term_number": 18, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1335/3/10"), "end": jdate2timestamp("1339/3/29"), "term": "مجلس شورای ملی-دوره19", "term_number": 19, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1339/12/2"), "end": jdate2timestamp("1340/2/19"), "term": "مجلس شورای ملی-دوره20", "term_number": 20, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1342/7/14"), "end": jdate2timestamp("1346/7/13"), "term": "مجلس شورای ملی-دوره21", "term_number": 21, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1346/7/14"), "end": jdate2timestamp("1350/6/9"), "term": "مجلس شورای ملی-دوره22", "term_number": 22, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1350/6/9"), "end": jdate2timestamp("1354/6/16"), "term": "مجلس شورای ملی-دوره23", "term_number": 23, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1354/6/17"), "end": jdate2timestamp("1357/11/20"), "term": "مجلس شورای ملی-دوره24", "term_number": 24, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1359/3/7"), "end": jdate2timestamp("1363/3/6"), "term": "مجلس شورای اسلامی-دوره1", "term_number": 1, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1363/3/7"), "end": jdate2timestamp("1367/3/6"), "term": "مجلس شورای اسلامی-دوره2", "term_number": 2, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1367/3/7"), "end": jdate2timestamp("1371/3/6"), "term": "مجلس شورای اسلامی-دوره3", "term_number": 3, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1371/3/7"), "end": jdate2timestamp("1375/3/11"), "term": "مجلس شورای اسلامی-دوره4", "term_number": 4, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1375/3/12"), "end": jdate2timestamp("1379/3/6"), "term": "مجلس شورای اسلامی-دوره5", "term_number": 5, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1379/3/7"), "end": jdate2timestamp("1383/3/6"), "term": "مجلس شورای اسلامی-دوره6", "term_number": 6, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1383/3/7"), "end": jdate2timestamp("1387/3/6"), "term": "مجلس شورای اسلامی-دوره7", "term_number": 7, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1387/3/7"), "end": jdate2timestamp("1391/3/6"), "term": "مجلس شورای اسلامی-دوره8", "term_number": 8, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1391/3/7"), "end": jdate2timestamp("1395/3/7"), "term": "مجلس شورای اسلامی-دوره9", "term_number": 9, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1395/3/8"), "end": jdate2timestamp("1399/3/6"), "term": "مجلس شورای اسلامی-دوره10", "term_number": 10, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1399/3/7"), "end": jdate2timestamp("1403/3/6"), "term": "مجلس شورای اسلامی-دوره11", "term_number": 11, "majles_name": "شورای اسلامی"},
    ]


def getTermQanon(ts_date_timestamp, ts_ref):
    setTermList()
    global _term_list
    term = ""
    term_number = 0
    majles_name = ""

    if ts_ref == "هيات وزيران (دوره فترت)":
        term = ts_ref
    if ts_ref == "نخست وزير (مصدق)":
        term = ts_ref
    if ts_ref == "وزير عدليه (داور)":
        term = ts_ref
    if ts_ref == "شوراي انقلاب جمهوري اسلامي ايران":
        term = ts_ref

    majles_name = term
    if term == "":
        for i in range(len(_term_list) - 1, -1, -1):
            begin = _term_list[i]["begin"]
            end = _term_list[i]["end"]
            if ts_date_timestamp >= begin and ts_date_timestamp <= end:
                term = _term_list[i]["term"]
                term_number = _term_list[i]["term_number"]
                majles_name = _term_list[i]["majles_name"]
                break

    error = ""
    if term == "":
        # if ts_date_timestamp >= _term_list[0]["begin"] and ts_date_timestamp <= _term_list[len(_term_list)-1]["end"] :
        if ts_date_timestamp <= _term_list[len(_term_list) - 1]["end"]:
            for i in range(0, len(_term_list) - 1, 1):
                end = _term_list[i]["end"]
                if ts_date_timestamp <= end:
                    term = _term_list[i]["term"]
                    term_number = _term_list[i]["term_number"]
                    majles_name = _term_list[i]["majles_name"]
                    error = "تاریخ بین دو دوره"  # date falls between two terms
                    break
        else:
            term_number = -1
            error = "تاریخ خارج از محدوده"  # date out of range

    return term, term_number, majles_name, error

# This method takes a text plus the start and end character index of a phrase inside it,
# and returns the numbers of the first and last token of that phrase in the text.
def token_state_finder(normalized_section_content, start_index, end_index):
    before_substring = normalized_section_content[0:start_index-1].strip()
    pattern_substring = normalized_section_content[start_index-1:end_index+1].strip()
    before_substring_token_list = before_substring.strip().split()
    pattern_token_list = pattern_substring.strip().split()
    start_token_state = len(before_substring_token_list)
    end_token_state = len(before_substring_token_list) + (len(pattern_token_list) - 1)
    pattern_tokens_state = {
        "start_token_state": start_token_state,
        "end_token_state": end_token_state
    }
    return pattern_tokens_state

def find_number_indexes_in_string(normalized_string, recognized_numbers):
    complete_recognized_numbers = []
    for item in recognized_numbers:
        number_start_index, number_end_index = find_token_indexes_in_string(normalized_string, item['start_token_index'], item['end_token_index'])
        content = normalized_string.split()
        # if item['start_token_index']==item['end_token_index']:
        #     # check whether removing this branch and keeping only the line below is enough for the method to work correctly
        #     number_token_list = content[item['start_token_index']]
        # else:
        number_token_list = content[item['start_token_index']:item['end_token_index'] + 1]
        complete_recognized_numbers.append(
            {
                'number_value': item['number_value'],
                'number_token_list': number_token_list,
                'start_token_index': item['start_token_index'],
                'end_token_index': item['end_token_index'],
                "start_number_state": number_start_index,
                "end_number_state": number_end_index
            }
        )
    return complete_recognized_numbers

# This method takes the original text plus the first and last token number of a phrase,
# and returns the start and end character index of that phrase.
def find_token_indexes_in_string(normalized_string, start_token_state, end_token_state):
    before_tokens = normalized_string.split()[0:start_token_state]
    content_tokens = normalized_string.split()[start_token_state:end_token_state + 1]
    content_start_index = 0
    content_end_index = 0
    # count the characters of every token that comes before the number
    for token in before_tokens:
        content_start_index += len(token)
    # add the number of spaces to the start index of the number
    content_start_index += len(before_tokens) + 1

    # count the characters of every token that belongs to the number
    for token in content_tokens:
        content_end_index += len(token)
    # add the number of spaces to the end index of the number
    content_end_index += (content_start_index - 1) + (len(content_tokens) - 1)

    return content_start_index, content_end_index

# This method takes a text, searches it for the patterns defined below, and returns an array with
# the phrases matching each pattern, the start and end character index of each phrase,
# the pattern key and value, and the start and end token number of each phrase.
def regex_patterns_finder(sectoin_content):
    regex_patterns = {
        "asle N asasi": r"اصل\s*شماره\s*(\d+)\s*قانون\s*اساسی\s*جمهوری\s*اسلامی\s*ایران",  # article N of the Constitution of the Islamic Republic of Iran
        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی\sجمهوری\sاسلامی\sایران",  # "Constitution of the Islamic Republic of Iran", not at the start of the paragraph
        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی",  # "Constitution", not at the start of the paragraph (duplicate key: this entry overrides the previous one)
        "qanon * mosavvab tarikh": r"\bقانون[\s\w/]*مصوب\s((y\d{2,4}m\d{1,2}d\d{1,2})|\d{2,4})",  # law ... enacted on a date
        "in qanon": r"این\sقانون",  # this law
        "qanone foq": r"قانون\sفوق",  # the above law
        "eslahe qanon": r"قانون\sاصلاح",  # amendment of the law
        "tabsare foq": r"تبصره\sفوق",  # the above note
        "made foq": r"ماده\sفوق",  # the above article
        "made vahede": r"ماده\sواحده",  # single article
        "made vahed": r"ماده\sواحد",  # single article (variant)
        "tabsare N": r"^\bتبصره\s*شماره\s*(\d+)\s*",  # note number N, only at the start of the paragraph
        "f tabsare N": r"(?<!^)\bتبصره\sشماره\s(\d+)\s",  # note number N, anywhere except the start of the paragraph
        "tabsare N": r"(?<!^)\bتبصره ?\(? ?\d+? ?\)?[ :.]",  # note N, anywhere except the start of the paragraph (duplicate key: this entry overrides the one above)
        "made N": r"(?<!^)\bماده ?\(? ?\d+? ?\)?[ :.]",  # *** article N, anywhere except the start of the paragraph
        "f made N": r"^\bماده\s*[(]?\s*(\d+)\s*[)]?\s*"  # article N, only at the start of the paragraph, with or without parentheses around the number
    }

    matched_array = []
    for pattern_key, pattern_value in regex_patterns.items():
        regex = re.compile(pattern_value)
        matches = regex.finditer(sectoin_content)

        for match in matches:
            # handle each matched pattern here
            founded_item = match.group()
            start_index = match.start() + 1
            end_index = match.end() - 1
            pattern_tokens_state = token_state_finder(sectoin_content, start_index, end_index)
            matched_array.append(
                {
                    "founded_item": founded_item,
                    "start_index": start_index,
                    "end_index": end_index,
                    "pattern_key": pattern_key,
                    "pattern_value": pattern_value,
                    "start_token_state": pattern_tokens_state["start_token_state"],
                    "end_token_state": pattern_tokens_state["end_token_state"],
                }
            )
        # convertedText = regex.sub(' wwwwwwwww ',convertedText)
    # sort the array by the start token of each phrase
    matched_array.sort(key=lambda x: int(x['start_token_state']), reverse=False)
    return matched_array

def change_refrece_tokens(normalized_section_content, recognized_patterns_array):
    token_list = normalized_section_content.strip().split()
    for ref_item in recognized_patterns_array:
        start_token_state = ref_item.get('start_token_state')
        end_token_state = ref_item.get('end_token_state')
        for i in range(start_token_state, end_token_state + 1):
            token_list[i] = 'eeee'
    normalized_section_content = ''
    for token in token_list:
        normalized_section_content = ''.join([normalized_section_content, (' ' + token)])
    return normalized_section_content.strip()

def getMetaData(text):
    normalized_section_content, recognized_dates, recognized_numbers = normalizerDate2(text.strip())
    recognized_numbers = find_number_indexes_in_string(text, recognized_numbers)
    normalized_section_content = normalized_section_content.strip()
    recognized_patterns_array = regex_patterns_finder(normalized_section_content)
    normalized_section_content = change_refrece_tokens(normalized_section_content, recognized_patterns_array)
    nlp_parser = []
    date_list = recognized_dates
    ref_list = recognized_patterns_array
    for date_item in date_list:
        nlp_parser.append({
            "properties": {
                "type": "date",
                "index_start": int(date_item['start_date_token_index']),
                "index_end": int(date_item['end_date_token_index']),
                "text": date_item['original_date'],
                "result": date_item['converted_date'],
                # "timestamp": date_item['timestamp'],
                "ref_link": ''
            }
        })
    for ref_item in ref_list:
        nlp_parser.append({
            "properties": {
                "type": "reference",
                "index_start": int(ref_item['start_token_state']),
                "index_end": int(ref_item['end_token_state']),
                "text": ref_item['founded_item'],
                "result": ref_item['pattern_value'],
                "ref_link": ''
            }
        })
    return nlp_parser, normalized_section_content

def save_error(error_text, filename):
    with open(filename, 'a+', encoding='utf-8') as file:
        # write the error to the file
        file.write(error_text + '\n' + 50 * '*' + '\n')
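
A hedged usage sketch for the metadata extractor above (not part of the commit; it assumes the local normalizer and tokenizer packages are importable and that jalali is available for jdate2timestamp):

    from general_functions import getMetaData

    section = "ماده 5 قانون فوق مصوب 1375/3/12 ..."
    nlp_parser, normalized = getMetaData(section)
    for entry in nlp_parser:
        props = entry["properties"]
        print(props["type"], props["text"], "->", props["result"])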
106
keyword_extractor.py
Normal file
106
keyword_extractor.py
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
import json
|
||||||
|
from tqdm import tqdm
|
||||||
|
import time
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
|
||||||
|
os.environ['HF_HOME'] = "/home/admin/HFHOME"
|
||||||
|
|
||||||
|
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
|
model_id = "meta-llama/Llama-3.1-70B-Instruct"
|
||||||
|
|
||||||
|
# use quantization to lower GPU usage
|
||||||
|
# 4 bit:
|
||||||
|
# bnb_config = BitsAndBytesConfig(
|
||||||
|
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
|
||||||
|
# )
|
||||||
|
# 8 bit:
|
||||||
|
bnb_config = BitsAndBytesConfig(
|
||||||
|
load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
|
||||||
|
)
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_id,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
quantization_config=bnb_config
|
||||||
|
)
|
||||||
|
terminators = [
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
||||||
|
]
|
||||||
|
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
|
||||||
|
|
||||||
|
SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
|
||||||
|
And you don't need to provide explanations or additional text.
|
||||||
|
Put each keyword on a single line."
|
||||||
|
"""# Explain your answer step by step.
|
||||||
|
|
||||||
|
|
||||||
|
def format_prompt(SENTENCE):
|
||||||
|
PROMPT = f"Persian legal text: {SENTENCE}."
|
||||||
|
return PROMPT
|
||||||
|
|
||||||
|
def generate(formatted_prompt):
|
||||||
|
formatted_prompt = formatted_prompt[:50000] # to avoid GPU OOM
|
||||||
|
messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
|
||||||
|
# tell the model to generate
|
||||||
|
input_ids = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to(model.device)
|
||||||
|
outputs = model.generate(
|
||||||
|
input_ids,
|
||||||
|
max_new_tokens=2048,
|
||||||
|
eos_token_id=terminators,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.9,
|
||||||
|
)
|
||||||
|
response = outputs[0][input_ids.shape[-1]:]
|
||||||
|
return tokenizer.decode(response, skip_special_tokens=True)
|
||||||
|
|
||||||
|
def get_keywords(sentence):
|
||||||
|
formatted_prompt = format_prompt(sentence)
|
||||||
|
rules = generate(formatted_prompt).split('\n')
|
||||||
|
result = [r.strip() for r in rules if r.strip()]
|
||||||
|
return result
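# A hedged usage sketch (the example sentence and printed keywords are illustrative):
#   keywords = get_keywords("وزارت نفت موظف است گزارش سالانه خود را به مجلس ارائه کند.")
#   print(keywords)   # e.g. ['وزارت نفت', 'گزارش سالانه', 'مجلس']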
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print('start')
|
||||||
|
start_time = time.time()
|
||||||
|
inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
|
||||||
|
data = json.load(inputfile)
|
||||||
|
inputfile.close()
|
||||||
|
counter = 1
|
||||||
|
for c in tqdm(data):
|
||||||
|
for item in tqdm(data[c]):
|
||||||
|
content = item['content']
|
||||||
|
item['keywords'] = get_keywords(content)
|
||||||
|
print(f"section {counter} ...")
|
||||||
|
counter += 1
|
||||||
|
|
||||||
|
outputfile = open('./data/main_keywords_lama70B_dataset_03.json', "w", encoding='utf-8')
|
||||||
|
outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4))
|
||||||
|
outputfile.close()
|
||||||
|
end_time = time.time()
|
||||||
|
print(f"elapsed time: {end_time-start_time}")
|
||||||
|
print("end")
|
||||||
|
|
||||||
|
exit()
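# A hedged sketch of the structure written to main_keywords_lama70B_dataset_03.json
# (class labels and values are illustrative; each section simply gains a "keywords" list):
#   {
#       "<class label>": [
#           {"content": "...", "keywords": ["وزارت نفت", "گزارش سالانه", "..."]},
#           ...
#       ]
#   }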
|
||||||
|
|
||||||
|
"""
|
||||||
|
system prompt version 2 for test:
|
||||||
|
|
||||||
|
You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.
|
||||||
|
|
||||||
|
user prompt version 2 for test:
|
||||||
|
|
||||||
|
Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
|
||||||
|
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
|
||||||
|
"""
|
236
llama_givechi.py
Normal file
|
@ -0,0 +1,236 @@
|
||||||
|
"""
|
||||||
|
This file works with the Parsivar normalizer.
|
||||||
|
"""
|
||||||
|
from datetime import datetime
|
||||||
|
# from elasticsearch import Elasticsearch
|
||||||
|
from threading import Thread
|
||||||
|
import torch
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import os.path
|
||||||
|
import os
|
||||||
|
from funcs import write_to_json, read_from_json
|
||||||
|
from normalizer import Normalizer
|
||||||
|
from tokenizer import *
|
||||||
|
_normalizer = Normalizer(date_normalizing_needed=True)
|
||||||
|
address = os.getcwd()
|
||||||
|
# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
|
||||||
|
# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
|
||||||
|
# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
|
||||||
|
# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File
|
||||||
|
|
||||||
|
def read_from_json1(file_address):
# read the data from the JSON file
with open(file_address, 'r', encoding='utf-8') as file:
loaded_data = json.load(file)
return loaded_data
|
||||||
|
|
||||||
|
sections_list = read_from_json1("./data_givechi/main_classes_dataset_03.json") # Main File
|
||||||
|
|
||||||
|
# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
|
||||||
|
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
|
||||||
|
not_have_two_token_kw_list = []
|
||||||
|
|
||||||
|
import json
|
||||||
|
os.environ['HF_HOME'] = "/home/admin/HFHOME"
|
||||||
|
from tqdm import tqdm
|
||||||
|
import time
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
|
||||||
|
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
|
#model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"
|
||||||
|
|
||||||
|
model_id = "meta-llama/Llama-3.1-70B-Instruct"
|
||||||
|
|
||||||
|
# use quantization to lower GPU usage
|
||||||
|
# 4 bit:
|
||||||
|
# bnb_config = BitsAndBytesConfig(
|
||||||
|
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
|
||||||
|
# )
|
||||||
|
# 8 bit:
|
||||||
|
# note: the double-quant / quant-type / compute-dtype knobs exist only for 4-bit loading;
# for 8-bit, load_in_8bit=True is the whole configuration
bnb_config = BitsAndBytesConfig(
load_in_8bit=True
)
|
||||||
|
print("Model Loading START:")
|
||||||
|
print(str(datetime.now()))
|
||||||
|
print()
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_id,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
quantization_config=bnb_config
|
||||||
|
)
|
||||||
|
terminators = [
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
||||||
|
]
|
||||||
|
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
|
||||||
|
print("Model Loading END:")
|
||||||
|
print(str(datetime.now()))
|
||||||
|
print()
|
||||||
|
|
||||||
|
# if torch.cuda.is_available():
|
||||||
|
# #model_id = "PartAI/Dorna-Llama3-8B-Instruct"
|
||||||
|
# # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
|
|
||||||
|
# model_id = "meta-llama/Llama-3.1-70B-Instruct"
|
||||||
|
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
|
||||||
|
# tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
|
||||||
|
# index_name_i = 'semantic_search-v10'
|
||||||
|
|
||||||
|
# es = Elasticsearch(
|
||||||
|
# "http://127.0.0.1:6900",
|
||||||
|
# # ca_certs="/path/to/http_ca.crt",
|
||||||
|
# basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
|
||||||
|
# )
|
||||||
|
|
||||||
|
counter = 0
|
||||||
|
total = 0
|
||||||
|
remained = 0
|
||||||
|
id = ''
|
||||||
|
keywords_count = 15
|
||||||
|
|
||||||
|
def generateKeywords(law):
|
||||||
|
global remained
|
||||||
|
try:
|
||||||
|
# aim for roughly 15 keywords per 1000 characters, with a floor of 1
# (e.g. a 3000-character law gives int(3.0 * 15) = 45);
# note that only the commented-out Persian prompt below actually uses this value
keywords_count = max(1, int((len(law) / 1000) * 15))
|
||||||
|
|
||||||
|
|
||||||
|
# messages = [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
|
||||||
|
# {"role": "user", "content":
|
||||||
|
# '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
|
||||||
|
# هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.'''
|
||||||
|
# .format(keywords_count)
|
||||||
|
# },
|
||||||
|
# {"role": "user", "content":
|
||||||
|
# '''"متن": {}'''.format(text)
|
||||||
|
# },]
|
||||||
|
messages = [{"role": "system", "content": '''You are a lawyer and legal expert who can interpret legal texts well, understand the semantic layers of the text and infer logical relationships in the text.
|
||||||
|
You can understand Persian and English and interpret expressions in a Legal Context. You are also an expert in Deontic Logic and can understand the logical structure of law and rules in text.
|
||||||
|
|
||||||
|
Use uppercase English letters such as A, B, C, etc. to identify all possible propositions. Do not include negative tones such as "not" in the propositions. For example, if the sentence is "It is not bored," you should use "A: bored" to represent it.
|
||||||
|
Next, for each proposition, use the symbol ¬ to represent its negative form. For example, the negative form of proposition A can be expressed as ¬A.
|
||||||
|
Now, carefully analyze the context and find causal relationships between propositions seriously. A causal expression is only established when the context directly and explicitly supports this relationship. Use arrows (→) to indicate causal relationships; for example, "If A, then B", "B if A" and "A causes B" etc. must be represented as A → B.
|
||||||
|
another example, ""if A and not C then B"" should be represented as A ∧ ¬C → B
|
||||||
|
If the causal expression is not strict but only loose, do not state it.
|
||||||
|
|
||||||
|
Extraction Criteria:
|
||||||
|
1- Definition of a proposition:
|
||||||
|
A proposition is a statement which could be true or false. "I'm an animal" is a proposition, but "Ministry of Roads and Urban Development" is not.
|
||||||
|
2- Definition of strict causation:
|
||||||
|
A strictly causes B if and only if:
|
||||||
|
1. Whenever A occurs, B necessarily occurs.
|
||||||
|
2. The occurrence of B is entirely dependent on the occurrence of A.
|
||||||
|
3. There are no other factors or conditions that can independently cause B.
|
||||||
|
4. The relationship between A and B is invariant and holds in all relevant circumstances.
|
||||||
|
|
||||||
|
Additional Instructions:
|
||||||
|
Do Not Add Extra Information: Provide only the extracted rules in the specified format without additional commentary or explanations.
|
||||||
|
|
||||||
|
Formatting Instructions:
|
||||||
|
Language: Output must be in English.
|
||||||
|
output only propositions and causal expressions that are explicitly stated in the following text.''' },
|
||||||
|
{"role": "user", "content": f'''"this is your text to process:{law}'''
|
||||||
|
},]
|
||||||
|
|
||||||
|
|
||||||
|
input_ids = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to(model.device)
|
||||||
|
|
||||||
|
terminators = [
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
||||||
|
]
|
||||||
|
model.generation_config.pad_token_id = tokenizer.eos_token_id  # Llama has no dedicated pad token; fall back to EOS as above
|
||||||
|
|
||||||
|
|
||||||
|
outputs = model.generate(
|
||||||
|
input_ids,
|
||||||
|
max_new_tokens=256,
|
||||||
|
eos_token_id=terminators,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.85,
|
||||||
|
)
|
||||||
|
#lock0.release()
|
||||||
|
response = outputs[0][input_ids.shape[-1]:]
|
||||||
|
keywords = tokenizer.decode(response, skip_special_tokens=True)
|
||||||
|
#lock1.acquire()
|
||||||
|
# resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})
|
||||||
|
|
||||||
|
|
||||||
|
return keywords
|
||||||
|
|
||||||
|
except Exception as inst:
|
||||||
|
print(type(inst)) # the exception type
|
||||||
|
print(inst.args) # arguments stored in .args
|
||||||
|
print("Exception: " + str(inst))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
start_time = time.time()
|
||||||
|
print("start_time: "+str(datetime.now()))
|
||||||
|
|
||||||
|
try:
|
||||||
|
keywords_dict = []
|
||||||
|
|
||||||
|
count = 1
|
||||||
|
|
||||||
|
finall_data = []
|
||||||
|
for q_class, current_section_list in sections_list.items():
|
||||||
|
|
||||||
|
|
||||||
|
for content_item in current_section_list:
|
||||||
|
id = content_item['id']
|
||||||
|
qanon_id = content_item['qanon_id']
|
||||||
|
# if not id in not_have_two_token_kw_list:
|
||||||
|
# continue
|
||||||
|
content = content_item['content']
|
||||||
|
content_len = len(content.split())
|
||||||
|
# skip sections whose content is too long
|
||||||
|
if content_len > 2000:
|
||||||
|
print("too long content " + str(id))
|
||||||
|
continue
|
||||||
|
content = _normalizer.sub_alphabets(content)
|
||||||
|
prompt_result = generateKeywords(content)
|
||||||
|
print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " prompting ... ")
|
||||||
|
keywords_dict.append({
|
||||||
|
'id':id,
|
||||||
|
'qanon-id':qanon_id,
|
||||||
|
'content':content,
|
||||||
|
'prompt-result':prompt_result
|
||||||
|
})
|
||||||
|
if count ==5:
|
||||||
|
write_to_json(keywords_dict, f"./data_givechi/main_test_{count}.json")
|
||||||
|
# keywords_dict = []
|
||||||
|
count+=1
|
||||||
|
# finall_data.append({'class':q_class, 'sections': keywords_dict})
|
||||||
|
finall_data.append({q_class : keywords_dict})
|
||||||
|
|
||||||
|
write_to_json(finall_data, "./data_givechi/main_classes_dataset_result.json")
|
||||||
|
|
||||||
|
except Exception as inst:
|
||||||
|
print(type(inst)) # the exception type
|
||||||
|
print(inst.args) # arguments stored in .args
|
||||||
|
|
||||||
|
end_time = time.time()
|
||||||
|
print("end_time: "+ str(datetime.now()))
|
||||||
|
operation_time = (int(end_time-start_time)/60)/60
|
||||||
|
print(f"elapsed time: {operation_time} hours")
|
||||||
|
print(f"Finished!!!")
|
1370
normalizer.py
Normal file
File diff suppressed because it is too large
74
resource/normalizer/Dic1_new.txt
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
بیخبر بیخبر
|
||||||
|
بیتوجهی بیتوجهی
|
||||||
|
بیطرفانه بیطرفانه
|
||||||
|
گفتوگو گفتوگو
|
||||||
|
آنها آنها
|
||||||
|
پیشبرد پیشبرد
|
||||||
|
روانشناختی روانشناختی
|
||||||
|
میباشد میباشد
|
||||||
|
لذتبخش لذتبخش
|
||||||
|
میدادند میدادند
|
||||||
|
مینویسد مینویسد
|
||||||
|
میبخشد میبخشد
|
||||||
|
بیقاعده بیقاعده
|
||||||
|
میباشند میباشند
|
||||||
|
موافقتنامه موافقتنامه
|
||||||
|
تخمگذار تخمگذار
|
||||||
|
پایینترین پایینترین
|
||||||
|
گرمکن گرمکن
|
||||||
|
پیشبینی پیشبینی
|
||||||
|
برونگرا برونگرا
|
||||||
|
میدهد میدهد
|
||||||
|
فیلمبرداری فیلمبرداری
|
||||||
|
آنسوی آنسوی
|
||||||
|
خدمتدهی خدمتدهی
|
||||||
|
اینگونه اینگونه
|
||||||
|
کمکرسانی کمکرسانی
|
||||||
|
کلانشهر کلانشهر
|
||||||
|
سپردهگذار سپردهگذار
|
||||||
|
بنیانگذار بنیانگذار
|
||||||
|
رضایتبخش رضایتبخش
|
||||||
|
اصلاحطلبان اصلاحطلبان
|
||||||
|
استخوانبندی استخوانبندی
|
||||||
|
درونگرا درونگرا
|
||||||
|
میگردد میگردد
|
||||||
|
اصلاحطلب اصلاحطلب
|
||||||
|
میتوان میتوان
|
||||||
|
عملکرد عملکرد
|
||||||
|
میروم میروم
|
||||||
|
بزرگنمایی بزرگنمایی
|
||||||
|
همجنس همجنس
|
||||||
|
همانطور همانطور
|
||||||
|
بیشترین بیشترین
|
||||||
|
انسانگرایی انسانگرایی
|
||||||
|
نمیباشند نمیباشند
|
||||||
|
جانبداری جانبداری
|
||||||
|
نمیتوانی نمیتوانی
|
||||||
|
قانونگذار قانونگذار
|
||||||
|
میشدند میشدند
|
||||||
|
تفاهمنامه تفاهمنامه
|
||||||
|
آسیبپذیر آسیبپذیر
|
||||||
|
برونگرایی برونگرایی
|
||||||
|
جفتگیری جفتگیری
|
||||||
|
گرانبها گرانبها
|
||||||
|
میشوند میشوند
|
||||||
|
کلاهبرداری کلاهبرداری
|
||||||
|
جهتیابی جهتیابی
|
||||||
|
چشمپوشی چشمپوشی
|
||||||
|
بنیانگذاران بنیانگذاران
|
||||||
|
میکند میکند
|
||||||
|
الهامبخش الهامبخش
|
||||||
|
وقتگیر وقتگیر
|
||||||
|
پسلرزه پسلرزه
|
||||||
|
میکنند میکنند
|
||||||
|
میتواند میتواند
|
||||||
|
آرامبخش آرامبخش
|
||||||
|
بینام بینام
|
||||||
|
غربزدگی غربزدگی
|
||||||
|
بیتفاوت بیتفاوت
|
||||||
|
بیثباتی بیثباتی
|
||||||
|
پاسخگویی پاسخگویی
|
||||||
|
میگیرد میگیرد
|
||||||
|
جمعبندی جمعبندی
|
||||||
|
میشود میشود
|
||||||
|
میکنیم میکنیم
|
112
resource/normalizer/Dic2_new.txt
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
مهماننوازی مهماننوازی
|
||||||
|
صلیالله صلیالله
|
||||||
|
موافقتنامه موافقتنامه
|
||||||
|
اعتراضآمیز اعتراضآمیز
|
||||||
|
رییسجمهور رییسجمهور
|
||||||
|
چشمپوشی چشمپوشی
|
||||||
|
هیئتعلمی هیئتعلمی
|
||||||
|
الزامآور الزامآور
|
||||||
|
بیمهنامه بیمهنامه
|
||||||
|
آییننامه آییننامه
|
||||||
|
بتنریزی بتنریزی
|
||||||
|
تشییعجنازه تشییعجنازه
|
||||||
|
تامینکنندگان تامینکنندگان
|
||||||
|
پرسشنامه پرسشنامه
|
||||||
|
تحتالشعاع تحتالشعاع
|
||||||
|
شگفتانگیز شگفتانگیز
|
||||||
|
بزرگنمایی بزرگنمایی
|
||||||
|
نیمههادی نیمههادی
|
||||||
|
قابلکنترل قابلکنترل
|
||||||
|
روانپزشکی روانپزشکی
|
||||||
|
ضربالمثل ضربالمثل
|
||||||
|
اضافهکاری اضافهکاری
|
||||||
|
اختلافنظر اختلافنظر
|
||||||
|
بینالملل بینالملل
|
||||||
|
یکطرفه یکطرفه
|
||||||
|
موجشکن موجشکن
|
||||||
|
عزتنفس عزتنفس
|
||||||
|
بیسیم بیسیم
|
||||||
|
شیبدار شیبدار
|
||||||
|
دستیابی دستیابی
|
||||||
|
روانشناختی روانشناختی
|
||||||
|
عقبنشینی عقبنشینی
|
||||||
|
بهطور بهطور
|
||||||
|
خطچین خطچین
|
||||||
|
ادراکشده ادراکشده
|
||||||
|
خزانهداری خزانهداری
|
||||||
|
شیمیدرمانی شیمیدرمانی
|
||||||
|
آنسوی آنسوی
|
||||||
|
نقطهچین نقطهچین
|
||||||
|
منحصربهفرد منحصربهفرد
|
||||||
|
درحالتوسعه درحالتوسعه
|
||||||
|
رضایتبخش رضایتبخش
|
||||||
|
قرضالحسنه قرضالحسنه
|
||||||
|
هرجومرج هرجومرج
|
||||||
|
سیبزمینی سیبزمینی
|
||||||
|
میلیگرم میلیگرم
|
||||||
|
نخستوزیر نخستوزیر
|
||||||
|
تعیینکنندهای تعیینکنندهای
|
||||||
|
طاقتفرسا طاقتفرسا
|
||||||
|
قابلمشاهده قابلمشاهده
|
||||||
|
بهوسیله بهوسیله
|
||||||
|
قابلدستیابی قابلدستیابی
|
||||||
|
الهامبخش الهامبخش
|
||||||
|
پیدرپی پیدرپی
|
||||||
|
سرمایهداری سرمایهداری
|
||||||
|
لذتبخش لذتبخش
|
||||||
|
تخمگذار تخمگذار
|
||||||
|
گرمکن گرمکن
|
||||||
|
قابلتوجهی قابلتوجهی
|
||||||
|
فیلمبرداری فیلمبرداری
|
||||||
|
خدمتدهی خدمتدهی
|
||||||
|
معنیدار معنیدار
|
||||||
|
کلانشهری کلانشهری
|
||||||
|
گواهینامه گواهینامه
|
||||||
|
همجنس همجنس
|
||||||
|
همانطور همانطور
|
||||||
|
سیستمعامل سیستمعامل
|
||||||
|
حملونقل حملونقل
|
||||||
|
تفاهمنامه تفاهمنامه
|
||||||
|
بینالمللی بینالمللی
|
||||||
|
کلاهبرداری کلاهبرداری
|
||||||
|
نرمافزار نرمافزار
|
||||||
|
مضافالیه مضافالیه
|
||||||
|
قطعنامهای قطعنامهای
|
||||||
|
پاسخگویی پاسخگویی
|
||||||
|
عکسبرداری عکسبرداری
|
||||||
|
پسلرزه پسلرزه
|
||||||
|
خردهفروشی خردهفروشی
|
||||||
|
حقوقبشر حقوقبشر
|
||||||
|
تحلیلگران تحلیلگران
|
||||||
|
اینگونه اینگونه
|
||||||
|
صرفهجویی صرفهجویی
|
||||||
|
علیالخصوص علیالخصوص
|
||||||
|
کلانشهرها کلانشهرها
|
||||||
|
حاصلضرب حاصلضرب
|
||||||
|
اطلاعرسانی اطلاعرسانی
|
||||||
|
دندانپزشکی دندانپزشکی
|
||||||
|
پیشبرد پیشبرد
|
||||||
|
ایدهال ایدهال
|
||||||
|
هیچگاه هیچگاه
|
||||||
|
صنایعدستی صنایعدستی
|
||||||
|
سانتیمتر سانتیمتر
|
||||||
|
پیشبینی پیشبینی
|
||||||
|
خلیجفارس خلیجفارس
|
||||||
|
تاریخنگاری تاریخنگاری
|
||||||
|
هیچگونه هیچگونه
|
||||||
|
راهاندازی راهاندازی
|
||||||
|
جستوجوی جستوجوی
|
||||||
|
حاشیهنشینی حاشیهنشینی
|
||||||
|
رنگآمیزی رنگآمیزی
|
||||||
|
جمعآوری جمعآوری
|
||||||
|
وقتگیر وقتگیر
|
||||||
|
آرامبخش آرامبخش
|
||||||
|
غربزدگی غربزدگی
|
||||||
|
کلانشهر کلانشهر
|
||||||
|
نرمافزاری نرمافزاری
|
||||||
|
بدینوسیله بدینوسیله
|
||||||
|
جمعبندی جمعبندی
|
||||||
|
گفتوگو گفتوگو
|
||||||
|
حملونقل حملونقل
|
||||||
|
آیتالله آیتالله
|
||||||
|
حجتالاسلام حجتالاسلام
|
5
resource/normalizer/Dic3_new.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
حملونقل حملونقل
|
||||||
|
حجتالاسلاموالمسلمین حجتالاسلاموالمسلمین
|
||||||
|
آیتاللهالعظمی آیتاللهالعظمی
|
||||||
|
گفتوگو گفتوگو
|
||||||
|
حملونقل حملونقل
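Each line of these Dic*_new.txt files appears to hold a whitespace-separated pair mapping a surface spelling to its normalized form (the two columns look identical here because the zero-width non-joiner does not display). A minimal loader sketch under that assumption; the function name and path are illustrative, not part of the repo:

def load_normalizer_dict(path):
    # each non-empty line: "<variant> <normalized form>"
    mapping = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                mapping[parts[0]] = parts[1]
    return mapping

# e.g. mapping = load_normalizer_dict('resource/normalizer/Dic1_new.txt')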
|
BIN
resource/tokenizer/TokenMerger.pckl
Normal file
Binary file not shown.
BIN
resource/tokenizer/enDict
Normal file
Binary file not shown.
BIN
resource/tokenizer/faDict
Normal file
Binary file not shown.
59
tokenizer.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class Tokenizer():
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def tokenize_words(self, doc_string):
|
||||||
|
token_list = doc_string.strip().split()
|
||||||
|
#token_list = [x.strip("\u200c").strip('\u200f') for x in token_list if len(x.strip("\u200c")) != 0]
|
||||||
|
new_token_list = []
for token in token_list:
# strip zero-width non-joiner and right-to-left mark characters; skip tokens made up only of them
new_token = token.strip("\u200c").strip("\u200f")
if len(new_token) != 0:
new_token_list.append(new_token)
return new_token_list
|
||||||
|
|
||||||
|
def tokenize_sentences(self, doc_string):
|
||||||
|
#finding the numbers
|
||||||
|
pattern = r"[-+]?\d*\.\d+|\d+"
|
||||||
|
nums_list = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
|
||||||
|
|
||||||
|
pattern = r'([!\.\?؟]+)[\n]*'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r':\n'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r';\n'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r'؛\n'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r'[\n]+'
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
for number in nums_list:
|
||||||
|
pattern = 'floatingpointnumber'
|
||||||
|
doc_string = re.sub(pattern, number, doc_string, 1)
|
||||||
|
|
||||||
|
doc_string = doc_string.split('\t\t')
|
||||||
|
doc_string = [x for x in doc_string if len(x) > 0]
|
||||||
|
return doc_string
|
||||||
|
|
||||||
|
def add_tab(self, mystring):
|
||||||
|
mystring = mystring.group()  # the substring matched by the regex
mystring = mystring.strip(' ')  # omit the whitespace around the punctuation
mystring = mystring.strip('\n')  # omit the newlines around the punctuation
mystring = " " + mystring + "\t\t"  # add a space before the punctuation and a double tab after it as a sentence separator
|
||||||
|
return mystring
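# A hedged usage sketch of the Tokenizer above (the sample text is illustrative):
if __name__ == "__main__":
    _tok = Tokenizer()
    sample = "ماده 1 این قانون از تاریخ 1400 اجرا می شود. آیا استثنایی دارد؟ خیر!"
    print(_tok.tokenize_sentences(sample))   # sentences split at ., ؟ and ! boundaries
    print(_tok.tokenize_words(sample))       # whitespace tokens with ZWNJ/RLM stripped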
|