# NOTE: web-viewer metadata residue ("163 lines / 6.4 KiB / Python") removed during cleanup.
import json
|
|
from sentence_transformers import SentenceTransformer, util
|
|
# from normalizer import cleaning
|
|
import os
|
|
import torch
|
|
# from inference import inference_main
|
|
from funcs import read_from_json, save_to_file_by_address, add_columndata_to_excel , read_from_excel
|
|
from datetime import datetime
|
|
#from general_functions import normalize_content
|
|
|
|
#model_path = './paraphrase-multilingual-mpnet-base-v2-1401-07-30'
|
|
#model_path = '/home/gpu/NLP/MLM/MODELS/training_stsbenchmark-HooshvareLab-bert-fa-base-uncased-finetuned-2-pt-2024-02-20_16-55-15'
|
|
|
|
|
|
def find_similarity(value_1, value_2):
    """Return the cosine-similarity matrix between two embeddings.

    Each argument is a single embedding (a tensor or a list of floats).
    Both are wrapped in a one-element batch so ``util.cos_sim`` yields a
    1x1 similarity tensor.

    Args:
        value_1: first embedding.
        value_2: second embedding.

    Returns:
        torch.Tensor: 1x1 tensor holding the cosine similarity.
    """
    # util.cos_sim accepts lists or tensors and handles device placement
    # itself, so no manual torch.tensor(...)/device juggling is needed
    # (the previous experimental conversion code was removed).
    cosine_scores = util.cos_sim([value_1], [value_2])
    return cosine_scores
|
|
|
|
def get_embedding(text):
    """Encode *text* into a dense embedding tensor.

    Uses the module-level ``encoder`` (a SentenceTransformer instance
    created in the ``__main__`` block) and returns the embedding as a
    torch tensor.
    """
    return encoder.encode(text, convert_to_tensor=True)
|
|
|
|
|
|
def method():
    """Resolve REFERENCE entities in NER output to known law titles.

    Reads the "ai_nlp_results" column from the Excel file, and for every
    line of the form ``Span[a:b]: "value"/TYPE / score`` whose TYPE is
    ``REFERENCE``, looks up the most similar law via embedding similarity
    and annotates the output. The annotated text per section is appended
    to a 'founded_law_3.txt' log and finally written back to the Excel
    file as a new column.

    Relies on module-level ``current_path`` and helpers from ``funcs``.

    Returns:
        True on completion.
    """
    file_address = current_path + '/data/eee.xlsx'#archive.tgz
    column_name = "ai_nlp_results"

    sections_ner = read_from_excel(file_address, column_name)
    print(f'number of sections: {str(len(sections_ner))}')

    new_nlp_list = []
    for index, detected_ner in enumerate(sections_ner):
        print(f'section number: {index+1}')
        print('*'*80)

        ner_items = detected_ner.split('\n')
        new_nlp = ''
        for ner in ner_items:
            # Expected line shape: Span[a:b]: "value"/TYPE / score
            # Splitting on the double quote isolates the quoted value and
            # the trailing /TYPE / score part.
            try:
                ner_parts = ner.split('\"')
                detected_value = ner_parts[1].strip()
                detected_type = ner_parts[2].lstrip("/")
                detected_type = detected_type.split("/")
                detected_type = detected_type[0].strip()
            except IndexError:
                # Line does not match the Span format (e.g. blank line);
                # skip it. (Was a bare `except:` that also swallowed
                # KeyboardInterrupt.)
                continue

            if detected_type == 'REFERENCE':
                try:
                    found_law = find_related_law(detected_value)
                    found_item = f"""########################################################################
{detected_value} | {detected_type}
found_law: {found_law['caption']} | score: {found_law['similarity']}
url: https://qavanin.ir/Law/TreeText/{found_law['law_id']}
########################################################################\n"""
                    # Save the match to the text log file.
                    save_to_file_by_address(current_path + '/founded_law_3.txt',found_item)
                    new_nlp = new_nlp + found_item
                except Exception:
                    # Best-effort: mark the unresolved reference and keep
                    # going rather than aborting the whole section.
                    new_nlp = new_nlp + ner + ' ??????????????? \n'
            else:
                new_nlp = new_nlp + ner + '\n'

        new_nlp_list.append(new_nlp)

    new_column_name = 'ai_nlp_values_6'
    add_columndata_to_excel(file_address, new_column_name, new_nlp_list)
    print('*** excel file updated! ***')

    return True
|
|
|
|
def find_related_law(detected_value):
    """Find the law whose title is most similar to *detected_value*.

    Pre-processes the detected reference, strips a leading "قانون"
    ("law") token so it matches the caption embeddings stored in the
    JSON (which omit it), then scores it against every entry of the
    module-level ``law_dict`` by cosine similarity.

    Args:
        detected_value: the reference text extracted by NER.

    Returns:
        dict with keys ``law_id``, ``similarity`` and ``caption`` for
        the best-scoring law.
    """
    detected_value = pre_process(detected_value)

    # BUGFIX: the original used str.lstrip('قانون'), which strips any
    # leading run of the *characters* {ق,ا,ن,و} rather than the prefix
    # word — titles starting with e.g. "نور" would be mangled. Remove
    # the exact prefix instead.
    prefix = 'قانون'
    if detected_value.startswith(prefix):
        detected_value = detected_value[len(prefix):]
    detected_value = detected_value.strip()

    detected_value_embedding = get_embedding(detected_value)

    similarity_arr = []
    for law in law_dict:
        # Caption embeddings are precomputed and stored in the JSON.
        similarity_value = find_similarity(detected_value_embedding.tolist(),
                                           law['caption_embedding'])
        similarity_arr.append({'law_id': law['id'],
                               'similarity': similarity_value,
                               'caption': law['caption']})

    # Highest similarity first; the dead progress counter wrapped in
    # try/except was removed.
    sorted_similarity_arr = sorted(similarity_arr,
                                   key=lambda item: item['similarity'],
                                   reverse=True)
    return sorted_similarity_arr[0]
|
|
|
|
def pre_process(text):
    """Normalisation hook for reference text.

    Currently a passthrough — the ``normalize_content`` call is disabled
    — so *text* is returned unchanged.
    """
    #text = normalize_content(text)
    return text
|
|
|
|
if __name__ == "__main__":

    # Load the fine-tuned sentence encoder used by get_embedding().
    model_path = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
    encoder = SentenceTransformer(model_path)

    current_path = os.getcwd()
    # When launched from outside the project tree, point at the project
    # directory. (Was the unidiomatic current_path.__contains__('jokar').)
    if 'jokar' not in current_path:
        current_path = os.getcwd() + '/jokar/Flair_NER'

    # Precomputed law titles + caption embeddings used by find_related_law().
    law_dict = read_from_json(current_path + '/data/law_title.json')

    # Entry points left disabled by the author — uncomment to run:
    # find_related_law('d')
    # method()
    # print(' operation finished!')
    # print(datetime.now())
|