import json
from decimal import Decimal

import requests

# Hardcoded JWT used to authenticate against the API
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg3ODk5OTEsImp0aSI6IlNGaWVOcWIxeEFzZ252QmtvUkxXWU9UbXR2VTNvT3R6IiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcyMDA4OTk5MCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjEsImZpcnN0X25hbWUiOiJcdTA2MjhcdTA2MzFcdTA2NDZcdTA2MjdcdTA2NDVcdTA2NDcgXHUwNjQ2XHUwNjQ4XHUwNmNjXHUwNjMzIiwibGFzdF9uYW1lIjoiXHUwNjQxXHUwNjQ2XHUwNmNjIiwiZW1haWwiOiJkZXZAZ21haWwuY29tIiwidXNlcm5hbWUiOiJkZXYiLCJ1c2VyX2xldmVsIjoyfX0.7DzFqHLee3ZI7EnZYjy5ChtVWhT3QJvBNUbLUdPssSo'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

# Endpoint for bulk-adding keyword results to the qasection dataset
url = "https://api.tavasi.ir/repo/dataset/multi/add/qasection/keyword"
headers = HEADERS


# # Open the text file
# file_path = 'G:/_majles/ner_law_dataset.txt'
# with open(file_path, 'r', encoding='utf-8') as file:
#     input_text = file.read()

# # Convert the text into a list of lines
# lines = input_text.strip().split('\n')

file_address = './new_law_excel.xlsx'
# output_file_address = './output/keywords_law_excel.xlsx'
column_name = "content"
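
# NOTE: read_from_excel is not defined in this file. Below is a minimal
# sketch, assuming the helper reads a single column of an Excel sheet with
# pandas; the original implementation may differ.
import pandas as pd  # assumed dependency for this sketch


def read_from_excel(file_address, column_name):
    """Return the non-empty values of one Excel column as a list."""
    df = pd.read_excel(file_address)
    return df[column_name].dropna().tolist()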


contents = read_from_excel(file_address, column_name)

# Collect the sections; the NER inference step is commented out here
contents_list = []
for index, section in enumerate(contents):
    # ner_values = inference_main(model, section)
    contents_list.append(section)
    # contents_list = contents_list + section + '\n********************\n'

new_column_name = 'content_keywords'

# Bookkeeping variables (only tokenNumber is updated below; the rest are
# unused in this script)
key = ''
begin = -1
end = -1
tokenNumber = -1
content = ''
result_token = []


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes Decimal values as floats."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)
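
# Hypothetical usage check for the encoder above:
# >>> json.dumps({"score": Decimal("0.75")}, cls=JSONEncoder)
# '{"score": 0.75}'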


# content : the main text of one section
# extracted_keywords : list of keywords extracted for that section
def createIndex(content, extracted_keywords):
    """Wrap one text section and its keywords in the API's bulk-add format."""
    result_objects = [{
        "task": "keyword",
        "key": "lama3-8b",
        "label": "لاما3 فارسی شده",  # "Llama 3, adapted for Persian"
        "values": extracted_keywords,
    }]
    output = {
        "content": content,
        "domain": "مقررات",  # "regulations"
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
    # print(output)
    # print(json.dumps(output, indent=4, ensure_ascii=False))
    return output
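
# For illustration, one element of the bulk payload built above looks like
# this (the keyword values here are hypothetical, not from the source data):
#
# {
#     "content": "...",
#     "domain": "مقررات",
#     "ref_id": "",
#     "ref_url": "",
#     "result_objects": [
#         {"task": "keyword", "key": "lama3-8b",
#          "label": "لاما3 فارسی شده", "values": ["keyword1", "keyword2"]}
#     ]
# }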

bulk_data = []
bulk_count = 1
count = 0

for mentry in contents_list:
    count += 1
    tokenNumber = tokenNumber + 1

    # Keywords would normally come from a model; here the list stays empty
    extracted_keywords = []

    data = createIndex(mentry, extracted_keywords)
    bulk_data.append(data)
    bulk_count += 1

    # Flush to the API once more than 10 entries have accumulated
    if len(bulk_data) > 10:
        print('=' * 10)
        print('count ' + str(count))
        payload = json.dumps(bulk_data, cls=JSONEncoder)
        response = requests.post(url, headers=headers, data=payload)
        print(response.text)
        bulk_data = []
        bulk_count = 1

# Post any leftover entries that did not fill a full batch
if len(bulk_data) > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder)
    response = requests.post(url, headers=headers, data=payload)
    print(response.text)

# Display the output dictionary as JSON
print("***************** end ")