# ai_dataset/import_data/ai_keyword_update.py
#
# 173 lines
# 5.4 KiB
# Python
# Raw Permalink Normal View History
#
# 2024-09-17 16:44:08 +00:00
import json
import requests
from decimal import Decimal
import os
from funcs import read_from_json
# --- API configuration --------------------------------------------------------
# The bearer token may be supplied through the TAVASI_TOKEN environment
# variable. The literal below is only the fallback, so runs behave exactly as
# before when the variable is unset or empty.
# NOTE(review): a credential committed to source control should be rotated and
# removed; the embedded JWT also appears to carry an expiry claim — confirm.
TOKEN = os.environ.get("TAVASI_TOKEN") or (
    "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTk5MzI2MDUsImp0aSI6IkZcLzRQcnRYckdnbDZ5bFg0QVUzdnVzRFlMZ2ZHXC93NnUiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIxMjMyNjA0LCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.f8y7Qce6u88atEtuyGI-cS3eqcupH5-Ow1ihfQTkV98"
)
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS

# --- Input file ---------------------------------------------------------------
# The keyword file is located relative to the current working directory: when
# the working directory path already contains the data-import folder name only
# the "data/..." suffix is appended, otherwise the folder name is added too.
# NOTE(review): the folder is spelled "impoert_data" here; the spelling is kept
# unchanged — confirm it matches the directory name on disk.
address = os.getcwd()
if "impoert_data" in address:
    address += "/data/section_keywords_0413.json"
else:
    address += "/impoert_data/data/section_keywords_0413.json"

# Load the records to upload. `read_from_json` is a project helper; the upload
# loop indexes every item by string key, so it presumably returns a list of
# dicts — verify against `funcs.read_from_json`.
lines = read_from_json(address)

# --- Module-level scratch state -----------------------------------------------
# `tokenNumber`, `content` and `result_token` are read and reset by the upload
# loop; `key`, `begin` and `end` are only referenced by commented-out code but
# are kept so the module namespace stays the same.
key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that also knows how to serialize ``Decimal`` values."""

    def default(self, obj):
        """Return a JSON-serializable replacement for *obj*.

        ``Decimal`` instances are converted to ``float``. Every other type is
        handed to the base encoder's ``default``.
        """
        if not isinstance(obj, Decimal):
            return super().default(obj)
        return float(obj)
def createIndex(content, result_objects):
    """Build the record dict for one text and its task results.

    Args:
        content: the text the results belong to.
        result_objects: list of per-model result entries. Judging by the
            sample that used to be kept here as a comment, each entry has the
            shape ``{"task": "keyword", "key": <model key>,
            "label": <display label>, "values": [...]}``.

    Returns:
        dict: a record with the keys ``content``, ``domain``, ``ref_id``,
        ``ref_url`` and ``result_objects``. The domain is a fixed label and
        both reference fields are empty strings. ``result_objects`` is stored
        by reference, not copied.
    """
    return {
        "content": content,
        "domain": "استخراج کلیدواژه",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Record the currently open labelled span, if there is one.

    When *key* is truthy, a span entry running from *begin* to
    ``tokenNumber - 1`` is appended to *result_token* (mutated in place) and
    the returned state is reset to ``("", -1, -1)``. When *key* is falsy
    nothing is appended; *key* and *begin* are returned as given and the
    returned end is ``-1``.

    Returns:
        tuple: ``(key, begin, end, result_token)``.
    """
    # NOTE(review): the state reset is assumed to belong to the `if key`
    # branch (otherwise the initial `end = -1` would be redundant) — confirm.
    if not key:
        return key, begin, -1, result_token

    last_token = tokenNumber - 1
    span = {"begin": begin, "end": last_token, "result_key": key, "text": text}
    result_token.append(span)
    return "", -1, -1, result_token
def extract_keywords(keywords_text):
    r"""Parse a numbered keyword list out of a free-text answer.

    The keyword section is the text between the first and the second ``":\n"``
    in *keywords_text*. Leading blank lines of that section are dropped, then
    it is read line by line:

    * empty lines are skipped;
    * a non-empty line that directly follows an empty one is treated as
      trailing commentary and ends the parse;
    * a line starting with one digit loses its first two characters
      (e.g. ``"1."``), a line starting with two digits loses its first three
      (e.g. ``"10."``); the remainder is stripped and collected.

    Args:
        keywords_text: text containing ``":\n"`` followed by one numbered
            keyword per line.

    Returns:
        list[dict]: ``[{"text": keyword}, ...]`` in input order. Empty when
        *keywords_text* contains no ``":\n"`` marker.
    """
    sections = keywords_text.split(":\n")
    if len(sections) < 2:
        # No keyword section at all: report "no keywords" rather than crashing.
        return []
    rows = sections[1].lstrip("\n").split("\n")

    final_keywords = []
    for index, row in enumerate(rows):
        if row == "":
            continue
        # A non-empty line right after a blank one means extra explanation
        # follows the keyword list. `index > 0` keeps the first line from
        # comparing against rows[-1], which would wrongly end the parse
        # whenever the text finishes with a newline.
        if index > 0 and rows[index - 1] == "":
            break
        # NOTE(review): lines that do not start with a digit are assumed to be
        # ignored — confirm against the intended nesting of the original.
        if not row[0].isdigit():
            continue
        # Two-digit numbering ("10.") needs one more character removed; the
        # length check avoids indexing past a one-character line.
        prefix_len = 3 if len(row) > 1 and row[1].isdigit() else 2
        final_keywords.append({"text": row[prefix_len:].strip()})
    return final_keywords
def convert_to_obj(kewords_list):
    """Convert keyword/score pairs into the dict form used in result objects.

    Args:
        kewords_list: iterable of indexable items whose element 0 is the
            keyword text and element 1 is its score.

    Returns:
        list[dict]: one ``{"text": ..., "score": ...}`` dict per item, in
        order, with the score converted to ``str``.
    """
    return [{"text": pair[0], "score": str(pair[1])} for pair in kewords_list]
# --- Upload loop ----------------------------------------------------------------
# Builds one record per input line and POSTs the records to `url` in batches.
# NOTE(review): the source lost its indentation; the nesting below is the most
# plausible reconstruction (record creation happens for every line, not only
# for lines that have keywords) — confirm against the original file.
bulk_data = []   # records waiting to be sent
bulk_count = 1   # starts at 1, +1 per buffered record, reset to 1 after a send
count = 0        # number of input lines processed so far
text = ""        # not used by the active code in this section
aikeis = []      # not used by the active code in this section

for i, line in enumerate(lines):
    #print("line: " + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    content = line["content"]
    result_objects = []

    # Attach the llama3 keywords only when the line actually has some.
    if line["llama3_org_keywords"]:
        llam_kw = convert_to_obj(line["llama3_org_keywords"])
        result_objects.append(
            {
                "task": "keyword",
                "key": "llama3_keywords_noprompt2",
                "label": "لاما - بدون پرامپت 2",
                "values": llam_kw,
            })

    data = createIndex(content, result_objects)

    # Reset the per-line scratch state before the next iteration.
    tokenNumber = -1
    content = ""
    result_token = []
    result_objects = []

    bulk_data.append(data)
    bulk_count += 1

    # Send once more than 10 records are buffered (i.e. batches of 11).
    if bulk_data.__len__() > 10:
        print("=" * 30)
        print("count " + str(count))
        payload = json.dumps(bulk_data, cls=JSONEncoder) # Works!
        response = requests.request("POST", url, headers=headers, data=payload)
        # The response body is only printed; the status code is not checked.
        print(response.text)
        bulk_data = []
        bulk_count = 1

# Flush a record still held in `content`. Under the layout above `content` is
# reset to "" at the end of every iteration, so this branch should not fire —
# NOTE(review): confirm before removing it.
if content != "":
    # key, begin, end, result_token = appendResultToken(
    # text, key, begin, tokenNumber, result_token
    # )
    data = createIndex(content, result_token)
    bulk_data.append(data)
    bulk_count += 1

# Send whatever is left in the final, partially filled batch.
if bulk_data.__len__() > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder) # Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)

# Display the output dictionary as JSON
print("******* update operation finished! *******")