ai_dataset/import_data/ai_keyword_insert_02.py
2024-09-22 20:19:25 +03:30

183 lines
5.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import requests
from decimal import Decimal
import os
from funcs import read_from_json, write_to_json
# --- Request configuration ---------------------------------------------------
# SECURITY NOTE(review): a bearer token is committed in source. The literal is
# kept only as a fallback so existing runs behave exactly as before; prefer
# supplying the token through the AI_DATASET_TOKEN environment variable and
# rotating the committed value.
TOKEN = os.environ.get(
    "AI_DATASET_TOKEN",
    "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjcwMjA3MjQsImp0aSI6IlFsQUIxVmdjT0M5eTFGbkNnbkRHbU45a3VHYXVtNThaIiwiaXNzIjpudWxsLCJleHAiOjE3MjgzMjA3MjMsImF1ZCI6bnVsbCwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.qhTF5dXo3zYJnMdlXmH4S8e6FmWjobrp2TabM19bLr4",
)
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
# Alternative endpoint, currently disabled:
# url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
url = "http://192.168.23.160:8080/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS

# --- Input data ----------------------------------------------------------------
# The input file is resolved relative to the current working directory. When the
# working directory path contains "import_data" the script is assumed to be run
# from inside that folder; otherwise the "import_data" folder is added.
# NOTE(review): this is a substring test on the whole path, so any path that
# merely contains "import_data" takes the first branch.
_DATA_FILE_NAME = "clean_sections_kw_15k.json"
_working_dir = os.getcwd()
if "import_data" in _working_dir:
    address = os.path.join(_working_dir, "data", _DATA_FILE_NAME)
else:
    address = os.path.join(_working_dir, "import_data", "data", _DATA_FILE_NAME)

# Load the .json file.
lines = read_from_json(address)
# (disabled) convert the text into a list of lines
# lines = input_text.strip().split('\n')

# Module-level scratch state. These names are kept because code further down
# the file may still read them (tokenNumber in particular).
key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that can also serialize ``decimal.Decimal`` values."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        ``Decimal`` instances are converted to ``float``; every other type is
        handed to the base class, which raises ``TypeError`` for objects it
        cannot serialize.
        """
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)
def createIndex(
    id,
    content,
    result_objects,
    domain="استخراج کلیدواژه 15 هزارتایی",
    ref_id="",
    ref_url="",
):
    """Build the record dictionary for one section.

    Args:
        id: Identifier stored under the ``"id"`` key.
        content: Section text stored under the ``"content"`` key.
        result_objects: Value stored under ``"result_objects"``. The callers in
            this file pass a list of dicts with the keys ``"task"``, ``"key"``,
            ``"label"`` and ``"values"``.
        domain: Value for the ``"domain"`` key; defaults to the label that was
            previously hard-coded.
        ref_id: Value for the ``"ref_id"`` key (empty by default).
        ref_url: Value for the ``"ref_url"`` key (empty by default).

    Returns:
        A dict with the keys ``id``, ``content``, ``domain``, ``ref_id``,
        ``ref_url`` and ``result_objects``, in that order.
    """
    return {
        "id": id,
        "content": content,
        "domain": domain,
        "ref_id": ref_id,
        "ref_url": ref_url,
        "result_objects": result_objects,
    }
def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Close the currently open span (if any) and append it to *result_token*.

    When *key* is truthy, a dict ``{"begin", "end", "result_key", "text"}`` is
    appended to *result_token*, with ``end`` set to ``tokenNumber - 1``, and the
    returned span state is reset. When *key* is falsy nothing is appended.

    Args:
        text: Text of the span being closed.
        key: Label of the open span; a falsy value means no span is open.
        begin: Start index of the open span.
        tokenNumber: Index of the current token; the span ends one before it.
        result_token: List that receives the span dict (mutated in place).

    Returns:
        A tuple ``(key, begin, end, result_token)``. After a span was appended
        this is ``("", -1, -1, result_token)``; otherwise *key* and *begin* are
        returned unchanged and ``end`` is ``-1``.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # state reset is assumed to belong to the "span is open" branch - confirm.
    if not key:
        return key, begin, -1, result_token

    result_token.append(
        {"begin": begin, "end": tokenNumber - 1, "result_key": key, "text": text}
    )
    return "", -1, -1, result_token
def extract_keywords(keywords_text):
    """Extract numbered keyword lines from a block of generated text.

    The text is split on "colon followed by a newline"; only the piece after
    the first such separator (and before the next one, if any) is parsed. Each
    non-empty line that starts with a digit contributes one keyword: the
    leading one- or two-digit number plus the single character after it (for
    example ``"1."`` or ``"12)"``) is cut off and the rest is stripped.
    Parsing stops at the first non-empty line that follows a blank line, which
    is treated as trailing explanation rather than keywords.

    Args:
        keywords_text: Raw text containing a header line ending in ``:`` and a
            numbered list below it.

    Returns:
        A list of ``{"text": keyword}`` dicts; empty when the text has no
        header separator.
    """
    sections = keywords_text.split(":\n")
    if len(sections) < 2:
        # No header separator, hence no keyword list to parse.
        return []
    candidate_lines = sections[1].lstrip("\n").split("\n")

    final_keywords = []
    for index, key in enumerate(candidate_lines):
        if key == "":
            continue
        # Text after a blank line is extra explanation that follows the
        # keywords. The index guard matters: without it the first line would
        # be compared against the *last* line (index -1), so any text ending
        # in a newline produced no keywords at all.
        if index > 0 and candidate_lines[index - 1] == "":
            break
        if not key[0].isdigit():
            continue
        # A two-digit number needs one more character removed than a
        # one-digit number; the length check protects one-character lines.
        prefix_length = 3 if len(key) > 1 and key[1].isdigit() else 2
        final_keywords.append({"text": key[prefix_length:].strip()})
    return final_keywords
def convert_to_obj(kewords_list):
    """Convert ``(text, score)`` pairs into keyword dictionaries.

    Args:
        kewords_list: Iterable of indexable items whose element 0 is the
            keyword text and element 1 is its score.

    Returns:
        A list of ``{"text": <element 0>, "score": str(<element 1>)}`` dicts,
        in input order.
    """
    return [
        {"text": item[0], "score": str(item[1])}
        for item in kewords_list
    ]
# --- Import loop -----------------------------------------------------------------
# Every input row becomes one record; records are POSTed to the endpoint in
# batches, and whatever is left after the loop is sent as a final batch.

# A batch is sent as soon as it holds more than this many records.
BULK_LIMIT = 500


def _post_bulk(records):
    """Serialize *records* to JSON, POST them to ``url`` and print the response."""
    # NOTE(review): no timeout is set and the HTTP status is not checked; a
    # failed batch is only visible through the printed response object.
    payload = json.dumps(records, cls=JSONEncoder)
    response = requests.post(url, headers=headers, data=payload)
    print(response)
    return response


bulk_data = []
# Starts at 1, so the value printed for the final batch is one more than the
# number of records buffered in it.
bulk_count = 1
count = 0
for count, line in enumerate(lines, start=1):
    content = line["content"]
    result_objects = [
        {
            "task": "keyword",
            "key": "llama3_keywords_prompt",
            "label": "لاما3 - با پرامپت",
            # The stored keywords are used as-is (no extract_keywords pass).
            "values": line["keywords"],
        }
    ]
    record_id = "ai_kw_" + str(count).zfill(5)
    bulk_data.append(createIndex(record_id, content, result_objects))
    bulk_count += 1

    if len(bulk_data) > BULK_LIMIT:
        print("=" * 30)
        print("count " + str(count))
        _post_bulk(bulk_data)
        bulk_data = []
        bulk_count = 1

# Send the records that did not fill a complete batch.
if bulk_data:
    print(bulk_count)
    _post_bulk(bulk_data)

print("******* insert operation finished! *******")