import json
import os
from decimal import Decimal

import requests

from funcs import read_from_json, write_to_json

# NOTE: hardcoded bearer token; consider loading credentials from the environment.
TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjcwMjA3MjQsImp0aSI6IlFsQUIxVmdjT0M5eTFGbkNnbkRHbU45a3VHYXVtNThaIiwiaXNzIjpudWxsLCJleHAiOjE3MjgzMjA3MjMsImF1ZCI6bnVsbCwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.qhTF5dXo3zYJnMdlXmH4S8e6FmWjobrp2TabM19bLr4"
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
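
# A minimal alternative sketch (assumption: an API_TOKEN environment variable
# holds the token; that variable is not part of the original script):
#   TOKEN = os.environ.get("API_TOKEN", TOKEN)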

# url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
url = "http://192.168.23.160:8080/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS

# Build the input path relative to the current working directory, so the script
# works whether it runs from the repo root or from inside import_data/.
address = os.getcwd()
if "import_data" in address:
    address += "/data/clean_sections_kw_15k.json"
else:
    address += "/import_data/data/clean_sections_kw_15k.json"

# Open the .json file.
lines = read_from_json(address)

# Convert the text into a list of lines.
# lines = input_text.strip().split('\n')

key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []


class JSONEncoder(json.JSONEncoder):
    # Serialize Decimal values as floats so json.dumps does not raise TypeError.
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return json.JSONEncoder.default(self, obj)
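
# Minimal usage sketch of the encoder above (illustrative values):
#   json.dumps({"score": Decimal("0.95")}, cls=JSONEncoder)
#   -> '{"score": 0.95}'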


def createIndex(id, content, result_objects):
    # Expected shape of result_objects:
    # result_objects = [
    #     {
    #         "task": "keyword",
    #         "key": "base_bert",
    #         "label": "برت اصلی",
    #         "values": result_token,
    #     },
    #     {
    #         "task": "keyword",
    #         "key": "trained_bert",
    #         "label": "برت آموزش دیده",
    #         "values": result_token,
    #     },
    #     {"task": "keyword", "key": "llama3", "label": "لاما 3", "values": result_token},
    # ]
    output = {
        "id": id,
        "content": content,
        "domain": "استخراج کلیدواژه 15 هزارتایی",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
    # print(output)
    # print(json.dumps(output, indent=4, ensure_ascii=False))
    return output
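
# Illustrative call (hypothetical values):
#   createIndex("ai_kw_00001", "sample text", [])
#   -> {"id": "ai_kw_00001", "content": "sample text",
#       "domain": "استخراج کلیدواژه 15 هزارتایی",
#       "ref_id": "", "ref_url": "", "result_objects": []}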


def appendResultToken(text, key, begin, tokenNumber, result_token):
    # Close the token span that started at `begin`, record it, and reset state.
    # (Currently unused in this script.)
    end = -1

    if key:
        end = tokenNumber - 1
        result_token.append(
            {"begin": begin, "end": end, "result_key": key, "text": text}
        )
        begin = -1
        end = -1
        key = ""
    return key, begin, end, result_token
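
# Example (hypothetical values): with key="kw", begin=3, tokenNumber=6 the call
# appends {"begin": 3, "end": 5, "result_key": "kw", "text": text} and returns
# ("", -1, -1, result_token).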


def extract_keywords(keywords_text):
    # Split off the part of the text that contains the keywords.
    temp_text = keywords_text.split(":\n")[1]
    temp_text = temp_text.lstrip("\n")
    keywords = temp_text.split("\n")
    final_keywords = []
    for index, key in enumerate(keywords):
        if key == '':
            continue
        # A blank line before the current entry means the keyword list has ended
        # and extra explanatory text follows, so stop here. The index > 0 guard
        # avoids wrongly matching keywords[-1] on the first iteration.
        if index > 0 and keywords[index - 1] == "":
            break
        first_character = key[0]
        if first_character.isdigit():
            if len(key) > 1 and key[1].isdigit():
                # The entry starts with a two-digit number ("12. ..."), so take
                # everything from the fourth character onward as the keyword.
                final_keywords.append({"text": (key[3:]).strip()})
                continue
            final_keywords.append({"text": (key[2:]).strip()})

    return final_keywords
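
# Example: extract_keywords("keywords:\n1. قانون\n2. مجلس")
#   -> [{"text": "قانون"}, {"text": "مجلس"}]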


def convert_to_obj(keywords_list):
    # Convert (text, score) pairs into the object form the API expects;
    # scores are stringified. (Currently unused in this script.)
    keywords_obj_list = []
    for key in keywords_list:
        keywords_obj_list.append({
            "text": key[0],
            "score": str(key[1]),
        })
    return keywords_obj_list
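
# Example (hypothetical input): convert_to_obj([("قانون", 0.87)])
#   -> [{"text": "قانون", "score": "0.87"}]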


bulk_data = []
bulk_count = 1
count = 0
text = ""
new_json_data = []
aikeis = []
for i, line in enumerate(lines):
    # print("line: " + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    content = line["content"]
    result_objects = []
    llam_prompt_kws = line["keywords"]
    # values = extract_keywords(llam_prompt_kw)
    values = llam_prompt_kws
    result_objects.append(
        {
            "task": "keyword",
            "key": "llama3_keywords_prompt",
            "label": "لاما3 - با پرامپت",
            "values": values,
        }
    )

    id = "ai_kw_" + str(count).zfill(5)

    # new_json_data.append({
    #     "id": id,
    #     "content": content
    # })

    data = createIndex(id, content, result_objects)
    tokenNumber = -1
    content = ""
    result_token = []
    result_objects = []

    bulk_data.append(data)

    bulk_count += 1
    # Flush a batch to the API once more than 10 documents have accumulated.
    if len(bulk_data) > 10:
        print("=" * 30)
        print("count " + str(count))
        payload = json.dumps(bulk_data, cls=JSONEncoder)
        response = requests.post(url, headers=headers, data=payload)
        print(response.text)
        bulk_data = []
        bulk_count = 1


# Send any remaining documents that did not fill a complete batch.
if len(bulk_data) > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder)
    response = requests.post(url, headers=headers, data=payload)
    print(response.text)
# print(len(aikeis))

# address2 = os.getcwd()
# if "import_data" in address2:
#     address2 += "\\data\\sample_sections.json"
# else:
#     address2 += "\\import_data\\data\\sample_sections.json"
# write_to_json(new_json_data, address2)

# Display the output dictionary as JSON.
print("******* insert operation finished! *******")