"""Upload per-section keyword-extraction results to the dataset API in bulk.

Reads section records from ``section_keywords_0413.json``, turns the keyword
outputs stored on each record into ``result_objects`` entries, POSTs the
records in batches, and finally writes an ``id``/``content`` listing of all
processed records to ``sample_sections.json``.
"""

import json
import requests
from decimal import Decimal
import os
from funcs import read_from_json, write_to_json

# NOTE(review): this bearer credential is committed in source; consider loading
# it from the environment or a secrets store instead.
TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTk5MzI2MDUsImp0aSI6IkZcLzRQcnRYckdnbDZ5bFg0QVUzdnVzRFlMZ2ZHXC93NnUiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIxMjMyNjA0LCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.f8y7Qce6u88atEtuyGI-cS3eqcupH5-Ow1ihfQTkV98"
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS


def _data_path(filename):
    """Return the path of *filename* inside the project's ``data`` directory.

    When the current working directory already contains ``impoert_data`` in
    its path, ``data`` is resolved directly under it; otherwise
    ``impoert_data/data`` is appended first.
    """
    base = os.getcwd()
    if "impoert_data" in base:
        return os.path.join(base, "data", filename)
    return os.path.join(base, "impoert_data", "data", filename)


address = _data_path("section_keywords_0413.json")

# Load the input records from the .json file.
lines = read_from_json(address)

# Module-level scratch state. Only tokenNumber, content and result_token are
# touched by the upload loop below; the others are kept so the module
# namespace stays as it was.
key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``Decimal`` values as floats."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)


def createIndex(id, content, result_objects):
    """Build the record dict that is sent to the API for one section.

    Args:
        id: Identifier stored under ``"id"``.
        content: Section text stored under ``"content"``.
        result_objects: List stored under ``"result_objects"``. The upload
            loop in this file passes dicts with the keys ``"task"``,
            ``"key"``, ``"label"`` and ``"values"``.

    Returns:
        A dict with the keys ``id``, ``content``, ``domain``, ``ref_id``,
        ``ref_url`` and ``result_objects``.
    """
    return {
        "id": id,
        "content": content,
        "domain": "استخراج کلیدواژه",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }


def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Append a token span to *result_token* when *key* is truthy.

    When *key* is truthy, a dict with ``begin``, ``end`` (``tokenNumber - 1``),
    ``result_key`` and ``text`` is appended and the returned ``key``/``begin``/
    ``end`` are reset to ``""``/``-1``/``-1``. Otherwise nothing is appended
    and *key* and *begin* are returned unchanged with ``end == -1``.

    Returns:
        The tuple ``(key, begin, end, result_token)``.
    """
    end = -1
    if key:
        end = tokenNumber - 1
        result_token.append(
            {"begin": begin, "end": end, "result_key": key, "text": text}
        )
        begin = -1
        end = -1
        key = ""
    return key, begin, end, result_token


def extract_keywords(keywords_text):
    """Extract the numbered keywords that follow the first ``":\\n"``.

    Only the segment between the first and second ``":\\n"`` is scanned. Each
    line that starts with a digit contributes one keyword: the first two
    characters are dropped for a one-digit prefix, the first three for a
    two-digit prefix, and the remainder is stripped. Lines that do not start
    with a digit are ignored. Scanning stops at the first non-empty line that
    follows a blank line.

    Args:
        keywords_text: Raw text containing a header ending in ``":\\n"``
            followed by the keyword lines.

    Returns:
        A list of ``{"text": keyword}`` dicts; empty when *keywords_text* has
        no ``":\\n"`` separator or no numbered lines.
    """
    # Isolate the part of the text that holds the keywords.
    parts = keywords_text.split(":\n")
    if len(parts) < 2:
        # No header separator: nothing to extract (the caller reports this).
        return []
    keywords = parts[1].lstrip("\n").split("\n")

    final_keywords = []
    for index, key in enumerate(keywords):
        if key == "":
            continue
        # A blank line before this one means the keyword list has ended and
        # extra explanation follows. Guard index 0 so the check does not wrap
        # around to the last element (a trailing newline would otherwise end
        # the scan before the first keyword).
        if index > 0 and keywords[index - 1] == "":
            break
        if not key[0].isdigit():
            continue
        # A two-digit number takes one more character than a one-digit one,
        # so the keyword starts at the fourth character instead of the third.
        # Slicing keeps a one-character line from raising IndexError.
        prefix_length = 3 if key[1:2].isdigit() else 2
        final_keywords.append({"text": key[prefix_length:].strip()})
    return final_keywords


def convert_to_obj(kewords_list):
    """Convert ``(text, score)`` pairs into ``{"text", "score"}`` dicts.

    The score is stored as a string.
    """
    return [{"text": pair[0], "score": str(pair[1])} for pair in kewords_list]


def _post_bulk(batch):
    """POST *batch* as JSON to ``url`` and print the response body."""
    payload = json.dumps(batch, cls=JSONEncoder)
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)
    return response


bulk_data = []
bulk_count = 1
count = 0
text = ""
new_json_data = []
aikeis = []

for i, line in enumerate(lines):
    count += 1
    tokenNumber = tokenNumber + 1
    content = line["content"]
    result_objects = []

    if line["llama3_org_keywords"]:
        llam_kw = convert_to_obj(line["llama3_org_keywords"])
        result_objects.append(
            {
                "task": "keyword",
                "key": "llama3_keywords_noprompt",
                "label": "لاما - بدون پرامپت",
                "values": llam_kw,
            }
        )

    if line["ai_keywords"]:
        llam_prompt_kw = line["ai_keywords"]
        values = extract_keywords(llam_prompt_kw)
        if not values:
            print(f"missing keywords; line: {i}")
        result_objects.append(
            {
                "task": "keyword",
                "key": "llama3_keywords_prompt",
                "label": "لاما - با پرامپت",
                "values": values,
            }
        )

    if line["bert_ft_2_keywords"]:
        trained_bert_kw = convert_to_obj(line["bert_ft_2_keywords"])
        result_objects.append(
            {
                "task": "keyword",
                "key": "bert_ft_2_keywords",
                "label": "برت آموزش دیده",
                "values": trained_bert_kw,
            }
        )

    if line["bert_ft_org_keywords"]:
        base_bert_kw = convert_to_obj(line["bert_ft_org_keywords"])
        result_objects.append(
            {
                "task": "keyword",
                "key": "bert_ft_org_keywords",
                "label": "برت اصلی",
                "values": base_bert_kw,
            }
        )

    record_id = "ai_kw_" + str(count).zfill(5)
    new_json_data.append({"id": record_id, "content": content})
    data = createIndex(record_id, content, result_objects)

    # Reset per-record scratch state.
    tokenNumber = -1
    content = ""
    result_token = []
    result_objects = []

    bulk_data.append(data)
    bulk_count += 1

    # Flush once the batch holds more than 10 records.
    if len(bulk_data) > 10:
        print("=" * 30)
        print(f"count {count}")
        _post_bulk(bulk_data)
        bulk_data = []
        bulk_count = 1

# Send whatever is left over from the last, partially filled batch.
if bulk_data:
    print(bulk_count)
    _post_bulk(bulk_data)

# os.path.join keeps the output path valid on every platform; hand-written
# backslashes would become part of the file name outside Windows.
address2 = _data_path("sample_sections.json")
write_to_json(new_json_data, address2)

print("******* insert operation finished! *******")