Send the 15,000-record keyword set to server 160
This commit is contained in:
parent 8c283df51d
commit 81c395d364
182
import_data/ai_keyword_insert_02.py
Normal file
@@ -0,0 +1,182 @@
import json
import os
from decimal import Decimal

import requests

from funcs import read_from_json, write_to_json

TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjcwMjA3MjQsImp0aSI6IlFsQUIxVmdjT0M5eTFGbkNnbkRHbU45a3VHYXVtNThaIiwiaXNzIjpudWxsLCJleHAiOjE3MjgzMjA3MjMsImF1ZCI6bnVsbCwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.qhTF5dXo3zYJnMdlXmH4S8e6FmWjobrp2TabM19bLr4"
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

# url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
url = "http://192.168.23.160:8080/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS

# Resolve the path to the input JSON file relative to the current working directory.
address = os.getcwd()
if "import_data" in address:
    address += "/data/clean_sections_kw_15k.json"
else:
    address += "/import_data/data/clean_sections_kw_15k.json"

# Open the .json file.
lines = read_from_json(address)

# Convert the text into a list of lines.
# lines = input_text.strip().split('\n')

key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes Decimal values as plain floats."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return json.JSONEncoder.default(self, obj)


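# Build one document in the format expected by the rgsection/keyword bulk-add endpoint.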
def createIndex(id, content, result_objects):
    # result_objects = [
    #     {
    #         "task": "keyword",
    #         "key": "base_bert",
    #         "label": "برت اصلی",
    #         "values": result_token,
    #     },
    #     {
    #         "task": "keyword",
    #         "key": "trained_bert",
    #         "label": "برت آموزش دیده",
    #         "values": result_token,
    #     },
    #     {"task": "keyword", "key": "llama3", "label": "لاما 3", "values": result_token},
    # ]
    output = {
        "id": id,
        "content": content,
        "domain": "استخراج کلیدواژه 15 هزارتایی",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
    # print(output)
    # print(json.dumps(output, indent=4, ensure_ascii=False))
    return output


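# Append the currently open keyword span to result_token and reset the span trackers.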
def appendResultToken(text, key, begin, tokenNumber, result_token):
    end = -1

    if key:
        end = tokenNumber - 1
        result_token.append(
            {"begin": begin, "end": end, "result_key": key, "text": text}
        )
    begin = -1
    end = -1
    key = ""
    return key, begin, end, result_token


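# Parse the raw keyword block returned by the LLM into a list of {"text": ...} items
# (not called in this script; the precomputed "keywords" field is used instead).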
def extract_keywords(keywords_text):
    # Keep only the part of the text that contains the keywords.
    temp_text = keywords_text.split(":\n")[1]
    temp_text = temp_text.lstrip("\n")
    keywords = temp_text.split("\n")
    final_keywords = []
    for index, key in enumerate(keywords):
        if key == '':
            continue
        # Handles the case where extra explanations follow the keyword list.
        if keywords[index - 1] == "":
            break
        first_character = key[0]
        if first_character.isdigit():
            if key[1].isdigit():
                # If the keyword starts with a two-digit number, take the keyword from the fourth character onward.
                final_keywords.append({"text": (key[3:]).strip()})
                continue
            final_keywords.append({"text": (key[2:]).strip()})

    return final_keywords


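# Convert (text, score) pairs into {"text", "score"} objects (also not called in this script).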
def convert_to_obj(keywords_list):
    keywords_obj_list = []
    for key in keywords_list:
        keywords_obj_list.append({
            "text": key[0],
            "score": str(key[1]),
        })
    return keywords_obj_list


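# Main loop: wrap each section's content and its LLaMA-3 keyword list into a document of the form
#   {"id": "ai_kw_00001", "content": ..., "domain": ..., "ref_id": "", "ref_url": "",
#    "result_objects": [{"task": "keyword", "key": "llama3_keywords_prompt", "label": ..., "values": [...]}]}
# and POST the documents to server 160 in batches.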
bulk_data = []
bulk_count = 1
count = 0
text = ""
new_json_data = []
aikeis = []
for i, line in enumerate(lines):
    # print("line: " + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    content = line["content"]
    result_objects = []
    llam_prompt_kws = line["keywords"]
    # values = extract_keywords(llam_prompt_kw)
    values = llam_prompt_kws
    result_objects.append(
        {
            "task": "keyword",
            "key": "llama3_keywords_prompt",
            "label": "لاما3 - با پرامپت",
            "values": values,
        })

    id = "ai_kw_" + str(count).zfill(5)

    # new_json_data.append({
    #     "id": id,
    #     "content": content
    # })

    data = createIndex(id, content, result_objects)
    tokenNumber = -1
    content = ""
    result_token = []
    result_objects = []

    bulk_data.append(data)

    bulk_count += 1
    # Send the accumulated documents once the batch grows past 10 items.
    if len(bulk_data) > 10:
        print("=" * 30)
        print("count " + str(count))
        payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
        response = requests.request("POST", url, headers=headers, data=payload)
        print(response.text)
        bulk_data = []
        bulk_count = 1


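# Flush any remaining documents that did not fill a complete batch.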
if len(bulk_data) > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)
# print(len(aikeis))

# address2 = os.getcwd()
# if "impoert_data" in address2:
#     address2 += "\\data\\sample_sections.json"
# else:
#     address2 += "\\impoert_data\\data\\sample_sections.json"
# write_to_json(new_json_data, address2)

# Display the output dictionary as JSON.
print("******* insert operation finished! *******")
214114
import_data/data/clean_sections_kw_15k.json
Normal file
File diff suppressed because one or more lines are too long