From 6f7b04efdc39dce0e208d0c887938edfc4aa1e8c Mon Sep 17 00:00:00 2001
From: ajokar
Date: Tue, 17 Sep 2024 16:44:08 +0000
Subject: [PATCH] Upload files to "import_data"

---
 import_data/ai_keyword_insert.py | 219 +++++++++++++++++++++++++++++++
 import_data/ai_keyword_update.py | 172 ++++++++++++++++++++++++
 import_data/arman_dataset.py     | 175 ++++++++++++++++++++++++
 import_data/clean_tags.py        |  20 +++
 import_data/config_base.py       |  42 ++++++
 5 files changed, 628 insertions(+)
 create mode 100644 import_data/ai_keyword_insert.py
 create mode 100644 import_data/ai_keyword_update.py
 create mode 100644 import_data/arman_dataset.py
 create mode 100644 import_data/clean_tags.py
 create mode 100644 import_data/config_base.py

diff --git a/import_data/ai_keyword_insert.py b/import_data/ai_keyword_insert.py
new file mode 100644
index 0000000..5b0913d
--- /dev/null
+++ b/import_data/ai_keyword_insert.py
@@ -0,0 +1,219 @@
+import json
+import requests
+from decimal import Decimal
+import os
+from funcs import read_from_json, write_to_json
+
+TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTk5MzI2MDUsImp0aSI6IkZcLzRQcnRYckdnbDZ5bFg0QVUzdnVzRFlMZ2ZHXC93NnUiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIxMjMyNjA0LCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.f8y7Qce6u88atEtuyGI-cS3eqcupH5-Ow1ihfQTkV98"
+ACCEPT = "application/json"
+HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
+
+url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
+headers = HEADERS
+
+address = os.getcwd()
+if "import_data" in address:
+    address += "/data/section_keywords_0413.json"
+else:
+    address += "/import_data/data/section_keywords_0413.json"
+
+# open .json file
+lines = read_from_json(address)
+
+# split the text into a list of lines
+# lines = input_text.strip().split('\n')
+
+key = ""
+begin = -1
+end = -1
+tokenNumber = -1
+content = ""
+result_token = []
+
+
+class JSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Decimal):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+def createIndex(id, content, result_objects):
+    # result_objects = [
+    #     {
+    #         "task": "keyword",
+    #         "key": "base_bert",
+    #         "label": "برت اصلی",
+    #         "values": result_token,
+    #     },
+    #     {
+    #         "task": "keyword",
+    #         "key": "trained_bert",
+    #         "label": "برت آموزش دیده",
+    #         "values": result_token,
+    #     },
+    #     {"task": "keyword", "key": "llama3", "label": "لاما 3", "values": result_token},
+    # ]
+    output = {
+        "id": id,
+        "content": content,
+        "domain": "استخراج کلیدواژه",
+        "ref_id": "",
+        "ref_url": "",
+        "result_objects": result_objects,
+    }
+    # print(output)
+    # print(json.dumps(output, indent=4, ensure_ascii=False))
+    return output
+
+
+def appendResultToken(text, key, begin, tokenNumber, result_token):
+    end = -1
+
+    if key:
+
+        end = tokenNumber - 1
+        result_token.append(
+            {"begin": begin, "end": end, "result_key": key, "text": text}
+        )
+        begin = -1
+        end = -1
+        key = ""
+    return key, begin, end, result_token
+
+def extract_keywords(keywords_text):
+    # isolate the part of the text that contains the keywords
+    temp_text = keywords_text.split(":\n")[1]
+    temp_text = temp_text.lstrip("\n")
+    keywords = temp_text.split("\n")
+    final_keywords = []
+    for index, key in enumerate(keywords):
+        if key == '':
+            continue
+        # this handles the case where extra explanations appear after the keyword list
+        if index > 0 and keywords[index - 1] == "":
+            break
+        first_character = key[0]
+        if first_character.isdigit():
+            if key[1].isdigit():
+                # if the keyword starts with a two-digit number, take everything from the fourth character onward as the keyword
+                final_keywords.append({"text": (key[3:]).strip()})
+                continue
+            final_keywords.append({"text": (key[2:]).strip()})
+
+    return final_keywords
+
+def convert_to_obj(keywords_list):
+    keywords_obj_list = []
+    for key in keywords_list:
+        keywords_obj_list.append({
+            "text": key[0],
+            "score": str(key[1]),
+        })
+    return keywords_obj_list
+
+bulk_data = []
+bulk_count = 1
+count = 0
+text = ""
+new_json_data = []
+aikeis = []
+for i, line in enumerate(lines):
+    # print("line: " + str(i))
+    count += 1
+    tokenNumber = tokenNumber + 1
+
+    content = line["content"]
+    result_objects = []
+
+
+    if line["llama3_org_keywords"]:
+        llam_kw = convert_to_obj(line["llama3_org_keywords"])
+
+        result_objects.append(
+            {
+                "task": "keyword",
+                "key": "llama3_keywords_noprompt",
+                "label": "لاما - بدون پرامپت",
+                "values": llam_kw,
+            })
+
+    if line["ai_keywords"]:
+        llam_prompt_kw = line["ai_keywords"]
+
+        values = extract_keywords(llam_prompt_kw)
+        # aikeis.append(values)
+        if len(values) == 0:
+            print("missing keywords; line: " + str(i))
+        result_objects.append(
+            {
+                "task": "keyword",
+                "key": "llama3_keywords_prompt",
+                "label": "لاما - با پرامپت",
+                "values": values,
+            })
+
+
+    if line["bert_ft_2_keywords"]:
+        trained_bert_kw = convert_to_obj(line["bert_ft_2_keywords"])
+
+        result_objects.append(
+            {
+                "task": "keyword",
+                "key": "bert_ft_2_keywords",
+                "label": "برت آموزش دیده",
+                "values": trained_bert_kw,
+            }
+        )
+    if line["bert_ft_org_keywords"]:
+        base_bert_kw = convert_to_obj(line["bert_ft_org_keywords"])
+        result_objects.append(
+            {
+                "task": "keyword",
+                "key": "bert_ft_org_keywords",
+                "label": "برت اصلی",
+                "values": base_bert_kw,
+            })
+
+    id = "ai_kw_" + str(count).zfill(5)
+
+    new_json_data.append({
+        "id": id,
+        "content": content
+    })
+
+    data = createIndex(id, content, result_objects)
+    tokenNumber = -1
+    content = ""
+    result_token = []
+    result_objects = []
+
+    bulk_data.append(data)
+
+    bulk_count += 1
+    if len(bulk_data) > 10:
+        print("=" * 30)
+        print("count " + str(count))
+        payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
+        response = requests.request("POST", url, headers=headers, data=payload)
+        print(response.text)
+        bulk_data = []
+        bulk_count = 1
+
+
+if len(bulk_data) > 0:
+    print(bulk_count)
+    payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
+    response = requests.request("POST", url, headers=headers, data=payload)
+    print(response.text)
+# print(len(aikeis))
+
+address2 = os.getcwd()
+if "import_data" in address2:
+    address2 += "\\data\\sample_sections.json"
+else:
+    address2 += "\\import_data\\data\\sample_sections.json"
+write_to_json(new_json_data, address2)
+
+# print the output dictionary as JSON
+print("******* insert operation finished! *******")
diff --git a/import_data/ai_keyword_update.py b/import_data/ai_keyword_update.py
new file mode 100644
index 0000000..c70eab2
--- /dev/null
+++ b/import_data/ai_keyword_update.py
@@ -0,0 +1,172 @@
+import json
+import requests
+from decimal import Decimal
+import os
+from funcs import read_from_json
+
+TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTk5MzI2MDUsImp0aSI6IkZcLzRQcnRYckdnbDZ5bFg0QVUzdnVzRFlMZ2ZHXC93NnUiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIxMjMyNjA0LCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.f8y7Qce6u88atEtuyGI-cS3eqcupH5-Ow1ihfQTkV98"
+ACCEPT = "application/json"
+HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
+
+url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
+headers = HEADERS
+
+address = os.getcwd()
+if "import_data" in address:
+    address += "/data/section_keywords_0413.json"
+else:
+    address += "/import_data/data/section_keywords_0413.json"
+
+# open .json file
+lines = read_from_json(address)
+
+key = ""
+begin = -1
+end = -1
+tokenNumber = -1
+content = ""
+result_token = []
+
+
+class JSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Decimal):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+def createIndex(content, result_objects):
+    # result_objects = [
+    #     {
+    #         "task": "keyword",
+    #         "key": "base_bert",
+    #         "label": "برت اصلی",
+    #         "values": result_token,
+    #     },
+    #     {
+    #         "task": "keyword",
+    #         "key": "trained_bert",
+    #         "label": "برت آموزش دیده",
+    #         "values": result_token,
+    #     },
+    #     {"task": "keyword", "key": "llama3", "label": "لاما 3", "values": result_token},
+    # ]
+    output = {
+        "content": content,
+        "domain": "استخراج کلیدواژه",
+        "ref_id": "",
+        "ref_url": "",
+        "result_objects": result_objects,
+    }
+    # print(output)
+    # print(json.dumps(output, indent=4, ensure_ascii=False))
+    return output
+
+
+def appendResultToken(text, key, begin, tokenNumber, result_token):
+    end = -1
+
+    if key:
+
+        end = tokenNumber - 1
+        result_token.append(
+            {"begin": begin, "end": end, "result_key": key, "text": text}
+        )
+        begin = -1
+        end = -1
+        key = ""
+    return key, begin, end, result_token
+
+def extract_keywords(keywords_text):
+    # isolate the part of the text that contains the keywords
+    temp_text = keywords_text.split(":\n")[1]
+    temp_text = temp_text.lstrip("\n")
+    keywords = temp_text.split("\n")
+    final_keywords = []
+    for index, key in enumerate(keywords):
+        if key == '':
+            continue
+        # this handles the case where extra explanations appear after the keyword list
+        if index > 0 and keywords[index - 1] == "":
+            break
+        first_character = key[0]
+        if first_character.isdigit():
+            if key[1].isdigit():
+                # if the keyword starts with a two-digit number, take everything from the fourth character onward as the keyword
+                final_keywords.append({"text": (key[3:]).strip()})
+                continue
+            final_keywords.append({"text": (key[2:]).strip()})
+
+    return final_keywords
+
+def convert_to_obj(keywords_list):
+    keywords_obj_list = []
+    for key in keywords_list:
+        keywords_obj_list.append({
+            "text": key[0],
+            "score": str(key[1]),
+        })
+    return keywords_obj_list
+
+bulk_data = []
+bulk_count = 1
+count = 0
+text = ""
+
+aikeis = []
+for i, line in enumerate(lines):
+    # print("line: " + str(i))
+    count += 1
+    tokenNumber = tokenNumber + 1
+
+    content = line["content"]
+    result_objects = []
+
+
+    if line["llama3_org_keywords"]:
+        llam_kw = convert_to_obj(line["llama3_org_keywords"])
+
+        result_objects.append(
+            {
+                "task": "keyword",
+                "key": "llama3_keywords_noprompt2",
+                "label": "لاما - بدون پرامپت 2",
+                "values": llam_kw,
+            })
+
+    data = createIndex(content, result_objects)
+    tokenNumber = -1
+    content = ""
+    result_token = []
+    result_objects = []
+
+    bulk_data.append(data)
+    bulk_count += 1
+    if len(bulk_data) > 10:
+        print("=" * 30)
+        print("count " + str(count))
+        payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
+        response = requests.request("POST", url, headers=headers, data=payload)
+        print(response.text)
+        bulk_data = []
+        bulk_count = 1
+
+
+
+if content != "":
+    # key, begin, end, result_token = appendResultToken(
+    #     text, key, begin, tokenNumber, result_token
+    # )
+    data = createIndex(content, result_token)
+    bulk_data.append(data)
+    bulk_count += 1
+
+
+if len(bulk_data) > 0:
+    print(bulk_count)
+    payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
+    response = requests.request("POST", url, headers=headers, data=payload)
+    print(response.text)
+
+# print the output dictionary as JSON
+print("******* update operation finished! *******")
diff --git a/import_data/arman_dataset.py b/import_data/arman_dataset.py
new file mode 100644
index 0000000..81af79f
--- /dev/null
+++ b/import_data/arman_dataset.py
@@ -0,0 +1,175 @@
+import json
+import requests
+from decimal import Decimal
+import os
+
+TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc'
+ACCEPT = "application/json"
+HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
+
+url = "https://api.tavasi.ir/repo/dataset/multi/add/arman/ner"
+headers = HEADERS
+
+address = os.getcwd()
+if 'import_data' in address:
+    address += '/data/arman_train.txt'
+else:
+    address += '/import_data/data/arman_train.txt'
+
+# open the text file
+with open(address, 'r', encoding='utf-8') as file:
+    input_text = file.read()
+
+# split the text into a list of lines
+lines = input_text.strip().split('\n')
+
+key = ''
+begin = -1
+end = -1
+tokenNumber = -1
+content = ''
+result_token = []
+
+class JSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Decimal):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+
+# def createIndex(content, result_token):
+#     output = {
+#         "content": content,
+#         "domain": "پیکره آرمان",
+#         "ref_id": "",
+#         "ref_url": "",
+#         "result_token": result_token,
+#     }
+#     # print(output)
+#     # print(json.dumps(output, indent=4, ensure_ascii=False))
+#     return output
+def createIndex(content, result_token):
+    result_objects = [{
+        "task": "ner",
+        "key": "arman_ner",
+        "label": "خروجی تیم آرمان",
+        "values": result_token
+    }]
+    output = {
+        "content": content,
+        "domain": "پیکره آرمان",
+        "ref_id": "",
+        "ref_url": "",
+        "result_objects": result_objects,
+    }
+    # print(output)
+    # print(json.dumps(output, indent=4, ensure_ascii=False))
+    return output
+
+
+
+def appendResultToken(text, key, begin, tokenNumber, result_token):
+    end = -1
+    # if key == 'HALFREFERENCE' :
+    #     key = 'H_REF'
+    # elif key == 'REFERENCE' :
+    #     key = 'REF'
+    if key:
+        if key == 'org':
+            key = 'ORG'
+        elif key == 'loc':
+            key = 'LOC'
+        elif key == 'Facility':
+            key = 'fac'
+        elif key == 'event':
+            key = 'EVENT'
+        elif key == 'pro':
+            key = 'PRO'
+        elif key == 'pers':
+            key = 'PER'
+
+        end = tokenNumber - 1
+        result_token.append({
+            "begin": begin,
+            "end": end,
+            "result_key": key,
+            "text": text
+        })
+        begin = -1
+        end = -1
+        key = ''
+    return key, begin, end, result_token
+
+bulk_data = []
+bulk_count = 1
+count = 0
+text = ''
+
+for i, line in enumerate(lines):
+    print('line: ' + str(i))
+    count += 1
+    tokenNumber = tokenNumber + 1
+
+    if line.strip() == '':
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+        data = createIndex(content, result_token)
+        tokenNumber = -1
+        content = ''
+        result_token = []
+
+        bulk_data.append(data)
+        bulk_count += 1
+        if len(bulk_data) > 100:
+            print('=' * 30)
+            print('count ' + str(count))
+            payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
+            response = requests.request("POST", url, headers=headers, data=payload)
+            print(response)
+            bulk_data = []
+            bulk_count = 1
+
+        continue
+
+
+
+    parts = line.split()
+    if len(parts) != 2:
+        continue
+
+
+    content += ' ' + parts[0]
+
+    result_key = parts[1]
+    if result_key.startswith('I-'):
+        text += ' ' + parts[0]
+        continue
+
+    if result_key == 'O':
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+    if result_key.startswith('B-'):
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+        text = parts[0]
+        begin = tokenNumber
+        end = -1
+        key = result_key.replace('B-', '')
+
+
+if content != '':
+    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+    data = createIndex(content, result_token)
+    bulk_data.append(data)
+    bulk_count += 1
+
+
+if len(bulk_data) > 0:
+    print(bulk_count)
+    payload = json.dumps(bulk_data, cls=JSONEncoder)  # Works!
+    response = requests.request("POST", url, headers=headers, data=payload)
+    print(response.text)
+
+# print the output dictionary as JSON
+print("***************** end ")
\ No newline at end of file
diff --git a/import_data/clean_tags.py b/import_data/clean_tags.py
new file mode 100644
index 0000000..2c5cdc1
--- /dev/null
+++ b/import_data/clean_tags.py
@@ -0,0 +1,20 @@
+import re
+
+def clean_text_file(input_file, output_file):
+    with open(input_file, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+
+    combined_text = ' '.join([line.strip() for line in lines])
+
+    cleaned_text = re.sub(r'\b(B-ORG|I-ORG|O|B-HALFREFERENCE|I-HALFREFERENCE|B-REFERENCE|I-REFERENCE|B-DATE2|I-DATE2)\b', '', combined_text)
+    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+
+    with open(output_file, 'w', encoding='utf-8') as file:
+        file.write(cleaned_text)
+
+input_path = 'test_dataset.txt'
+output_path = 'output2.txt'
+
+clean_text_file(input_path, output_path)
+
+print("محتوای فایل تمیز شد و در فایل جدید ذخیره گردید.")
diff --git a/import_data/config_base.py b/import_data/config_base.py
new file mode 100644
index 0000000..1cc38f9
--- /dev/null
+++ b/import_data/config_base.py
@@ -0,0 +1,42 @@
+import pyodbc
+
+# region constant values
+# This token is very important: it expires every so often,
+# so a fresh, valid token must be obtained from the Qavanin site while logged in
+# and substituted for the token below
+TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjA1MDUwMzAsImp0aSI6ImxPYVpGUWRhZXlSVkVUY3NNeUp2bUNTNXhHbmJZSTBHIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcyMTgwNTAyOSwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjEsImZpcnN0X25hbWUiOiJcdTA2MjhcdTA2MzFcdTA2NDZcdTA2MjdcdTA2NDVcdTA2NDcgXHUwNjQ2XHUwNjQ4XHUwNmNjXHUwNjMzIiwibGFzdF9uYW1lIjoiXHUwNjQxXHUwNjQ2XHUwNmNjIiwiZW1haWwiOiJkZXZAZ21haWwuY29tIiwidXNlcm5hbWUiOiJkZXYiLCJ1c2VyX2xldmVsIjoyfX0.PBIThW_3mG29ZPHiYH3wfwbBLN5jkG_q1qv6IFkwYz8'
+ACCEPT = "application/json"
+HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
+# endregion
+
+def create_cursor(dbname=''):
+    if dbname == '':
+        dbname = "Qavanin"
+
+    # server = "DESKTOP-ENVCI85"
+    # database = dbname
+    # username = "Mofid3"
+    # password = "12345"
+
+    # ******** important *********
+    # uncomment for jokar pc
+    server = "MOFID5"
+    database = dbname
+    username = "Mofid5"
+    password = "12345"
+
+    # ENCRYPT defaults to yes starting in ODBC Driver 18 for SQL Server. It's good to always specify ENCRYPT=yes on the client side to avoid MITM attacks.
+    cnxn = pyodbc.connect(
+        "DRIVER={SQL Server};SERVER="
+        + server
+        + ";DATABASE="
+        + database
+        + ";Trusted_Connection=yes;UID="
+        + username
+        + ";PWD="
+        + password
+    )
+    # cnxn = pyodbc.connect('DSN=sqldsn;')
+    cursor = cnxn.cursor()
+    return cursor, cnxn
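
Note: ai_keyword_insert.py and ai_keyword_update.py import read_from_json and
write_to_json from a funcs module that is not included in this patch. A minimal
sketch of what those helpers are assumed to do (plain UTF-8 JSON read/write;
the actual funcs.py may differ):

    import json

    def read_from_json(path):
        # read a UTF-8 JSON file and return the parsed object
        # (here: a list of section dicts with "content" and keyword fields)
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def write_to_json(data, path):
        # write data to path as readable UTF-8 JSON, keeping Persian text unescaped
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)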