Upload files to "import_data"
This commit is contained in:
commit
6f7b04efdc
219
import_data/ai_keyword_insert.py
Normal file
219
import_data/ai_keyword_insert.py
Normal file
|
@ -0,0 +1,219 @@
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from decimal import Decimal
|
||||||
|
import os
|
||||||
|
from funcs import read_from_json, write_to_json
|
||||||
|
|
||||||
|
# WARNING(review): hard-coded JWT bearer token committed to source control.
# It expires periodically and must be replaced with a fresh token obtained
# from the site while logged in; consider loading it from an env var instead.
TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTk5MzI2MDUsImp0aSI6IkZcLzRQcnRYckdnbDZ5bFg0QVUzdnVzRFlMZ2ZHXC93NnUiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIxMjMyNjA0LCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.f8y7Qce6u88atEtuyGI-cS3eqcupH5-Ow1ihfQTkV98"
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

# Bulk-add endpoint for keyword results on "rgsection" documents.
url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS

# Locate the input JSON relative to the current working directory.
# NOTE(review): the directory name tested here is "impoert_data" (sic),
# while the repo directory appears to be spelled "import_data" — confirm
# which spelling actually exists on disk; if it is "import_data", the
# first branch never fires.
address = os.getcwd()
if "impoert_data" in address:
    address += "/data/section_keywords_0413.json"
else:
    address += "/impoert_data/data/section_keywords_0413.json"

# open .json file
lines = read_from_json(address)

# Convert the text into a list of lines (translated from Persian).
# lines = input_text.strip().split('\n')

# Shared parser state; mirrors arman_dataset.py. Mostly vestigial here —
# only tokenNumber/content/result_token are touched by this script.
key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []
|
||||||
|
|
||||||
|
|
||||||
|
class JSONEncoder(json.JSONEncoder):
    """json.JSONEncoder that serializes Decimal values as floats."""

    def default(self, obj):
        # Decimal scores coming from upstream data are not JSON-serializable
        # by default; emit them as plain floats.
        if isinstance(obj, Decimal):
            return float(obj)
        # Anything else: defer to the base implementation (raises TypeError).
        return super().default(obj)
|
||||||
|
|
||||||
|
|
||||||
|
def createIndex(id, content, result_objects):
    """Build one dataset document for the bulk-insert payload.

    Improvements over the original: the large block of commented-out
    example data was removed and the dict is returned directly.

    Args:
        id: document identifier, e.g. "ai_kw_00001". NOTE: the parameter
            name shadows the ``id`` builtin; kept for backward
            compatibility with existing callers.
        content: section text the keywords were extracted from.
        result_objects: list of {"task", "key", "label", "values"} dicts,
            one entry per keyword-extraction model.

    Returns:
        dict ready to be JSON-serialized and POSTed to the dataset API.
    """
    return {
        "id": id,
        "content": content,
        "domain": "استخراج کلیدواژه",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def appendResultToken(text, key, begin, tokenNumber, result_token):
    # Flush the entity currently being accumulated (if any) into
    # result_token, then hand reset state back to the caller.
    #
    # text:         surface form of the entity collected so far
    # key:          entity label; falsy means "no open entity" -> append nothing
    # begin:        token index where the entity started
    # tokenNumber:  index of the *current* token; the entity ends one before it
    # result_token: list of {"begin","end","result_key","text"} dicts (mutated)
    #
    # Returns (key, begin, end, result_token) with the accumulator cleared.
    # NOTE(review): this helper is never called in this script; it mirrors
    # the one in arman_dataset.py. Original indentation was lost in the
    # diff view — the trailing resets are assumed unconditional; confirm
    # against the real file (behavior is equivalent for all call sites).
    end = -1

    if key:
        # The entity ends at the token before the current one.
        end = tokenNumber - 1
        result_token.append(
            {"begin": begin, "end": end, "result_key": key, "text": text}
        )
    # Clear accumulator state for the next entity.
    begin = -1
    end = -1
    key = ""
    return key, begin, end, result_token
|
||||||
|
|
||||||
|
def extract_keywords(keywords_text):
    """Parse a model's free-text keyword answer into [{"text": kw}, ...].

    Expected input shape: "<preamble>:\n1. kw\n2. kw\n...\n10. kw\n"
    optionally followed by a blank line and explanatory prose (which is
    ignored). Lines not starting with a digit are skipped.

    Bug fixes vs. the original:
      * missing ":\n" separator used to raise IndexError -> now returns []
      * a 1-character line used to raise IndexError on key[1]
      * at index 0 the look-behind read keywords[-1] (the LAST line), which
        could abort the whole list when the text ended with a newline

    Returns:
        list of {"text": keyword} dicts (possibly empty).
    """
    # Split off the part of the text that contains the keyword list.
    parts = keywords_text.split(":\n")
    if len(parts) < 2:
        # Malformed answer with no "...:\n" header at all.
        return []
    temp_text = parts[1].lstrip("\n")
    keywords = temp_text.split("\n")
    final_keywords = []
    for index, key in enumerate(keywords):
        if key == '':
            continue
        # A blank line ends the list: anything after it is extra
        # explanation appended by the model, not a keyword
        # (translated from the Persian comment).
        if index > 0 and keywords[index - 1] == "":
            break
        first_character = key[0]
        if first_character.isdigit():
            if len(key) > 1 and key[1].isdigit():
                # Two-digit prefix ("10. kw"): keyword starts after "NN. "
                # (translated from the Persian comment).
                final_keywords.append({"text": (key[3:]).strip()})
                continue
            # One-digit prefix ("1. kw"): keyword starts after "N."
            final_keywords.append({"text": (key[2:]).strip()})

    return final_keywords
|
||||||
|
|
||||||
|
def convert_to_obj(kewords_list):
    """Convert (text, score) keyword pairs into API-ready dicts.

    Each element of *kewords_list* is an indexable pair: item 0 is the
    keyword text, item 1 its score (stringified for the JSON payload).
    """
    return [{"text": pair[0], "score": str(pair[1])} for pair in kewords_list]
|
||||||
|
|
||||||
|
# --- bulk upload loop --------------------------------------------------
# Builds one API document per input record and POSTs them to the dataset
# endpoint in batches of ~10.
bulk_data = []      # documents waiting to be POSTed
bulk_count = 1      # size counter for the current batch (informational)
count = 0           # total records processed; also used to build doc ids
text = ""           # unused here (leftover from the NER variant)
new_json_data = []  # (id, content) pairs written to sample_sections.json
aikeis = []         # debug collector (unused; see commented line below)
for i, line in enumerate(lines):
    #print("line: " + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    # NOTE(review): assumes each record has "content" plus the four
    # keyword fields used below — confirm against the input JSON schema.
    content = line["content"]
    result_objects = []

    # Keywords from the base llama3 model (no prompt engineering).
    if line["llama3_org_keywords"]:
        llam_kw = convert_to_obj(line["llama3_org_keywords"])

        result_objects.append(
            {
                "task": "keyword",
                "key": "llama3_keywords_noprompt",
                "label": "لاما - بدون پرامپت",  # "llama - without prompt"
                "values": llam_kw,
            })

    # Keywords from the prompted model: a free-text answer to be parsed.
    if line["ai_keywords"]:
        llam_prompt_kw = line["ai_keywords"]

        values = extract_keywords(llam_prompt_kw)
        #aikeis.append(values)
        if len(values) == 0:
            print("missing keywords; line: " + str(i))
        result_objects.append(
            {
                "task": "keyword",
                "key": "llama3_keywords_prompt",
                "label": "لاما - با پرامپت",  # "llama - with prompt"
                "values": values,
            })

    # Keywords from the fine-tuned BERT model.
    if line["bert_ft_2_keywords"]:
        trained_bert_kw = convert_to_obj(line["bert_ft_2_keywords"])

        result_objects.append(
            {
                "task": "keyword",
                "key": "bert_ft_2_keywords",
                "label": "برت آموزش دیده",  # "trained BERT"
                "values": trained_bert_kw,
            }
        )
    # Keywords from the original (base) BERT model.
    if line["bert_ft_org_keywords"]:
        base_bert_kw = convert_to_obj(line["bert_ft_org_keywords"])
        result_objects.append(
            {
                "task": "keyword",
                "key": "bert_ft_org_keywords",
                "label": "برت اصلی",  # "base BERT"
                "values": base_bert_kw,
            })

    # Sequential zero-padded document id (NOTE: shadows the id() builtin).
    id = "ai_kw_"+ str(count).zfill(5)

    new_json_data.append({
        "id": id,
        "content": content
    })

    data = createIndex(id, content, result_objects)
    # Reset shared parser state (mostly vestigial in this script).
    tokenNumber = -1
    content = ""
    result_token = []
    result_objects = []

    bulk_data.append(data)

    bulk_count += 1
    # Flush a batch once it exceeds 10 documents.
    if bulk_data.__len__() > 10:
        print("=" * 30)
        print("count " + str(count))
        payload = json.dumps(bulk_data, cls=JSONEncoder) # Works!
        response = requests.request("POST", url, headers=headers, data=payload)
        print(response.text)
        bulk_data = []
        bulk_count = 1


# POST whatever is left over after the loop.
if bulk_data.__len__() > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder) # Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)
# print(len(aikeis))

# Write the (id, content) pairs next to the input data.
# NOTE(review): backslash separators are Windows-only, unlike the forward
# slashes used for the input path above — confirm the target platform.
address2 = os.getcwd()
if "impoert_data" in address2:
    address2 += "\\data\\sample_sections.json"
else:
    address2 += "\\impoert_data\\data\\sample_sections.json"
write_to_json(new_json_data, address2)

# Display the output dictionary as JSON (translated from Persian).
print("******* insert operation finished! *******")
|
172
import_data/ai_keyword_update.py
Normal file
172
import_data/ai_keyword_update.py
Normal file
|
@ -0,0 +1,172 @@
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from decimal import Decimal
|
||||||
|
import os
|
||||||
|
from funcs import read_from_json
|
||||||
|
|
||||||
|
# WARNING(review): hard-coded JWT bearer token committed to source control
# (same token as ai_keyword_insert.py). It expires periodically; consider
# loading it from an env var or shared config instead of duplicating it.
TOKEN = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTk5MzI2MDUsImp0aSI6IkZcLzRQcnRYckdnbDZ5bFg0QVUzdnVzRFlMZ2ZHXC93NnUiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIxMjMyNjA0LCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6MSwiZmlyc3RfbmFtZSI6Ilx1MDYyOFx1MDYzMVx1MDY0Nlx1MDYyN1x1MDY0NVx1MDY0NyBcdTA2NDZcdTA2NDhcdTA2Y2NcdTA2MzMiLCJsYXN0X25hbWUiOiJcdTA2NDFcdTA2NDZcdTA2Y2MiLCJlbWFpbCI6ImRldkBnbWFpbC5jb20iLCJ1c2VybmFtZSI6ImRldiIsInVzZXJfbGV2ZWwiOjJ9fQ.f8y7Qce6u88atEtuyGI-cS3eqcupH5-Ow1ihfQTkV98"
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

# Bulk-add endpoint for keyword results on "rgsection" documents.
url = "https://api.tavasi.ir/repo/dataset/multi/add/rgsection/keyword"
headers = HEADERS

# Locate the input JSON relative to the current working directory.
# NOTE(review): tests for "impoert_data" (sic) while the repo directory
# appears to be "import_data" — confirm the on-disk spelling.
address = os.getcwd()
if "impoert_data" in address:
    address += "/data/section_keywords_0413.json"
else:
    address += "/impoert_data/data/section_keywords_0413.json"

# open .json file
lines = read_from_json(address)

# Shared parser state; mirrors arman_dataset.py and is mostly vestigial
# in this script.
key = ""
begin = -1
end = -1
tokenNumber = -1
content = ""
result_token = []
|
||||||
|
|
||||||
|
|
||||||
|
class JSONEncoder(json.JSONEncoder):
    """json.JSONEncoder that serializes Decimal values as floats."""

    def default(self, obj):
        # Decimal scores are not JSON-serializable by default; emit floats.
        if isinstance(obj, Decimal):
            return float(obj)
        # Defer everything else to the base implementation.
        return super().default(obj)
|
||||||
|
|
||||||
|
|
||||||
|
def createIndex(content, result_objects):
    """Build one dataset document for the bulk-update payload.

    Improvements over the original: the large block of commented-out
    example data was removed and the dict is returned directly.

    Args:
        content: section text the keywords were extracted from.
        result_objects: list of {"task", "key", "label", "values"} dicts,
            one entry per keyword-extraction model.

    Returns:
        dict ready to be JSON-serialized and POSTed to the dataset API.
        Unlike ai_keyword_insert.py, no "id" field is included here.
    """
    return {
        "content": content,
        "domain": "استخراج کلیدواژه",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def appendResultToken(text, key, begin, tokenNumber, result_token):
    # Flush the entity currently being accumulated (if any) into
    # result_token, then hand reset state back to the caller.
    #
    # text:         surface form of the entity collected so far
    # key:          entity label; falsy means "no open entity" -> append nothing
    # begin:        token index where the entity started
    # tokenNumber:  index of the *current* token; the entity ends one before it
    # result_token: list of {"begin","end","result_key","text"} dicts (mutated)
    #
    # Returns (key, begin, end, result_token) with the accumulator cleared.
    # NOTE(review): only referenced from commented-out code in this script.
    # Original indentation was lost in the diff view — the trailing resets
    # are assumed unconditional; confirm against the real file.
    end = -1

    if key:
        # The entity ends at the token before the current one.
        end = tokenNumber - 1
        result_token.append(
            {"begin": begin, "end": end, "result_key": key, "text": text}
        )
    # Clear accumulator state for the next entity.
    begin = -1
    end = -1
    key = ""
    return key, begin, end, result_token
|
||||||
|
|
||||||
|
def extract_keywords(keywords_text):
    """Parse a model's free-text keyword answer into [{"text": kw}, ...].

    Expected input shape: "<preamble>:\n1. kw\n2. kw\n...\n10. kw\n"
    optionally followed by a blank line and explanatory prose (ignored).
    Lines not starting with a digit are skipped.

    Bug fixes vs. the original:
      * missing ":\n" separator used to raise IndexError -> now returns []
      * a 1-character line used to raise IndexError on key[1]
      * at index 0 the look-behind read keywords[-1] (the LAST line), which
        could abort the whole list when the text ended with a newline

    Returns:
        list of {"text": keyword} dicts (possibly empty).
    """
    # Split off the part of the text that contains the keyword list.
    parts = keywords_text.split(":\n")
    if len(parts) < 2:
        # Malformed answer with no "...:\n" header at all.
        return []
    temp_text = parts[1].lstrip("\n")
    keywords = temp_text.split("\n")
    final_keywords = []
    for index, key in enumerate(keywords):
        if key == '':
            continue
        # A blank line ends the list: anything after it is extra
        # explanation appended by the model, not a keyword
        # (translated from the Persian comment).
        if index > 0 and keywords[index - 1] == "":
            break
        first_character = key[0]
        if first_character.isdigit():
            if len(key) > 1 and key[1].isdigit():
                # Two-digit prefix ("10. kw"): keyword starts after "NN. "
                # (translated from the Persian comment).
                final_keywords.append({"text": (key[3:]).strip()})
                continue
            # One-digit prefix ("1. kw"): keyword starts after "N."
            final_keywords.append({"text": (key[2:]).strip()})

    return final_keywords
|
||||||
|
|
||||||
|
def convert_to_obj(kewords_list):
    """Convert (text, score) keyword pairs into API-ready dicts.

    Each element of *kewords_list* is an indexable pair: item 0 is the
    keyword text, item 1 its score (stringified for the JSON payload).
    """
    return [{"text": pair[0], "score": str(pair[1])} for pair in kewords_list]
|
||||||
|
|
||||||
|
# --- bulk update loop --------------------------------------------------
# Builds one API document per input record (llama3 "no prompt 2" results
# only) and POSTs them to the dataset endpoint in batches of ~10.
bulk_data = []   # documents waiting to be POSTed
bulk_count = 1   # batch counter (informational)
count = 0        # total records processed
text = ""        # unused here (leftover from the NER variant)

aikeis = []      # unused debug collector
for i, line in enumerate(lines):
    #print("line: " + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    # NOTE(review): assumes each record carries "content" and
    # "llama3_org_keywords" — confirm against the input JSON schema.
    content = line["content"]
    result_objects = []

    # Keywords from the base llama3 model (second run, no prompt).
    if line["llama3_org_keywords"]:
        llam_kw = convert_to_obj(line["llama3_org_keywords"])

        result_objects.append(
            {
                "task": "keyword",
                "key": "llama3_keywords_noprompt2",
                "label": "لاما - بدون پرامپت 2",  # "llama - without prompt 2"
                "values": llam_kw,
            })

    data = createIndex(content, result_objects)
    # Reset shared parser state (vestigial in this script).
    tokenNumber = -1
    content = ""
    result_token = []
    result_objects = []

    bulk_data.append(data)
    bulk_count += 1
    # Flush a batch once it exceeds 10 documents.
    if bulk_data.__len__() > 10:
        print("=" * 30)
        print("count " + str(count))
        payload = json.dumps(bulk_data, cls=JSONEncoder) # Works!
        response = requests.request("POST", url, headers=headers, data=payload)
        print(response.text)
        bulk_data = []
        bulk_count = 1


# NOTE(review): content is reset to "" at the end of every loop iteration,
# so this block looks unreachable — confirm whether it can ever fire.
if content != "":
    # key, begin, end, result_token = appendResultToken(
    #     text, key, begin, tokenNumber, result_token
    # )
    data = createIndex(content, result_token)
    bulk_data.append(data)
    bulk_count += 1


# POST whatever is left over after the loop.
if bulk_data.__len__() > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder) # Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)


# Display the output dictionary as JSON (translated from Persian).
print("******* update operation finished! *******")
|
175
import_data/arman_dataset.py
Normal file
175
import_data/arman_dataset.py
Normal file
|
@ -0,0 +1,175 @@
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from decimal import Decimal
|
||||||
|
import os
|
||||||
|
|
||||||
|
# WARNING(review): hard-coded JWT bearer token committed to source control.
# It expires periodically; consider loading it from an env var instead.
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

# Bulk-add endpoint for NER results on "arman" corpus documents.
url = "https://api.tavasi.ir/repo/dataset/multi/add/arman/ner"
headers = HEADERS

# Locate the input file relative to the current working directory.
# NOTE(review): tests for "impoert_data" (sic) while the repo directory
# appears to be "import_data" — confirm the on-disk spelling.
address = os.getcwd()
if 'impoert_data' in address:
    address += '/data/arman_train.txt'
else:
    address += '/impoert_data/data/arman_train.txt'

# Open the text file (translated from Persian). The file looks like
# CoNLL-style "token TAG" pairs, one per line, sentences separated by
# blank lines — confirm against the actual arman_train.txt.
with open(address, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Convert the text into a list of lines (translated from Persian).
lines = input_text.strip().split('\n')

# Parser state shared with appendResultToken:
key = ''           # label of the entity currently being accumulated
begin = -1         # token index where the current entity started
end = -1           # token index where the last entity ended
tokenNumber = -1   # running token index within the current sentence
content = ''       # sentence text accumulated so far
result_token = []  # finished entity annotations for the current sentence
|
||||||
|
|
||||||
|
class JSONEncoder(json.JSONEncoder):
    """json.JSONEncoder that serializes Decimal values as floats."""

    def default(self, obj):
        # Decimal values are not JSON-serializable by default; emit floats.
        if isinstance(obj, Decimal):
            return float(obj)
        # Defer everything else to the base implementation.
        return super().default(obj)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# def createIndex(content, result_token):
|
||||||
|
# output = {
|
||||||
|
# "content": content,
|
||||||
|
# "domain": "پیکره آرمان",
|
||||||
|
# "ref_id": "",
|
||||||
|
# "ref_url": "",
|
||||||
|
# "result_token": result_token,
|
||||||
|
# }
|
||||||
|
# # print(output)
|
||||||
|
# # print(json.dumps(output, indent=4, ensure_ascii=False))
|
||||||
|
# return output
|
||||||
|
def createIndex(content, result_token):
    """Build one NER dataset document for the bulk-upload payload.

    Args:
        content: the sentence text (space-joined tokens).
        result_token: list of {"begin", "end", "result_key", "text"}
            entity annotations for that sentence.

    Returns:
        dict ready to be JSON-serialized and POSTed to the dataset API;
        the annotations are wrapped in a single "arman_ner" result object.
    """
    result_objects = [{
        "task": "ner",
        "key": "arman_ner",
        "label": "خروجی تیم آرمان",
        "values": result_token
    }]
    return {
        "content": content,
        "domain": "پیکره آرمان",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def appendResultToken(text, key, begin, tokenNumber, result_token):
    # Flush the entity currently being accumulated (if any) into
    # result_token, normalizing its tag first, then reset the accumulator.
    #
    # text:         surface form of the entity collected so far
    # key:          entity label; falsy means "no open entity" -> append nothing
    # begin:        token index where the entity started
    # tokenNumber:  index of the *current* token; the entity ends one before it
    # result_token: list of {"begin","end","result_key","text"} dicts (mutated)
    #
    # Returns (key, begin, end, result_token) with the accumulator cleared.
    end = -1
    # if key == 'HALFREFERENCE' :
    #     key = 'H_REF'
    # elif key == 'REFERENCE' :
    #     key = 'REF'
    if key:
        # Normalize lowercase corpus tags to the uppercase keys the API uses.
        # NOTE(review): 'Facility' maps to lowercase 'fac' while every other
        # mapping uppercases — confirm this asymmetry is intentional.
        if key == 'org' :
            key = 'ORG'
        elif key == 'loc' :
            key = 'LOC'
        elif key == 'Facility' :
            key = 'fac'
        elif key == 'event' :
            key = 'EVENT'
        elif key == 'pro' :
            key = 'PRO'
        elif key == 'pers' :
            key = 'PER'

        # The entity ends at the token before the current one.
        end = tokenNumber -1
        result_token.append({
            "begin": begin,
            "end": end,
            "result_key": key,
            "text" : text
        })
    # Clear accumulator state for the next entity.
    # NOTE(review): original indentation lost in the diff view — these
    # resets are assumed unconditional; confirm against the real file
    # (behavior is equivalent for all call sites in this script).
    begin = -1
    end = -1
    key = ''
    return key, begin, end, result_token
|
||||||
|
|
||||||
|
# --- CoNLL parsing + bulk upload loop -----------------------------------
# Walks the "token TAG" lines, rebuilding sentences and their BIO-tagged
# entities, and POSTs one document per sentence in batches of ~100.
bulk_data = []   # documents waiting to be POSTed
bulk_count = 1   # batch counter (informational)
count = 0        # total lines processed
text = ''        # surface form of the entity currently being accumulated

for i, line in enumerate(lines):
    print('line: ' + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    # Blank line = sentence boundary: flush the open entity and the
    # finished sentence, then start a new one.
    if line.strip() == '' :
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        data = createIndex(content, result_token)
        tokenNumber = -1
        content = ''
        result_token = []

        bulk_data.append(data)
        bulk_count +=1
        # Flush a batch once it exceeds 100 documents.
        if bulk_data.__len__() > 100:
            print('=' * 30 )
            print('count ' + str(count))
            payload = json.dumps(bulk_data, cls=JSONEncoder) #Works!
            response = requests.request("POST", url, headers=headers, data=payload)
            print(response)
            bulk_data = []
            bulk_count = 1

        continue

    # Expect exactly "token TAG"; skip malformed lines.
    parts = line.split()
    if len(parts) != 2:
        continue

    content += ' ' + parts[0]

    result_key = parts[1]
    # I- tag: the entity continues; extend its surface text.
    if result_key.startswith('I-'):
        text += ' ' + parts[0]
        continue

    # O tag: any open entity ends at the previous token.
    if result_key == 'O' :
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

    # B- tag: close any open entity, then open a new one at this token.
    if result_key.startswith('B-'):
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        text = parts[0]
        begin = tokenNumber
        end = -1
        key = result_key.replace('B-', '')


# Flush the final sentence if the file did not end with a blank line.
if content != '' :
    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
    data = createIndex(content, result_token)
    bulk_data.append(data)
    bulk_count +=1


# POST whatever is left over after the loop.
if bulk_data.__len__() > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder) #Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)


# Display the output dictionary as JSON (translated from Persian).
print("***************** end ")
|
20
import_data/clean_tags.py
Normal file
20
import_data/clean_tags.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
def clean_text_file(input_file, output_file):
    """Strip NER tag tokens from *input_file* and write the cleaned,
    whitespace-normalized text to *output_file* as one line."""
    with open(input_file, 'r', encoding='utf-8') as src:
        stripped_lines = [raw.strip() for raw in src.readlines()]

    # Collapse the file into a single space-separated string.
    combined_text = ' '.join(stripped_lines)

    # Remove standalone tag tokens, then squeeze repeated whitespace.
    cleaned_text = re.sub(r'\b(B-ORG|I-ORG|O|B-HALFREFERENCE|I-HALFREFERENCE|B-REFERENCE|I-REFERENCE|B-DATE2|I-DATE2)\b', '', combined_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write(cleaned_text)
|
||||||
|
|
||||||
|
# Paths are relative to the current working directory — the script must be
# run from the directory containing test_dataset.txt.
input_path = 'test_dataset.txt'
output_path = 'output2.txt'

clean_text_file(input_path, output_path)

# (Persian: "The file contents were cleaned and saved to the new file.")
print("محتوای فایل تمیز شد و در فایل جدید ذخیره گردید.")
|
42
import_data/config_base.py
Normal file
42
import_data/config_base.py
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
import pyodbc
|
||||||
|
|
||||||
|
# region constant values
# This token is very important: it expires every so often, and a fresh,
# valid token must be obtained from the laws site while logged in and
# substituted for the token below. (Translated from Persian.)
# WARNING(review): credential committed to source control — consider an
# env var or secrets store.
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjA1MDUwMzAsImp0aSI6ImxPYVpGUWRhZXlSVkVUY3NNeUp2bUNTNXhHbmJZSTBHIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcyMTgwNTAyOSwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjEsImZpcnN0X25hbWUiOiJcdTA2MjhcdTA2MzFcdTA2NDZcdTA2MjdcdTA2NDVcdTA2NDcgXHUwNjQ2XHUwNjQ4XHUwNmNjXHUwNjMzIiwibGFzdF9uYW1lIjoiXHUwNjQxXHUwNjQ2XHUwNmNjIiwiZW1haWwiOiJkZXZAZ21haWwuY29tIiwidXNlcm5hbWUiOiJkZXYiLCJ1c2VyX2xldmVsIjoyfX0.PBIThW_3mG29ZPHiYH3wfwbBLN5jkG_q1qv6IFkwYz8'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
# endregion
|
||||||
|
|
||||||
|
def create_cursor(dbname = ''):
    """Open a pyodbc connection to the given SQL Server database.

    Args:
        dbname: database name; defaults to "Qavanin" when empty.

    Returns:
        (cursor, connection) tuple; the caller is responsible for
        closing the connection.
    """
    if dbname == '':
        dbname = "Qavanin"

    # server = "DESKTOP-ENVCI85"
    # database = dbname
    # username = "Mofid3"
    # password = "12345"

    # ******** important ********  (translated from Persian)
    # uncomment for jokar pc
    server = "MOFID5"
    database = dbname
    username = "Mofid5"
    password = "12345"

    # ENCRYPT defaults to yes starting in ODBC Driver SQL Server. It's good to always specify ENCRYPT=yes on the client side to avoid MITM attacks.
    # NOTE(review): Trusted_Connection=yes makes SQL Server use Windows
    # auth and ignore UID/PWD — confirm which auth mode is intended.
    cnxn = pyodbc.connect(
        "DRIVER={SQL Server};SERVER="
        + server
        + ";DATABASE="
        + database
        + ";Trusted_Connection=yes;UID="
        + username
        + ";PWD="
        + password
    )
    # cnxn = pyodbc.connect('DSN=sqldsn;')
    cursor = cnxn.cursor()
    return cursor, cnxn
|
||||||
|
|
Loading…
Reference in New Issue
Block a user