172 lines
5.8 KiB
Python
172 lines
5.8 KiB
Python
|
from decimal import Decimal
|
||
|
import requests
|
||
|
import json
|
||
|
import os
|
||
|
|
||
|
""" !!! place of token:: Application > Local Storage > id_token !!! """
|
||
|
|
||
|
# --- Upload target configuration -------------------------------------------
# api.tavasi.ir token and url.
# The token has to be replaced with a fresh one from time to time (the note in
# this file points at: Application > Local Storage > id_token).  To avoid
# editing the source for that, both values can be overridden through the
# environment; the literals below are only fallbacks.
TOKEN = os.environ.get(
    'TAVASI_TOKEN',
    'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjQ5Mzk3NTksImp0aSI6IktwbjlBdEV1ZGh2WngxZUFXOFRRRUFrcmpcL01xa2JmWiIsImlzcyI6bnVsbCwiZXhwIjoxNzI2MjM5NzU4LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.r_A8IMQdN50ZM8oPmIq31Pz-phPDJyfQYLmbsLjv4Ao',
)
url = os.environ.get(
    'TAVASI_URL',
    "https://api.tavasi.ir/repo/dataset/multi/add/qasection_test/ner",
)

# Mortaza Server (192.168.23.160): to upload there instead, export
# TAVASI_TOKEN with a token issued by that server and set TAVASI_URL to e.g.
#   http://192.168.23.160:8080/repo/dataset/multi/add/qasection_test/ner

ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
headers = HEADERS  # lower-case alias; this is the name the request calls use

# Path of the annotated input file, relative to the current working directory.
address = './impoert_data/data/sample_dataset.txt'
# Alternative that derives the path from the working directory:
# address = os.getcwd()
# if 'impoert_data' in address:
#     address += '/data/sample_dataset.txt'
# else:
#     address += '/impoert_data/data/sample_dataset.txt'
|
||
|
# Open the text file and read its whole content into memory.
with open(address, mode='r', encoding='utf-8') as dataset_file:
    input_text = dataset_file.read()

# Convert the text into a list of lines; surrounding whitespace (including
# leading/trailing blank lines) is removed before splitting.
lines = input_text.strip().split('\n')
|
||
|
|
||
|
# Parser state for the section currently being read.
# Strings and ints are immutable, so chained assignment is safe here; the
# list gets its own statement.
key = content = ''              # open entity label / accumulated section text
begin = end = tokenNumber = -1  # -1 means "not set yet"
result_token = []               # entities collected for the current section
|
||
|
|
||
|
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally serialises ``Decimal`` values as floats."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for *obj*.

        ``Decimal`` instances are converted to ``float``; every other type is
        handed to the base encoder, which raises ``TypeError`` for objects it
        cannot serialise.
        """
        if not isinstance(obj, Decimal):
            return super().default(obj)
        return float(obj)
|
||
|
|
||
|
|
||
|
def createIndex(content, result_token, section_order):
    """Build the upload document for one section.

    Args:
        content: Text of the section.
        result_token: List of entity dicts found in the section; it is stored
            by reference (not copied) under ``result_objects[0]["values"]``.
        section_order: Ordinal of the section; stored as a string.

    Returns:
        A dict with the keys ``content``, ``domain``, ``ref_id``, ``ref_url``,
        ``result_objects`` and ``order``.
    """
    return {
        "content": content,
        "domain": "پیکره قوانین آزمایشی",
        "ref_id": "",
        "ref_url": "",
        "result_objects": [
            {
                "task": "ner",
                "key": "qavanin_ner",
                "label": "خروجی تیم همتا",
                "values": result_token,
            },
        ],
        "order": str(section_order),
    }
|
||
|
|
||
|
def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Close the currently open entity, if any, and record it.

    Args:
        text: Surface text of the open entity.
        key: Label of the open entity; a falsy value means no entity is open.
        begin: Token index where the open entity starts.
        tokenNumber: Index of the token that follows the entity; the entity's
            ``end`` is recorded as ``tokenNumber - 1``.
        result_token: List the finished entity is appended to (mutated in
            place).

    Returns:
        ``(key, begin, end, result_token)``. After an entity has been recorded
        the first three are reset to ``''``, ``-1`` and ``-1``. When no entity
        was open, ``key`` and ``begin`` are returned unchanged and ``end`` is
        ``-1``.
    """
    if not key:
        # Nothing to close: only `end` is (re)set.
        return key, begin, -1, result_token

    entity = {
        "begin": begin,
        "end": tokenNumber - 1,
        "result_key": key,
        "text": text,
    }
    result_token.append(entity)

    # The entity is closed, so the caller's tracking state starts over.
    return '', -1, -1, result_token
|
||
|
|
||
|
# ---------------------------------------------------------------------------
# Main script: parse the tagged lines into sections and upload them in batches.
#
# Input format, as consumed below: every non-blank line is "<token> <tag>"
# (exactly two whitespace-separated fields); the tag is 'O', 'B-<label>' or
# 'I-<label>'.  A blank line ends a section.  Lines with any other field count
# are ignored.
# ---------------------------------------------------------------------------


def _close_section(text, key, begin, tokenNumber, content, result_token,
                   section_order):
    """Close any open entity and build the upload document for one section.

    ``tokenNumber`` is the index of the last token accepted into the section.
    ``appendResultToken`` records ``end`` as its ``tokenNumber`` argument minus
    one, so one past the last token is passed; an entity that is still open at
    the end of the section therefore ends on the section's last token.
    """
    _, _, _, result_token = appendResultToken(
        text, key, begin, tokenNumber + 1, result_token)
    return createIndex(content, result_token, section_order)


def _post_bulk(documents):
    """POST one batch of documents as a JSON array and print the response."""
    payload = json.dumps(documents, cls=JSONEncoder)
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response)
    return response


bulk_data = []      # documents waiting to be uploaded
bulk_count = 1      # starts at 1 and grows by one per queued document
count = 0           # number of input lines seen so far
section_order = 0   # ordinal of the last section emitted
text = ''           # surface text of the entity currently open

for i, line in enumerate(lines):
    print('line: ' + str(i))
    count += 1

    if line.strip() == '':
        # Blank line: section boundary.  Sections without any token (e.g.
        # several blank lines in a row) are skipped, matching the guard used
        # for the final section after the loop.
        if content != '':
            section_order += 1
            data = _close_section(text, key, begin, tokenNumber, content,
                                  result_token, section_order)
            bulk_data.append(data)
            bulk_count += 1

        # Start the next section from a clean state.
        key, begin, end, text = '', -1, -1, ''
        tokenNumber = -1
        content = ''
        result_token = []

        if len(bulk_data) > 100:
            print('=' * 30 )
            print('count ' + str(count))
            _post_bulk(bulk_data)
            bulk_data = []
            bulk_count = 1

        continue

    parts = line.split()
    if len(parts) != 2:
        continue

    # Count only tokens that actually go into `content`, so begin/end stay
    # aligned with the token positions inside the section text.
    tokenNumber += 1
    content += ' ' + parts[0]

    result_key = parts[1]
    if result_key.startswith('I-'):
        # Continuation of the open entity.
        text += ' ' + parts[0]
        continue

    starts_entity = result_key.startswith('B-')
    if result_key == 'O' or starts_entity:
        # Both tags terminate whatever entity was open before this token.
        key, begin, end, result_token = appendResultToken(
            text, key, begin, tokenNumber, result_token)

    if starts_entity:
        text = parts[0]
        begin = tokenNumber
        end = -1
        # Strip only the leading "B-"; the label itself may contain "B-".
        key = result_key[len('B-'):]

# The input is stripped before splitting, so the last section is normally not
# followed by a blank line and has to be closed here.
if content != '':
    section_order += 1
    data = _close_section(text, key, begin, tokenNumber, content,
                          result_token, section_order)
    bulk_data.append(data)
    bulk_count += 1

# Upload whatever is left in the final, partially filled batch.
if len(bulk_data) > 0:
    print(bulk_count)
    _post_bulk(bulk_data)

# End-of-run marker.
print("***************** end *****************")
|