# ai_dataset/import_data/peyma_dataset.py
#
# 177 lines
# 5.2 KiB
# Python

import json
import requests
from decimal import Decimal
import os
# API credentials and endpoint for the dataset upload.
#
# The token is read from the TAVASI_TOKEN environment variable when it is set,
# so that credentials do not have to be edited into the source.
# NOTE(review): the embedded fallback below is a real JWT committed to source
# control; its `exp` claim decodes to 1719751714 (mid-2024). It is kept only so
# existing invocations behave as before -- rotate it and remove the literal.
TOKEN = os.environ.get(
    'TAVASI_TOKEN',
    'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc',
)
ACCEPT = "application/json"
# The token is sent as the raw Authorization header value, as before.
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
# Bulk endpoint that receives the list of annotated documents.
url = "https://api.tavasi.ir/repo/dataset/multi/add/peyma/ner"
headers = HEADERS
# Locate the PEYMA training file.
#
# The legacy lookup is relative to the current working directory and is tried
# first so that every invocation that used to work still reads the same file.
# NOTE(review): the directory is spelled 'impoert_data' here; if the real
# directory is 'import_data' the legacy path never exists -- confirm.
address = os.getcwd()
if 'impoert_data' in address:
    address += '/data/peyma_train.txt'
else:
    address += '/impoert_data/data/peyma_train.txt'
if not os.path.isfile(address):
    # Fall back to the data directory that sits next to this script, which is
    # independent of both the working directory and the directory spelling.
    address = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data', 'peyma_train.txt'
    )

# Open the text file.
with open(address, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Split the text into a list of lines.
lines = input_text.strip().split('\n')

# Parser state shared with the main loop further down.
key = ''            # label of the entity currently open ('' = none)
begin = -1          # token index where the open entity starts
end = -1            # token index where the last closed entity ended
tokenNumber = -1    # token index within the current sentence
content = ''        # text of the current sentence, tokens joined by spaces
result_token = []   # entity spans collected for the current sentence
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally knows how to serialise ``Decimal``."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for *obj*.

        ``Decimal`` values are converted to ``float``; every other type is
        handed to the base class implementation.
        """
        if not isinstance(obj, Decimal):
            return super().default(obj)
        return float(obj)
# def createIndex(content, result_token):
# output = {
# "content": content,
# "domain": "پیکره پیما",
# "ref_id": "",
# "ref_url": "",
# "result_token": result_token,
# }
# # print(output)
# # print(json.dumps(output, indent=4, ensure_ascii=False))
# return output
def createIndex(content, result_token):
    """Build the upload document for one sentence.

    Args:
        content: Text of the sentence.
        result_token: List of entity span dicts found in the sentence; stored
            as-is under the ``values`` key of the single NER result object.

    Returns:
        A dict with the keys ``content``, ``domain``, ``ref_id``, ``ref_url``
        and ``result_objects`` (a one-element list holding the NER result).
    """
    ner_result = {
        "task": "ner",
        "key": "peyma_ner",
        "label": "خروجی تیم پیما",
        "values": result_token,
    }
    return {
        "content": content,
        "domain": "پیکره پیما",
        "ref_id": "",
        "ref_url": "",
        "result_objects": [ner_result],
    }
def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Close the entity that is currently open, if any, and record its span.

    Args:
        text: Surface text of the entity being closed.
        key: Label of the open entity (for example ``'ORG'`` or ``'LOC'``).
            A falsy value means no entity is open and nothing is recorded.
        begin: Token index at which the entity starts.
        tokenNumber: Token index just past the entity; the recorded ``end``
            is ``tokenNumber - 1``.
        result_token: List of span dicts for the current sentence. It is
            appended to in place and also returned.

    Returns:
        A ``(key, begin, end, result_token)`` tuple holding the state after
        the call. When a span was recorded the state is reset to
        ``('', -1, -1)``; otherwise ``key`` and ``begin`` are returned
        unchanged and ``end`` is ``-1``.
    """
    if not key:
        # No entity is open: leave the caller's state untouched.
        return key, begin, -1, result_token

    # The label is stored as-is; the former per-label chain only assigned
    # each label to itself and has been dropped.
    result_token.append({
        "begin": begin,
        "end": tokenNumber - 1,
        "result_key": key,
        "text": text,
    })
    return '', -1, -1, result_token
def _post_bulk(batch):
    """POST *batch* (a list of documents) to the endpoint as one JSON array."""
    payload = json.dumps(batch, cls=JSONEncoder)
    return requests.request("POST", url, headers=headers, data=payload)


# Parse the corpus one "token|label" line at a time. A blank line ends a
# sentence; each finished sentence becomes one document, and documents are
# uploaded in batches.
bulk_data = []   # documents waiting to be uploaded
count = 0        # input lines consumed so far (progress output only)
text = ''        # surface text of the entity currently being collected

for i, line in enumerate(lines):
    print('line: ' + str(i))
    count += 1

    if line.strip() == '':
        # Sentence boundary. Skip empty sentences (e.g. consecutive blank
        # lines) so that no empty documents are uploaded, matching the guard
        # used for the final sentence below.
        if content != '':
            # tokenNumber is the index of the last token, so the position
            # just past a still-open entity is tokenNumber + 1.
            key, begin, end, result_token = appendResultToken(
                text, key, begin, tokenNumber + 1, result_token)
            bulk_data.append(createIndex(content, result_token))
        tokenNumber = -1
        content = ''
        result_token = []
        if len(bulk_data) > 100:
            print('=' * 30)
            print('count ' + str(count))
            response = _post_bulk(bulk_data)
            print(response)
            bulk_data = []
        continue

    parts = line.split('|')
    if len(parts) != 2:
        # Malformed line: it contributes no token, so it must not advance the
        # token index either.
        continue

    # Only lines that really add a token to `content` are counted.
    tokenNumber += 1
    token = parts[0]
    result_key = parts[1].strip()

    # NOTE(review): an earlier revision built a ZWNJ-joined form of
    # multi-word tokens here but never used it; the raw token has always been
    # what ends up in `content`, and that is preserved.
    content += ' ' + token

    if result_key.startswith('I_'):
        # Continuation of the open entity: extend its surface text.
        text += ' ' + token
        continue

    if result_key == 'O' or result_key.startswith('B_'):
        # The current token is not part of the open entity, so that entity
        # ends on the previous token.
        key, begin, end, result_token = appendResultToken(
            text, key, begin, tokenNumber, result_token)

    if result_key.startswith('B_'):
        # Open a new entity starting at the current token.
        text = token
        begin = tokenNumber
        end = -1
        key = result_key.replace('B_', '')

# The input is stripped, so the last sentence has no trailing blank line and
# must be flushed here.
if content != '':
    # As at a sentence boundary, the open entity (if any) ends on the last
    # token, hence tokenNumber + 1.
    key, begin, end, result_token = appendResultToken(
        text, key, begin, tokenNumber + 1, result_token)
    bulk_data.append(createIndex(content, result_token))

if bulk_data:
    # Upload whatever is left in the final, partially filled batch.
    print(len(bulk_data))
    response = _post_bulk(bulk_data)
    print(response.text)

print("***************** end ")