"""Upload a PEYMA NER training file to the Tavasi dataset API in batches.

The input file holds one ``token|label`` pair per line; a blank line ends a
sentence.  Labels handled here are ``O``, ``B_<TYPE>`` (an entity starts) and
``I_<TYPE>`` (the open entity continues).  Every sentence becomes one
document (see ``createIndex``) and documents are POSTed in batches.
"""
import json
import os
from decimal import Decimal

# NOTE(review): an API credential is committed in source.  It can now be
# supplied through the TAVASI_TOKEN environment variable; the literal is kept
# only as a backward-compatible fallback and should be rotated and removed.
TOKEN = os.environ.get(
    'TAVASI_TOKEN',
    'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc',
)
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

url = "https://api.tavasi.ir/repo/dataset/multi/add/peyma/ner"
headers = HEADERS

# A batch is sent as soon as it holds MORE than this many documents
# (i.e. batches of 101), matching the original threshold.
BATCH_LIMIT = 100


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``Decimal`` values as floats."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)


def createIndex(content, result_token):
    """Build the document dict for one sentence.

    Args:
        content: The sentence text accumulated from the token column.
        result_token: List of entity dicts produced by ``appendResultToken``.

    Returns:
        A dict with ``content``, ``domain``, empty ``ref_id``/``ref_url`` and
        a single-element ``result_objects`` list wrapping ``result_token``.
    """
    result_objects = [
        {
            "task": "ner",
            "key": "peyma_ner",
            "label": "خروجی تیم پیما",
            "values": result_token,
        }
    ]
    return {
        "content": content,
        "domain": "پیکره پیما",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }


def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Close the currently open entity, if there is one.

    Args:
        text: Surface text of the open entity.
        key: Entity type of the open entity; falsy when none is open.
        begin: Index of the entity's first token.
        tokenNumber: Index of the token FOLLOWING the entity; the entity's
            ``end`` is recorded as ``tokenNumber - 1``.
        result_token: List the entity dict is appended to (mutated in place).

    Returns:
        ``(key, begin, end, result_token)``.  When an entity was appended the
        state is reset to ``('', -1, -1, result_token)``; otherwise ``key``
        and ``begin`` are returned untouched and ``end`` is ``-1``.
    """
    if not key:
        return key, begin, -1, result_token

    result_token.append({
        "begin": begin,
        "end": tokenNumber - 1,
        "result_key": key,
        "text": text,
    })
    return '', -1, -1, result_token


def buildDocuments(lines, verbose=False):
    """Yield ``(lines_consumed, document)`` for every sentence in ``lines``.

    Token indices (``begin``/``end``) count only the tokens that were actually
    added to the sentence content, so lines that do not split into exactly
    ``token|label`` are skipped without shifting later indices.  Sentences
    with no content (e.g. consecutive blank lines) produce no document, and an
    entity that is still open at the end of the input is closed on the last
    token rather than one token early.

    Args:
        lines: Iterable of raw input lines without trailing newlines.
        verbose: When true, print ``line: <zero-based index>`` per line.

    Yields:
        Tuples of the 1-based number of lines consumed so far and the
        document dict built by ``createIndex``.
    """
    key = ''
    begin = -1
    text = ''
    content = ''
    result_token = []
    token_count = 0  # tokens accepted into the current sentence

    for line_count, line in enumerate(lines, start=1):
        if verbose:
            print('line: ' + str(line_count - 1))

        if line.strip() == '':
            # Sentence boundary: flush only when something was collected.
            if content != '':
                appendResultToken(text, key, begin, token_count, result_token)
                yield line_count, createIndex(content, result_token)
            key = ''
            begin = -1
            text = ''
            content = ''
            result_token = []
            token_count = 0
            continue

        parts = line.split('|')
        if len(parts) != 2:
            continue

        token = parts[0]
        # Strip so trailing whitespace cannot hide a label or leak into the key.
        label = parts[1].strip()

        token_index = token_count
        token_count += 1
        content += ' ' + token

        if label.startswith('I_'):
            text += ' ' + token
            continue

        starts_entity = label.startswith('B_')
        if label == 'O' or starts_entity:
            # The open entity (if any) ended on the previous token.
            key, begin, _, result_token = appendResultToken(
                text, key, begin, token_index, result_token)

        if starts_entity:
            text = token
            begin = token_index
            key = label.replace('B_', '')

    # The input has no trailing blank line after the last sentence.
    if content != '':
        appendResultToken(text, key, begin, token_count, result_token)
        yield line_count, createIndex(content, result_token)


def _resolveInputPath():
    """Return the path of ``peyma_train.txt`` relative to the working dir."""
    address = os.getcwd()
    if 'impoert_data' in address:
        return address + '/data/peyma_train.txt'
    return address + '/impoert_data/data/peyma_train.txt'


def _postBatch(batch):
    """POST ``batch`` as a JSON array to ``url`` and return the response."""
    # Imported here so the parsing helpers above can be imported (and tested)
    # without the HTTP dependency being installed.
    import requests

    payload = json.dumps(batch, cls=JSONEncoder)
    return requests.request("POST", url, headers=headers, data=payload)


def main():
    """Read the training file, build the documents and upload them."""
    # Open the text file and split it into lines.
    with open(_resolveInputPath(), 'r', encoding='utf-8') as file:
        lines = file.read().strip().split('\n')

    bulk_data = []
    for count, data in buildDocuments(lines, verbose=True):
        bulk_data.append(data)
        if len(bulk_data) > BATCH_LIMIT:
            print('=' * 30)
            print('count ' + str(count))
            print(_postBatch(bulk_data))
            bulk_data = []

    if bulk_data:
        print(len(bulk_data))
        print(_postBatch(bulk_data).text)

    print("***************** end ")


if __name__ == "__main__":
    main()