"""Upload the ARMAN NER training corpus to the Tavasi dataset API in bulk.

The input file is read as one token per line in two whitespace-separated
columns (``token tag``); a blank line ends a sentence.  Tags starting with
``B-`` open an entity, tags starting with ``I-`` extend the entity text and
the tag ``O`` closes any open entity.  Every sentence becomes one document
(see ``createIndex``) and documents are POSTed in batches.
"""

import json
import os
from decimal import Decimal

import requests

# SECURITY: a bearer token committed to source control should be treated as
# leaked.  Prefer supplying it through the TAVASI_TOKEN environment variable;
# the literal below is kept only as a backward-compatible fallback and should
# be rotated and removed.
TOKEN = os.environ.get(
    'TAVASI_TOKEN',
    'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc',
)
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

url = "https://api.tavasi.ir/repo/dataset/multi/add/arman/ner"
headers = HEADERS

# A batch is posted as soon as it holds MORE than this many documents.
BULK_SIZE = 100

# Raw tag name -> label stored in the uploaded result.  Tags that are not
# listed here are passed through unchanged.
# NOTE(review): 'Facility' maps to lowercase 'fac' while every other label is
# upper-cased; kept exactly as in the original -- confirm what the API expects.
_KEY_LABELS = {
    'org': 'ORG',
    'loc': 'LOC',
    'Facility': 'fac',
    'event': 'EVENT',
    'pro': 'PRO',
    'pers': 'PER',
}


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Decimal`` values as floats."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return super().default(obj)


def createIndex(content, result_token):
    """Build the document dictionary that is uploaded for one sentence.

    Args:
        content: The sentence text.
        result_token: List of entity dictionaries for the sentence, as built
            by ``appendResultToken``.

    Returns:
        A dictionary with the sentence text, fixed domain/reference fields
        and a single ``result_objects`` entry wrapping ``result_token``.
    """
    result_objects = [
        {
            "task": "ner",
            "key": "arman_ner",
            "label": "خروجی تیم آرمان",
            "values": result_token,
        }
    ]
    return {
        "content": content,
        "domain": "پیکره آرمان",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }


def appendResultToken(text, key, begin, tokenNumber, result_token):
    """Close the currently open entity, if any, and record it.

    Args:
        text: Accumulated text of the open entity.
        key: Raw tag name of the open entity; an empty string means that no
            entity is open and nothing is recorded.
        begin: Token index at which the open entity started.
        tokenNumber: Index of the first token AFTER the entity; the recorded
            ``end`` is ``tokenNumber - 1``.
        result_token: List that receives the entity dictionary (mutated in
            place).

    Returns:
        A ``(key, begin, end, result_token)`` tuple.  After an entity has
        been recorded ``key`` is ``''`` and ``begin``/``end`` are ``-1``;
        when no entity was open, ``key`` and ``begin`` are returned as
        passed in and ``end`` is ``-1``.
    """
    end = -1
    if key:
        result_token.append({
            "begin": begin,
            "end": tokenNumber - 1,
            "result_key": _KEY_LABELS.get(key, key),
            "text": text,
        })
        begin = -1
        key = ''
    return key, begin, end, result_token


def _dataset_path():
    """Return the training-file path, resolved from the working directory."""
    address = os.getcwd()
    if 'impoert_data' in address:
        return address + '/data/arman_train.txt'
    return address + '/impoert_data/data/arman_train.txt'


def _post_bulk(bulk_data):
    """POST one batch of documents as a JSON array and return the response."""
    payload = json.dumps(bulk_data, cls=JSONEncoder)
    # NOTE(review): no timeout is passed, so a stalled connection blocks
    # indefinitely; left as in the original.
    return requests.request("POST", url, headers=headers, data=payload)


def main():
    """Parse the corpus file and upload its sentences in batches."""
    # Read the whole text file.
    with open(_dataset_path(), 'r', encoding='utf-8') as file:
        input_text = file.read()

    # Split the text into a list of lines.
    lines = input_text.strip().split('\n')

    key = ''
    begin = -1
    text = ''
    content = ''
    result_token = []
    token_index = -1  # index of the last accepted token in this sentence
    bulk_data = []

    for i, line in enumerate(lines):
        print('line: ' + str(i))

        if line.strip() == '':
            # Sentence boundary.  Skip empty sentences (e.g. consecutive
            # blank lines) so that no document without content is uploaded.
            if content != '':
                key, begin, _, result_token = appendResultToken(
                    text, key, begin, token_index + 1, result_token)
                bulk_data.append(createIndex(content, result_token))
            key = ''
            begin = -1
            text = ''
            content = ''
            result_token = []
            token_index = -1

            if len(bulk_data) > BULK_SIZE:
                print('=' * 30)
                print('count ' + str(i + 1))
                response = _post_bulk(bulk_data)
                print(response)
                bulk_data = []
            continue

        parts = line.split()
        if len(parts) != 2:
            # Malformed line: it is not added to the content, so it must not
            # consume a token index either.
            continue

        token, result_key = parts
        token_index += 1
        # Every token is prefixed with a space, so the content keeps the
        # leading space the original script produced.
        content += ' ' + token

        if result_key.startswith('I-'):
            text += ' ' + token
            continue

        if result_key == 'O' or result_key.startswith('B-'):
            # The current token is the first one after any open entity.
            key, begin, _, result_token = appendResultToken(
                text, key, begin, token_index, result_token)

        if result_key.startswith('B-'):
            text = token
            begin = token_index
            key = result_key.replace('B-', '')

    # The input was stripped, so the last sentence has no terminating blank
    # line.  The last token may still belong to an open entity, hence the
    # index AFTER it is passed, exactly as at a blank line.
    if content != '':
        key, begin, _, result_token = appendResultToken(
            text, key, begin, token_index + 1, result_token)
        bulk_data.append(createIndex(content, result_token))

    if len(bulk_data) > 0:
        print(len(bulk_data))
        response = _post_bulk(bulk_data)
        # Show the server's reply for the final batch.
        print(response.text)

    print("***************** end ")


if __name__ == "__main__":
    main()