Upload files to "import_data"
This commit is contained in:
parent
faadbda008
commit
18db5b6fbc
175
import_data/hamta_dataset.py
Normal file
@@ -0,0 +1,175 @@
import json
import requests
from decimal import Decimal
import os

TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

url = ""
headers = HEADERS

address = os.getcwd()
if 'import_data' in address:
    address += '/data/arman_train.txt'
else:
    address += '/import_data/data/arman_train.txt'

# Open the text file
with open(address, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Split the text into a list of lines
lines = input_text.strip().split('\n')

key = ''
begin = -1
end = -1
tokenNumber = -1
content = ''
result_token = []

class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return json.JSONEncoder.default(self, obj)

# def createIndex(content, result_token):
#     output = {
#         "content": content,
#         "domain": "پیکره آرمان",
#         "ref_id": "",
#         "ref_url": "",
#         "result_token": result_token,
#     }
#     # print(output)
#     # print(json.dumps(output, indent=4, ensure_ascii=False))
#     return output

def createIndex(content, result_token):
    result_objects = [{
        "task": "ner",
        "key": "qavanin_keyword",
        "label": "خروجی تیم همتا",
        "values": result_token
    }]
    output = {
        "content": content,
        "domain": "پیکره قوانین(کلیدواژه)",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
    # print(output)
    # print(json.dumps(output, indent=4, ensure_ascii=False))
    return output

def appendResultToken(text, key, begin, tokenNumber, result_token):
    end = -1
    # if key == 'HALFREFERENCE':
    #     key = 'H_REF'
    # elif key == 'REFERENCE':
    #     key = 'REF'
    if key:
        if key == 'org':
            key = 'ORG'
        elif key == 'loc':
            key = 'LOC'
        elif key == 'Facility':
            key = 'fac'
        elif key == 'event':
            key = 'EVENT'
        elif key == 'pro':
            key = 'PRO'
        elif key == 'pers':
            key = 'PER'

        end = tokenNumber - 1
        result_token.append({
            "begin": begin,
            "end": end,
            "result_key": key,
            "text": text
        })
    begin = -1
    end = -1
    key = ''
    return key, begin, end, result_token

bulk_data = []
bulk_count = 1
count = 0
text = ''

for i, line in enumerate(lines):
    print('line: ' + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    if line.strip() == '':
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        data = createIndex(content, result_token)
        tokenNumber = -1
        content = ''
        result_token = []

        bulk_data.append(data)
        bulk_count += 1
        if len(bulk_data) > 100:
            print('=' * 30)
            print('count ' + str(count))
            payload = json.dumps(bulk_data, cls=JSONEncoder)  #Works!
            response = requests.request("POST", url, headers=headers, data=payload)
            print(response)
            bulk_data = []
            bulk_count = 1

        continue

    parts = line.split()
    if len(parts) != 2:
        continue

    content += ' ' + parts[0]

    result_key = parts[1]
    if result_key.startswith('I-'):
        text += ' ' + parts[0]
        continue

    if result_key == 'O':
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

    if result_key.startswith('B-'):
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        text = parts[0]
        begin = tokenNumber
        end = -1
        key = result_key.replace('B-', '')


if content != '':
    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
    data = createIndex(content, result_token)
    bulk_data.append(data)
    bulk_count += 1


if len(bulk_data) > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder)  #Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)

# Display the output dictionary as JSON
print("***************** end ")
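For orientation: hamta_dataset.py assumes arman_train.txt holds one whitespace-separated "token TAG" pair per line, with blank lines between sentences. A minimal sketch (the tokens below are invented for illustration) of what the loop above builds for one short sentence:

# Illustrative input lines in the Arman format (invented example, not real data):
sample = "تهران B-loc\nبزرگ I-loc\nاست O\n"
# Tracing the loop above over these three lines gives:
expected_content = " تهران بزرگ است"
expected_entity = {"begin": 0, "end": 1, "result_key": "LOC", "text": "تهران بزرگ"}
# createIndex(expected_content, [expected_entity]) wraps them into the document
# that is bulk-posted (note that url is still empty in this script).
print(expected_content, expected_entity)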
67
import_data/label_tokens.py
Normal file
@@ -0,0 +1,67 @@
from docx import Document
import re


def extract_key_phrases(file_path):
    doc = Document(file_path)
    key_phrases = []
    pattern = re.compile(r'!(.*?)!')

    for paragraph in doc.paragraphs:
        matches = pattern.findall(paragraph.text)
        key_phrases.extend(matches)

    return key_phrases

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    text = ''

    for paragraph in doc.paragraphs:
        text += paragraph.text + ' '

    return text.strip()

def tokenize_and_tag(text, key_phrases):
    # Longest phrases first, so a shorter phrase cannot match inside a longer one.
    key_phrases_sorted = sorted(key_phrases, key=len, reverse=True)
    for key_phrase in key_phrases_sorted:
        # Drop the !...! markers, leaving the phrase surrounded by spaces.
        text = text.replace(f'!{key_phrase}!', f' {key_phrase} ')

    tokens = re.findall(r'\S+|[^\s\w]', text)
    tagged_tokens = []
    i = 0

    while i < len(tokens):
        token = tokens[i]
        matched = False

        for key_phrase in key_phrases_sorted:
            key_tokens = key_phrase.split()
            if tokens[i:i + len(key_tokens)] == key_tokens:
                tagged_tokens.append((key_tokens[0], 'B-key'))
                for key_token in key_tokens[1:]:
                    tagged_tokens.append((key_token, 'I-key'))
                i += len(key_tokens)
                matched = True
                break

        if not matched:
            tagged_tokens.append((token, 'O'))
            i += 1

    return tagged_tokens

def write_tokens_to_file(tagged_tokens, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        for token, tag in tagged_tokens:
            f.write(f"{token}\t{tag}\n")


file_path = 'D:/project/output.docx'
output_file = 'output.txt'

text = extract_text_from_docx(file_path)
key_phrases = extract_key_phrases(file_path)
tagged_tokens = tokenize_and_tag(text, key_phrases)
write_tokens_to_file(tagged_tokens, output_file)
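As a usage sketch: key phrases are marked in the source document by wrapping them in exclamation marks, and tokenize_and_tag emits B-key/I-key/O tags. Applied directly to a plain string (the text and phrase below are invented; the real script reads them from output.docx):

# Illustrative call against the functions defined above.
sample_text = 'این سند درباره !هوش مصنوعی! است'
sample_phrases = ['هوش مصنوعی']
print(tokenize_and_tag(sample_text, sample_phrases))
# [('این', 'O'), ('سند', 'O'), ('درباره', 'O'),
#  ('هوش', 'B-key'), ('مصنوعی', 'I-key'), ('است', 'O')]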
1367
import_data/normalizer.py
Normal file
File diff suppressed because it is too large
177
import_data/peyma_dataset.py
Normal file
@@ -0,0 +1,177 @@
import json
import requests
from decimal import Decimal
import os

TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}

url = "https://api.tavasi.ir/repo/dataset/multi/add/peyma/ner"
headers = HEADERS

address = os.getcwd()
if 'import_data' in address:
    address += '/data/peyma_train.txt'
else:
    address += '/import_data/data/peyma_train.txt'

# Open the text file
with open(address, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Split the text into a list of lines
lines = input_text.strip().split('\n')

key = ''
begin = -1
end = -1
tokenNumber = -1
content = ''
result_token = []

class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return json.JSONEncoder.default(self, obj)

# def createIndex(content, result_token):
#     output = {
#         "content": content,
#         "domain": "پیکره پیما",
#         "ref_id": "",
#         "ref_url": "",
#         "result_token": result_token,
#     }
#     # print(output)
#     # print(json.dumps(output, indent=4, ensure_ascii=False))
#     return output

def createIndex(content, result_token):
    result_objects = [{
        "task": "ner",
        "key": "peyma_ner",
        "label": "خروجی تیم پیما",
        "values": result_token
    }]
    output = {
        "content": content,
        "domain": "پیکره پیما",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
    }
    # print(output)
    # print(json.dumps(output, indent=4, ensure_ascii=False))
    return output

def appendResultToken(text, key, begin, tokenNumber, result_token):
    end = -1
    # if key == 'HALFREFERENCE':
    #     key = 'H_REF'
    # elif key == 'REFERENCE':
    #     key = 'REF'
    if key:
        if key == 'ORG':
            key = 'ORG'
        elif key == 'MON':
            key = 'MON'
        elif key == 'LOC':
            key = 'LOC'
        elif key == 'DAT':
            key = 'DAT'
        elif key == 'TIM':
            key = 'TIM'
        elif key == 'PER':
            key = 'PER'
        elif key == 'PCT':
            key = 'PCT'

        end = tokenNumber - 1
        result_token.append({
            "begin": begin,
            "end": end,
            "result_key": key,
            "text": text
        })
    begin = -1
    end = -1
    key = ''
    return key, begin, end, result_token

bulk_data = []
bulk_count = 1
count = 0
text = ''

for i, line in enumerate(lines):
    print('line: ' + str(i))
    count += 1
    tokenNumber += 1

    if line.strip() == '':
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        data = createIndex(content, result_token)
        tokenNumber = -1
        content = ''
        result_token = []

        bulk_data.append(data)
        bulk_count += 1
        if len(bulk_data) > 100:
            print('=' * 30)
            print('count ' + str(count))
            payload = json.dumps(bulk_data, cls=JSONEncoder)  #Works!
            response = requests.request("POST", url, headers=headers, data=payload)
            print(response)
            bulk_data = []
            bulk_count = 1

        continue

    parts = line.split('|')
    if len(parts) != 2:
        continue

    result_key = parts[1]
    token_part = (parts[0].strip()).split()
    if len(token_part) > 1:
        # Join two-word tokens with a zero-width non-joiner; note that
        # new_token_part is not used below, so content still receives the raw token.
        new_token_part = token_part[0].strip() + '\u200c' + token_part[1].strip()
        content += ' ' + parts[0]
    else:
        content += ' ' + parts[0]

    if result_key.startswith('I_'):
        text += ' ' + parts[0]
        continue

    if result_key == 'O':
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

    if result_key.startswith('B_'):
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        text = parts[0]
        begin = tokenNumber
        end = -1
        key = result_key.replace('B_', '')


if content != '':
    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
    data = createIndex(content, result_token)
    bulk_data.append(data)
    bulk_count += 1


if len(bulk_data) > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder)  #Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.text)

# Display the output dictionary as JSON
print("***************** end ")
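The PEYMA file differs from the Arman one in two ways: each line is pipe-separated ("token|TAG") and the tag prefixes use underscores (B_, I_). A minimal sketch (invented tokens, not real data) of what the loop above builds:

# Illustrative input lines in the PEYMA format:
sample = "بانک|B_ORG\nمرکزی|I_ORG\nاعلام|O\n"
# Tracing the loop above over these three lines gives:
expected_entity = {"begin": 0, "end": 1, "result_key": "ORG", "text": "بانک مرکزی"}
print(expected_entity)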
172
import_data/qavanin_dataset_test.py
Normal file
@@ -0,0 +1,172 @@
from decimal import Decimal
import requests
import json
import os

"""NOTE: the id_token can be found in the browser dev tools under Application > Local Storage > id_token."""

# Get a new token from the address below and replace it here:
# api.tavasi.ir token and url
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjQ5Mzk3NTksImp0aSI6IktwbjlBdEV1ZGh2WngxZUFXOFRRRUFrcmpcL01xa2JmWiIsImlzcyI6bnVsbCwiZXhwIjoxNzI2MjM5NzU4LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.r_A8IMQdN50ZM8oPmIq31Pz-phPDJyfQYLmbsLjv4Ao'
url = "https://api.tavasi.ir/repo/dataset/multi/add/qasection_test/ner"

# Mortaza Server token and url
# Get a new token from the address below and replace it here:
# 192.168.23.160 token
#TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjUxMDk1NDAsImp0aSI6IjdWWUhCTFhiMVJFb3VpY2dORzE1dHRkcnBObU1cL0JobCIsImlzcyI6bnVsbCwiZXhwIjoxNzI2NDA5NTM5LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjEsImZpcnN0X25hbWUiOiJcdTA2MjhcdTA2MzFcdTA2NDZcdTA2MjdcdTA2NDVcdTA2NDcgXHUwNjQ2XHUwNjQ4XHUwNmNjXHUwNjMzIiwibGFzdF9uYW1lIjoiXHUwNjQxXHUwNjQ2XHUwNmNjIiwiZW1haWwiOiJkZXZAZ21haWwuY29tIiwidXNlcm5hbWUiOiJkZXYiLCJ1c2VyX2xldmVsIjoyfX0.WayvUXRnBukSyc6o_Qv4dClTv9Hn75uOTcCfzGQ9El8'
#url = "http://192.168.23.160:8080/repo/dataset/multi/add/qasection_test/ner"

ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
headers = HEADERS

address = './import_data/data/sample_dataset.txt'
# address = os.getcwd()
# if 'import_data' in address:
#     address += '/data/sample_dataset.txt'
# else:
#     address += '/import_data/data/sample_dataset.txt'

# Open the text file
with open(address, 'r', encoding='utf-8') as file:
    input_text = file.read()

# Split the text into a list of lines
lines = input_text.strip().split('\n')

key = ''
begin = -1
end = -1
tokenNumber = -1
content = ''
result_token = []

class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return json.JSONEncoder.default(self, obj)

def createIndex(content, result_token, section_order):
    result_objects = [{
        "task": "ner",
        "key": "qavanin_ner",
        "label": "خروجی تیم همتا",
        "values": result_token
    }]
    output = {
        "content": content,
        "domain": "پیکره قوانین آزمایشی",
        "ref_id": "",
        "ref_url": "",
        "result_objects": result_objects,
        "order": str(section_order),
    }
    # print(output)
    # print(json.dumps(output, indent=4, ensure_ascii=False))
    return output

def appendResultToken(text, key, begin, tokenNumber, result_token):
    end = -1
    # if key == 'HALFREFERENCE':
    #     key = 'H_REF'
    # elif key == 'REFERENCE':
    #     key = 'REF'
    if key:
        # if key == 'org':
        #     key = 'ORG'
        # elif key == 'loc':
        #     key = 'LOC'
        # elif key == 'Facility':
        #     key = 'fac'
        # elif key == 'event':
        #     key = 'EVENT'
        # elif key == 'pro':
        #     key = 'PRO'
        # elif key == 'pers':
        #     key = 'PER'

        end = tokenNumber - 1
        result_token.append({
            "begin": begin,
            "end": end,
            "result_key": key,
            "text": text
        })
    begin = -1
    end = -1
    key = ''
    return key, begin, end, result_token

bulk_data = []
bulk_count = 1
count = 0
section_order = 0
text = ''

for i, line in enumerate(lines):
    print('line: ' + str(i))
    count += 1
    tokenNumber = tokenNumber + 1

    if line.strip() == '':
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
        section_order += 1
        data = createIndex(content, result_token, section_order)
        tokenNumber = -1
        content = ''
        result_token = []

        bulk_data.append(data)
        bulk_count += 1
        if len(bulk_data) > 100:
            print('=' * 30)
            print('count ' + str(count))
            payload = json.dumps(bulk_data, cls=JSONEncoder)  #Works!
            response = requests.request("POST", url, headers=headers, data=payload)
            print(response)
            bulk_data = []
            bulk_count = 1

        continue

    parts = line.split()
    if len(parts) != 2:
        continue

    content += ' ' + parts[0]

    result_key = parts[1]
    if result_key.startswith('I-'):
        text += ' ' + parts[0]
        continue

    if result_key == 'O':
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

    if result_key.startswith('B-'):
        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)

        text = parts[0]
        begin = tokenNumber
        end = -1
        key = result_key.replace('B-', '')


if content != '':
    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
    section_order += 1
    data = createIndex(content, result_token, section_order)
    bulk_data.append(data)
    bulk_count += 1


if len(bulk_data) > 0:
    print(bulk_count)
    payload = json.dumps(bulk_data, cls=JSONEncoder)  #Works!
    response = requests.request("POST", url, headers=headers, data=payload)
    print(response)

# Display the output dictionary as JSON
print("***************** end *****************")
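Before posting to the qasection_test index it can help to dry-run a single document; a sketch using the helpers defined above (the content and entity values are invented for illustration):

# Sketch only: build and inspect one document without POSTing it.
sample_doc = createIndex(
    ' ماده یک قانون',
    [{"begin": 0, "end": 1, "result_key": "REF", "text": "ماده یک"}],  # invented entity
    1,
)
print(json.dumps(sample_doc, indent=4, ensure_ascii=False, cls=JSONEncoder))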