# ai_dataset/main_qa_data/remote_api.py
"""
https://api.text-mining.ir/index.html
O : عدم وجود موجودیت
B-PER : شروع موجودیت شخص یا نام فرد
I-PER : ادامه موجودیت شخص یا نام فرد
B-LOC : شروع موجودیت مکان یا محل خاص
I-LOC : ادامه موجودیت مکان یا محل خاص
B-ORG : شروع موجودیت نام سازمان یا تشکل
I-ORG : ادامه موجودیت نام سازمان یا تشکل
B-DAT : شروع موجودیت تاریخ یا زمان
I-DAT : ادامه موجودیت تاریخ یا زمان
B-EVE : شروع موجودیت رویداد یا حادثه
I-EVE : ادامه موجودیت رویداد یا حادثه
"""
import requests
import json
from funcs import write_to_json, save_to_file_by_address, read_from_json, read_file_by_address
from general_functions import normalize_content
import os
import time
base_address = os.getcwd()
# k_address = base_address + "/main_qa_data/data/dataset.txt"
# data = read_file_by_address(k_address)
# sentences = data.split("\n\n")
##################### Get Token by Api Key ##########################
api_key = "08938eac-f648-ef11-af5f-00163e6496fc"
baseUrl = "http://api.text-mining.ir/api/"
url = baseUrl + "Token/GetToken"
querystring = {"apikey": api_key} # api_key is defined above; replace it with your own key
response = requests.request("GET", url, params=querystring)
data = json.loads(response.text)
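# Defensive guard (added sketch, not part of the original flow): fail fast
# with a readable message if the token endpoint returns an error payload
# without a 'token' field.
if 'token' not in data:
    raise RuntimeError("GetToken failed: " + response.text)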
tokenKey = data['token']
try:
    # Fix UTF-8 output issues on the Windows console.
    # Does nothing if the package is not installed.
    from win_unicode_console import enable
    enable()
except ImportError:
    pass
def utfReverse(s):
    # Reverses a UTF-8 byte string (expects bytes, returns bytes).
    # CAVEAT: this will mangle characters that take more than
    # 2 bytes in UTF-16 (surrogate pairs).
    u = s.decode("utf-8")
    return u[::-1].encode("utf-8")
def callApi(url, data, tokenKey):
    headers = {
        'Content-Type': "application/json",
        'Authorization': "Bearer " + tokenKey,
        'Cache-Control': "no-cache"
    }
    response = requests.request("POST", url, data=data.encode("utf-8"), headers=headers)
    result = response.text # utfReverse(response.text.encode("utf-8"))
    response.close()
    return result
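# Optional sketch of a retry wrapper around callApi for transient network
# errors. Not part of the original script; the retry count and backoff
# values are illustrative assumptions.
def call_api_with_retry(url, data, tokenKey, retries=3, backoff=2.0):
    for attempt in range(retries):
        try:
            return callApi(url, data, tokenKey)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))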
def api_ner_task(section):
    payload = u'"{0}"'.format(section)
    result = json.loads(callApi(url, payload, tokenKey))
    words_dict = []
    section_text = ''
    try:
        for index, phrase in enumerate(result):
            # print(f"({phrase['Word']},{phrase['Tags']['NER']['Item1']}) ")
            pos = phrase['Tags']['POS']['Item1']
            ner_tag = phrase['Tags']['NER']['Item1']
            word = phrase['Word']
            words_dict.append({
                "index"          : str(index),
                "word"           : phrase['Word'],
                "WordComment"    : phrase['WordComment'],
                "StartCharIndex" : phrase['StartCharIndex'],
                "SimplePos"      : phrase['SimplePos'],
                "Pos"            : pos,
                "Ner"            : ner_tag,
                "WordCount"      : phrase['WordCount'],
                "Length"         : phrase['Length'],
                "IsPunc"         : phrase['IsPunc'],
                "IsVerb"         : phrase['IsVerb'],
                "VerbInformation": phrase['VerbInformation']
            })
            section_text += word + " " + ner_tag + "\n"
    except Exception:
        # On failure the API returns an error dict instead of a token list.
        if isinstance(result, dict) and result.get("status") == 400:
            print("\n\nfailed! Error 400 ...")
            exit()
    return section_text, words_dict
############################ Call NER ###############################
url = baseUrl + "NamedEntityRecognition/Detect"
final_ner_dataset = ''
sections_ner_dict = []
sections_85_address = base_address + "/main_qa_data/data/qa_85_sections.json"
sections_15k_address = base_address + "/main_qa_data/data/sections.json"
sections_85 = read_from_json(sections_85_address)
sections_15K = read_file_by_address(sections_15k_address)
sections_15K = sections_15K.splitlines()
sections_15K_list = []
for item in sections_15K:
    section = json.loads(item)
    sections_15K_list.append({
        "id"     : section["id"],
        "content": section["content"]
    })
sections_85_list = [item["id"] for item in sections_85]
current_sections = []
for id in sections_85_list:
    for item in sections_15K_list:
        if id == item["id"]:
            current_sections.append({
                "id":      item["id"],
                "content": item["content"]
            })
            break
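# Note: the nested scan above is O(len(sections_85_list) * len(sections_15K_list)).
# A dict index would make each lookup O(1); a minimal equivalent sketch:
#   by_id = {s["id"]: s for s in sections_15K_list}
#   current_sections = [by_id[i] for i in sections_85_list if i in by_id]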
for i, item in enumerate(current_sections):
    time.sleep(3)  # throttle requests to the remote API
    content = normalize_content(item["content"])
    content = content.replace("\n", " ")
    current_section_dataset, current_section_dict = api_ner_task(content)
    final_ner_dataset += current_section_dataset + "\n\n"
    sections_ner_dict.append({
        "id"             : item["id"],
        "tokens_info"    : current_section_dict,
        "section_dataset": current_section_dataset
    })
    print(str(i) + "/" + str(len(current_sections)))
save_to_file_by_address(base_address + "/main_qa_data/data/dataset.txt", final_ner_dataset.strip())
write_to_json(sections_ner_dict, base_address + "/main_qa_data/data/dataset.json")
print(" *** Finished! *** ")