"""
|
|
https://api.text-mining.ir/index.html
|
|
O : عدم وجود موجودیت
|
|
B-PER : شروع موجودیت شخص یا نام فرد
|
|
I-PER : ادامه موجودیت شخص یا نام فرد
|
|
B-LOC : شروع موجودیت مکان یا محل خاص
|
|
I-LOC : ادامه موجودیت مکان یا محل خاص
|
|
B-ORG : شروع موجودیت نام سازمان یا تشکل
|
|
I-ORG : ادامه موجودیت نام سازمان یا تشکل
|
|
B-DAT : شروع موجودیت تاریخ یا زمان
|
|
I-DAT : ادامه موجودیت تاریخ یا زمان
|
|
B-EVE : شروع موجودیت رویداد یا حادثه
|
|
I-EVE : ادامه موجودیت رویداد یا حادثه
|
|
"""

import json
import os
import time

import requests

from funcs import write_to_json, save_to_file_by_address, read_from_json, read_file_by_address
from general_functions import normalize_content


base_address = os.getcwd()

# k_address = base_address + "/main_qa_data/data/dataset.txt"
# data = read_file_by_address(k_address)
# sentences = data.split("\n\n")


##################### Get Token by Api Key ##########################
api_key = "08938eac-f648-ef11-af5f-00163e6496fc"  # replace with your own API key

baseUrl = "http://api.text-mining.ir/api/"
url = baseUrl + "Token/GetToken"
querystring = {"apikey": api_key}
response = requests.get(url, params=querystring)
data = json.loads(response.text)
tokenKey = data['token']
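
# Sketch (assumption): the GetToken error payload is not documented here, so a
# minimal guard could rely only on the HTTP status code and the presence of
# the 'token' field, e.g.:
#
#     if response.status_code != 200 or "token" not in data:
#         raise RuntimeError("GetToken failed: " + response.text)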


try:
    # Fix UTF-8 output issues on the Windows console.
    # Does nothing if the package is not installed.
    from win_unicode_console import enable
    enable()
except ImportError:
    pass


def utfReverse(s):
    # Reverse a UTF-8 encoded byte string character by character.
    # CAVEAT: this may mess up characters that are more than 2 bytes long in UTF-16.
    u = s.decode("utf-8")
    return u[::-1].encode("utf-8")


def callApi(url, data, tokenKey):
    headers = {
        'Content-Type': "application/json",
        'Authorization': "Bearer " + tokenKey,
        'Cache-Control': "no-cache"
    }
    response = requests.post(url, data=data.encode("utf-8"), headers=headers)
    result = response.text  # utfReverse(response.text.encode("utf-8"))
    response.close()
    return result
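
# Usage sketch (hypothetical, not executed): callApi posts a JSON-encoded string
# body and returns the raw response text, which callers parse with json.loads(),
# assuming `url` points at an analysis endpoint such as the NER one set below:
#
#     parsed = json.loads(callApi(url, u'"متن نمونه"', tokenKey))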


def api_ner_task(section):
    payload = u'"{0}"'.format(section)
    result = json.loads(callApi(url, payload, tokenKey))
    words_dict = []
    section_text = ''
    try:
        for index, phrase in enumerate(result):
            # print(f"({phrase['Word']},{phrase['Tags']['NER']['Item1']}) ")
            pos = phrase['Tags']['POS']['Item1']
            ner_tag = phrase['Tags']['NER']['Item1']
            word = phrase['Word']
            words_dict.append({
                "index"          : str(index),
                "word"           : phrase['Word'],
                "WordComment"    : phrase['WordComment'],
                "StartCharIndex" : phrase['StartCharIndex'],
                "SimplePos"      : phrase['SimplePos'],
                "Pos"            : pos,
                "Ner"            : ner_tag,
                "WordCount"      : phrase['WordCount'],
                "Length"         : phrase['Length'],
                "IsPunc"         : phrase['IsPunc'],
                "IsVerb"         : phrase['IsVerb'],
                "VerbInformation": phrase['VerbInformation']
            })
            section_text += word + " " + ner_tag + "\n"
    except Exception:
        # On failure the service is expected to return an error object
        # (a dict with a "status" field) instead of a token list.
        if isinstance(result, dict) and result.get("status") == 400:
            print("\n\nfailed! Error 400 ...")
            exit()
    return section_text, words_dict
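
# Usage sketch (illustrative values, not executed): api_ner_task returns the
# section's "word NER-tag" lines plus a per-token info list, e.g.:
#
#     text, tokens = api_ner_task("تهران پایتخت ایران است")
#     # text   -> "تهران B-LOC\nپایتخت O\n..."
#     # tokens -> [{"index": "0", "word": "تهران", "Ner": "B-LOC", ...}, ...]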


############################ Call NER ###############################
url = baseUrl + "NamedEntityRecognition/Detect"

final_ner_dataset = ''
sections_ner_dict = []

sections_85_address = base_address + "/main_qa_data/data/qa_85_sections.json"
sections_15k_address = base_address + "/main_qa_data/data/sections.json"
sections_85 = read_from_json(sections_85_address)
sections_15K = read_file_by_address(sections_15k_address)

sections_15K = sections_15K.splitlines()
sections_15K_list = []

for item in sections_15K:
    section = json.loads(item)
    sections_15K_list.append({
        "id"     : section["id"],
        "content": section["content"]
    })
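
# Assumed input format (implied by the splitlines()/json.loads() parsing above):
# sections.json holds one JSON object per line, each with at least an "id" and
# a "content" field, e.g.:
#
#     {"id": 1, "content": "<section text>"}
#     {"id": 2, "content": "<section text>"}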

sections_85_list = [item["id"] for item in sections_85]
current_sections = []

# Keep only the 85 selected sections, matched by id.
for section_id in sections_85_list:
    for item in sections_15K_list:
        if section_id == item["id"]:
            current_sections.append({
                "id"     : item["id"],
                "content": item["content"]
            })
            break

for i, item in enumerate(current_sections):
    time.sleep(3)  # throttle requests to the API
    content = normalize_content(item["content"])
    content = content.replace("\n", " ")
    current_section_dataset, current_section_dict = api_ner_task(content)

    final_ner_dataset += current_section_dataset + "\n\n"

    sections_ner_dict.append({
        "id"             : item["id"],
        "tokens_info"    : current_section_dict,
        "section_dataset": current_section_dataset
    })
    print(str(i) + "/" + str(len(current_sections)))


save_to_file_by_address(base_address + "/main_qa_data/data/dataset.txt", final_ner_dataset.strip())
write_to_json(sections_ner_dict, base_address + "/main_qa_data/data/dataset.json")

print(" *** Finished! *** ")