"""
|
|
https://api.text-mining.ir/index.html
|
|
O : عدم وجود موجودیت
|
|
B-PER : شروع موجودیت شخص یا نام فرد
|
|
I-PER : ادامه موجودیت شخص یا نام فرد
|
|
B-LOC : شروع موجودیت مکان یا محل خاص
|
|
I-LOC : ادامه موجودیت مکان یا محل خاص
|
|
B-ORG : شروع موجودیت نام سازمان یا تشکل
|
|
I-ORG : ادامه موجودیت نام سازمان یا تشکل
|
|
B-DAT : شروع موجودیت تاریخ یا زمان
|
|
I-DAT : ادامه موجودیت تاریخ یا زمان
|
|
B-EVE : شروع موجودیت رویداد یا حادثه
|
|
I-EVE : ادامه موجودیت رویداد یا حادثه
|
|
"""

import json
import os
import time

import requests

from funcs import write_to_json, save_to_file_by_address, read_from_json, read_file_by_address
from general_functions import normalize_content


base_address = os.getcwd()

# k_address = base_address + "/main_qa_data/data/dataset.txt"
# data = read_file_by_address(k_address)
# sentences = data.split("\n\n")


##################### Get Token by Api Key ##########################
api_key = "08938eac-f648-ef11-af5f-00163e6496fc"  # replace with your own API key

baseUrl = "http://api.text-mining.ir/api/"
url = baseUrl + "Token/GetToken"
querystring = {"apikey": api_key}
response = requests.get(url, params=querystring)
data = json.loads(response.text)
tokenKey = data['token']
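
# Sketch (assumption): the GetToken error payload is not documented here, so a
# minimal guard could rely only on the HTTP status code and the presence of
# the 'token' field, e.g.:
#
#     if response.status_code != 200 or "token" not in data:
#         raise RuntimeError("GetToken failed: " + response.text)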


try:
    # Fix UTF-8 output issues on the Windows console.
    # Does nothing if the package is not installed.
    from win_unicode_console import enable
    enable()
except ImportError:
    pass


def utfReverse(s):
    # Reverse a UTF-8 encoded byte string character by character.
    # CAVEAT: this may mess up characters that are more than 2 bytes long in UTF-16.
    u = s.decode("utf-8")
    return u[::-1].encode("utf-8")


def callApi(url, data, tokenKey):
    headers = {
        'Content-Type': "application/json",
        'Authorization': "Bearer " + tokenKey,
        'Cache-Control': "no-cache"
    }
    response = requests.post(url, data=data.encode("utf-8"), headers=headers)
    result = response.text  # utfReverse(response.text.encode("utf-8"))
    response.close()
    return result
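
# Usage sketch (hypothetical, not executed): callApi posts a JSON-encoded string
# body and returns the raw response text, which callers parse with json.loads(),
# assuming `url` points at an analysis endpoint such as the NER one set below:
#
#     parsed = json.loads(callApi(url, u'"متن نمونه"', tokenKey))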


def api_ner_task(section):
    payload = u'"{0}"'.format(section)
    result = json.loads(callApi(url, payload, tokenKey))
    words_dict = []
    section_text = ''
    try:
        for index, phrase in enumerate(result):
            # print(f"({phrase['Word']},{phrase['Tags']['NER']['Item1']}) ")
            pos = phrase['Tags']['POS']['Item1']
            ner_tag = phrase['Tags']['NER']['Item1']
            word = phrase['Word']
            words_dict.append({
                "index"          : str(index),
                "word"           : phrase['Word'],
                "WordComment"    : phrase['WordComment'],
                "StartCharIndex" : phrase['StartCharIndex'],
                "SimplePos"      : phrase['SimplePos'],
                "Pos"            : pos,
                "Ner"            : ner_tag,
                "WordCount"      : phrase['WordCount'],
                "Length"         : phrase['Length'],
                "IsPunc"         : phrase['IsPunc'],
                "IsVerb"         : phrase['IsVerb'],
                "VerbInformation": phrase['VerbInformation']
            })
            section_text += word + " " + ner_tag + "\n"
    except Exception:
        # On failure the service is expected to return an error object
        # (a dict with a "status" field) instead of a token list.
        if isinstance(result, dict) and result.get("status") == 400:
            print("\n\nfailed! Error 400 ...")
            exit()
    return section_text, words_dict
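
# Usage sketch (illustrative values, not executed): api_ner_task returns the
# section's "word NER-tag" lines plus a per-token info list, e.g.:
#
#     text, tokens = api_ner_task("تهران پایتخت ایران است")
#     # text   -> "تهران B-LOC\nپایتخت O\n..."
#     # tokens -> [{"index": "0", "word": "تهران", "Ner": "B-LOC", ...}, ...]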


############################ Call NER ###############################
url = baseUrl + "NamedEntityRecognition/Detect"

final_ner_dataset = ''
sections_ner_dict = []

sections_85_address = base_address + "/main_qa_data/data/qa_85_sections.json"
sections_15k_address = base_address + "/main_qa_data/data/sections.json"
sections_85 = read_from_json(sections_85_address)
sections_15K = read_file_by_address(sections_15k_address)

sections_15K = sections_15K.splitlines()
sections_15K_list = []

for item in sections_15K:
    section = json.loads(item)
    sections_15K_list.append({
        "id"     : section["id"],
        "content": section["content"]
    })
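
# Assumed input format (implied by the splitlines()/json.loads() parsing above):
# sections.json holds one JSON object per line, each with at least an "id" and
# a "content" field, e.g.:
#
#     {"id": 1, "content": "<section text>"}
#     {"id": 2, "content": "<section text>"}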

sections_85_list = [item["id"] for item in sections_85]
current_sections = []

# Keep only the 85 selected sections, matched by id.
for section_id in sections_85_list:
    for item in sections_15K_list:
        if section_id == item["id"]:
            current_sections.append({
                "id"     : item["id"],
                "content": item["content"]
            })
            break

for i, item in enumerate(current_sections):
    time.sleep(3)  # throttle requests to the API
    content = normalize_content(item["content"])
    content = content.replace("\n", " ")
    current_section_dataset, current_section_dict = api_ner_task(content)

    final_ner_dataset += current_section_dataset + "\n\n"

    sections_ner_dict.append({
        "id"             : item["id"],
        "tokens_info"    : current_section_dict,
        "section_dataset": current_section_dataset
    })
    print(str(i) + "/" + str(len(current_sections)))


save_to_file_by_address(base_address + "/main_qa_data/data/dataset.txt", final_ner_dataset.strip())
write_to_json(sections_ner_dict, base_address + "/main_qa_data/data/dataset.json")

print(" *** Finished! *** ")