""" https://api.text-mining.ir/index.html O : عدم وجود موجودیت B-PER : شروع موجودیت شخص یا نام فرد I-PER : ادامه موجودیت شخص یا نام فرد B-LOC : شروع موجودیت مکان یا محل خاص I-LOC : ادامه موجودیت مکان یا محل خاص B-ORG : شروع موجودیت نام سازمان یا تشکل I-ORG : ادامه موجودیت نام سازمان یا تشکل B-DAT : شروع موجودیت تاریخ یا زمان I-DAT : ادامه موجودیت تاریخ یا زمان B-EVE : شروع موجودیت رویداد یا حادثه I-EVE : ادامه موجودیت رویداد یا حادثه """ from trace import Trace import requests import json from funcs import write_to_json, save_to_file_by_address, read_from_json, read_file_by_address from general_functions import normalize_content import os import time base_address = os.getcwd() # k_address = base_address + "/main_qa_data/data/dataset.txt" # data = read_file_by_address(k_address) # senteces = data.split("\n\n") ##################### Get Token by Api Key ########################## api_key = "08938eac-f648-ef11-af5f-00163e6496fc" baseUrl = "http://api.text-mining.ir/api/" url = baseUrl + "Token/GetToken" querystring = {"apikey":api_key} # replace YOUR_API_KEY response = requests.request("GET", url, params=querystring) data = json.loads(response.text) tokenKey = data['token'] # -*- coding: utf-8 -*- try: # Fix UTF8 output issues on Windows console. # Does nothing if package is not installed from win_unicode_console import enable enable() except ImportError: pass def utfReverse(s): # CAVEAT: this will mess up characters that are # more than 2 bytes long in utf 16 u = s.decode("utf-8") return u[::-1].encode("utf-8") def callApi(url, data, tokenKey): headers = { 'Content-Type': "application/json", 'Authorization': "Bearer " + tokenKey, 'Cache-Control': "no-cache" } response = requests.request("POST", url, data=data.encode("utf-8"), headers=headers) result = response.text # utfReverse(response.text.encode("utf-8")) response.close() return result def api_ner_task(section): payload = u'"{0}"'.format(section) result = json.loads(callApi(url, payload, tokenKey)) words_dict = [] section_text = '' try: for index, phrase in enumerate(result): #print(f"({phrase['Word']},{phrase['Tags']['NER']['Item1']}) ") pos = phrase['Tags']['POS']['Item1'] ner_tag = phrase['Tags']['NER']['Item1'] word = phrase['Word'] words_dict.append({ "index" : str(index), "word" : phrase['Word'], "WordComment" : phrase['WordComment'], "StartCharIndex" : phrase['StartCharIndex'], "SimplePos" : phrase['SimplePos'], "Pos" : pos, "Ner" : ner_tag, "WordCount" : phrase['WordCount'], "Length" : phrase['Length'], "IsPunc" : phrase['IsPunc'], "IsVerb" : phrase['IsVerb'], "VerbInformation": phrase['VerbInformation'] }) section_text += word + " " + ner_tag + "\n" except Exception as e: if result["status"] == 400: print("\n\nfailed! Error 400 ..."), exit() return section_text, words_dict ############################ Call NER ############################### url = baseUrl + "NamedEntityRecognition/Detect" finall_ner_dataset = '' sections_ner_dict = [] sections_85_address = base_address + "/main_qa_data/data/qa_85_sections.json" sections_15k_address = base_address + "/main_qa_data/data/sections.json" sections_85 = read_from_json(sections_85_address) sections_15K = read_file_by_address(sections_15k_address) sections_15K = sections_15K.splitlines() sections_15K_list = [] for item in sections_15K: section = json.loads(item) sections_15K_list.append({ "id" : section["id"], "content": section["content"] }) sections_85_list = [item["id"] for item in sections_85] current_sections = [] for id in sections_85_list: for item in sections_15K_list: if id == item["id"]: current_sections.append({ "id": item["id"], "content": item["content"] }) break for i, item in enumerate(current_sections): time.sleep(3) content = normalize_content(item["content"]) content = content.replace("\n", " ") current_section_dataset, current_section_dict = api_ner_task(content) finall_ner_dataset += current_section_dataset + "\n\n" sections_ner_dict.append({ "id" : item["id"], "tokens_info" : current_section_dict, "section_dataset": current_section_dataset }) print(str(i) + "/" + str(len(current_sections))) save_to_file_by_address(base_address + "/main_qa_data/data/dataset.txt", finall_ner_dataset.strip()) write_to_json(sections_ner_dict, base_address + "/main_qa_data/data/dataset.json") print(" *** Finished! *** ")