from elasticsearch7 import Elasticsearch
from collections import Counter
from general_functions import save_error, normalize_content
from funcs import save_to_file_by_address, read_file_by_address, write_to_json
import datetime
import os

# ##################################
# Works on the content of the law articles and provisions (from the
# Parliament's Deputy for Laws) that is stored in the Elastic index
# qanon_section-v02
# Analyzes some of the fields, extracts the dates in them, converts
# them to a specific timestamp format,
# and stores the result in the appropriate field of the same Elastic index.
# Note: access to Elasticsearch is required.
# ##################################


def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
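    """Iterate over all documents in `index` that still need processing.

    Pages through the index with the Elasticsearch scroll API, skipping
    documents that already carry an `nlp_parser.type` field, have an empty
    content, or are in parse_state 1 or 2. Updates the global `counter`
    and `total` to report progress, and yields {"source", "id"} dicts.
    """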
    global counter
    global total
    is_first = True
    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must_not": [
                                {"exists": {"field": "nlp_parser.type"}},
                                {"match": {"content_len": 0}},
                                {"match": {"parse_state": 1}},
                                {"match": {"parse_state": 2}}
                            ]
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)


def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs):
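    """Iterate over the documents of `index` whose `_id` is in `records`.

    Same scroll-based pagination as es_iterate_all_documents, but restricted
    to an explicit list of document ids via a `terms` query.
    """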
    global counter
    global total
    is_first = True
    query = {
        "query": {
            "terms": {
                "_id": records
            }
        }
    }
    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body=query
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)


def prepare_data(ner_obj_list):
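    """Flatten raw NER objects into {key, value, begin, end, score} dicts."""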
    ner_data_list = []
    for ner_obj in ner_obj_list:
        ner_data = {
            "key": ner_obj['ner_key'],
            "value": ner_obj['ner_value'],
            "begin": ner_obj['ner_start_token'],
            "end": ner_obj['ner_end_token'],
            "score": ner_obj['ner_score']
        }
        ner_data_list.append(ner_data)
    return ner_data_list


def relation_finder(all_orgs):
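    """Find, for every section, the other sections sharing an ORG entity.

    Compares each section's `ai_key` organisations against every other
    section (an O(n^2) pass over `all_orgs`) and records each match as a
    relation entry with the shared organisation key.
    """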
    new_orgs = []

    for index, section in enumerate(all_orgs):
        if index % 1000 == 0:
            print(f"relation finder progress: {(index / len(all_orgs)) * 100:.2f} %")

        related_sections = []
        orgs = section['ai_key']

        for org in orgs:
            for compare_item in all_orgs:
                compare_item_orgs = compare_item['ai_key']
                if section['section_id'] == compare_item['section_id']:
                    # prevent a section from being related to itself
                    continue
                if org in compare_item_orgs:
                    related_sections.append({
                        'section_id': compare_item['section_id'],
                        'qanon_id': compare_item['qanon_id'],
                        'ai_key': org,
                        'type': 'ORG',
                        'weight': 1,
                    })

        new_orgs.append({
            'id': section['section_id'],
            'qanon_id': section['qanon_id'],
            'ai_key': orgs,
            'relations': related_sections,
        })
    return new_orgs


def extract_weight(relation_list):
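    """Collapse duplicate relations to the same section into weighted entries.

    For each distinct `section_id` in `relation_list`, counts how many
    relations point at it (the `weight`), gathers all of their `ai_key`
    labels, and emits a single merged record per section.
    """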
    relation_list_temp = []
    for rel_item in relation_list:
        weight = 0
        rel_labels = []
        current_section_id = rel_item['section_id']
        for item in relation_list:
            if item['section_id'] == current_section_id:
                weight += 1
                rel_labels.append(item['ai_key'])

        # Append each section_id only once; the for/else runs when no
        # already-merged entry matches the current section_id.
        for rel_item2 in relation_list_temp:
            if current_section_id == rel_item2['section_id']:
                break
        else:
            relation_list_temp.append({
                "section_id": current_section_id,
                "qanon_id": rel_item['qanon_id'],
                "ai_key": rel_labels,
                "type": rel_item['type'],
                "weight": weight
            })
    return relation_list_temp


print(datetime.datetime.now())

index_name_i = "ai_mj_qa_section-v07"  # the Elastic instance running on the GPU server
index_name_o = "ai_mj_qa_section-v03"
is_update_state = False

mapping_o = None  # no explicit mapping; create the index with defaults

es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except Exception as e:
    print("elastic error:", e)

counter = 0
total = 0

novalid = -15000000000
all_orgs = []
all_orgs_text = ''
orgs_list = []

count = 0
section_list1 = es_iterate_all_documents(es, index_name_i)
for mentry1 in section_list1:
    count += 1
    id1 = mentry1["id"]

    if count % 100 == 0:
        # `total` is set by the generator when it runs its first search
        print(f"relation finder progress: {(count / total) * 100:.2f} %")

    entry1 = mentry1["source"]
    content1 = entry1.get("content", "")
    content_ai1 = entry1["content_ai"]
    content_len1 = entry1.get("content_len", "")
    qanon_id1 = entry1.get("qanon_id", "")
    ners1 = entry1['ners_v1']
    relations = entry1['relations']

    before = len(relations)
    relations = extract_weight(relations)
    after = len(relations)
    if before != after:
        print()
        print("id: " + id1 + " merged relations found! difference = " + str(before - after))
        print()

    data1 = {
        "qanon_id": qanon_id1,
        "content_ai": content_ai1,
        "ners_v1": ners1,
        "relations": relations
    }

    eid = id1

    try:
        if is_update_state:
            resp = es.update(index=index_name_o, id=eid, doc=data1)
        else:
            resp = es.index(index=index_name_o, id=eid, document=data1)
    except Exception as e:
        save_error(id1, e)

print(datetime.datetime.now())
print(" # # # relation update finished! # # # ")