236 lines
6.8 KiB
Python
236 lines
6.8 KiB
Python
"""
|
||
این کد برای اضافه کردن فیلد child_order
|
||
به داده های فایل سکشن های 15 هزارتایی اصلی ایجاد شده است.
|
||
"""
|
||
|
||
from html import escape
|
||
from lxml import etree
|
||
from datetime import datetime
|
||
from elasticsearch import Elasticsearch
|
||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
|
||
from threading import Thread
|
||
import torch
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
import concurrent
|
||
import threading
|
||
import json
|
||
import numpy as np
|
||
|
||
from funcs import write_to_json, read_from_json
|
||
import os
|
||
|
||
|
||
# Target Elasticsearch index holding the per-section QA documents.
index_name_i = 'mj_qa_section-v02'# semantic_search-v10


# NOTE(review): host and credentials are hard-coded — move to environment
# variables or a config file before sharing/deploying this script.
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

# Progress bookkeeping shared with es_iterate_all_documents() via `global`.
counter = 0   # documents fetched so far across all scroll pages
total = 0     # total matching documents, set by the initial search
remained = 0  # unused in this script — presumably a leftover; confirm before removing
id = ''       # last processed section id (NOTE: shadows the `id` builtin)
keywords_count = 15  # unused in this script — confirm before removing

# Unused match-all query string (NOTE: single-quoted, so not valid JSON).
body_query = "{'query': {'match_all': {}}}"
|
||
|
||
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """Yield ALL matching documents of *index*, one dict per hit.

    Uses the Elasticsearch scroll API: a first ``search`` opens a scroll
    context, then repeated ``scroll`` calls page through the results until
    an empty page is returned.

    The embedded query keeps only sections that
      * do NOT match "موخره" / "امضاء" / "عنوان" in ``other_info.full_path``,
      * and pass the term filters on ``qanon_etebar``, ``title_type``,
        ``ts_ref.keyword`` and ``sub_type``.

    Args:
        es: Elasticsearch client (anything exposing ``search``/``scroll``).
        index: name of the index to iterate.
        pagesize: number of hits fetched per scroll page.
        scroll_timeout: keep-alive for the scroll context between pages.
        **kwargs: forwarded verbatim to the initial ``es.search`` call.

    Yields:
        ``{"source": hit["_source"], "id": hit["_id"]}`` for every hit.

    Side effects:
        Updates the module-level ``counter`` and ``total`` globals used for
        progress reporting, and prints progress to stdout.
    """
    global counter
    global total
    scroll_id = None
    is_first = True
    while True:
        if is_first:  # initialize the scroll context
            result = es.search(
                index=index,
                # BUGFIX: the original hard-coded scroll="2m" here, silently
                # ignoring the scroll_timeout parameter for the first page.
                scroll=scroll_timeout,
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must": [
                                {
                                    "bool": {
                                        "must_not": [
                                            {"match": {"other_info.full_path": "موخره"}},
                                            {"match": {"other_info.full_path": "امضاء"}},
                                            {"match": {"other_info.full_path": "عنوان"}}
                                        ]
                                    }
                                },
                                {
                                    "bool": {
                                        "filter": {
                                            "bool": {
                                                "must": [
                                                    {"term": {"qanon_etebar": "معتبر"}},
                                                    {"term": {"title_type": "عادی"}},
                                                    {"term": {"ts_ref.keyword": "مجلس شورای اسلامی"}},
                                                    {"term": {"sub_type": "عادی"}}
                                                ]
                                            }
                                        }
                                    }
                                }
                            ]
                        }
                    },
                    "sort": {
                        "sort_date_timestamp": {"order": "desc"}
                    },
                    "track_total_hits": True,
                    "aggs": {
                        "total_collapse": {
                            "cardinality": {"field": "qanon_id"}
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        # BUGFIX: guard the progress report — the original raised
        # ZeroDivisionError when the index had no matching documents.
        if total:
            print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
    # Best effort: free the server-side scroll context instead of letting it
    # live until scroll_timeout expires.
    try:
        es.clear_scroll(scroll_id=scroll_id)
    except Exception:
        pass
|
||
|
||
|
||
def add_section(section):
    """Project an Elasticsearch section document onto the flat record schema.

    Args:
        section: a section ``_source`` dict; must contain the keys read below
            (``tcode_main_old`` is a non-empty list — its first entry becomes
            ``main_topic``, ``ners_v1`` is exposed as ``ners``).

    Returns:
        A new dict with the renamed/flattened fields.
    """
    topics = section["tcode_main_old"]
    return {
        "id": section["id"],
        "qanon_id": section["qanon_id"],
        "content": section["content"],
        "main_topic": topics[0],
        "all_topics": topics,
        "ts_year": section["ts_year"],
        "state_etebar": section["state_etebar"],
        "ners": section["ners_v1"],
    }
|
||
|
||
if __name__ == "__main__":
|
||
|
||
start_time = time.time()
|
||
|
||
base_address = os.getcwd() # debugger
|
||
#base_address = "/home/gpu/tnlp/jokar/llama" # terminal
|
||
|
||
json_address_15k_sections = base_address + "/data/sections_15k.json"
|
||
|
||
data15k = read_from_json(json_address_15k_sections)
|
||
|
||
all_sections = es_iterate_all_documents(es, index_name_i)
|
||
|
||
all_sections_arr = []
|
||
for mentry in all_sections:
|
||
section_id = mentry["id"]
|
||
source = mentry["source"]
|
||
all_sections_arr.append([section_id, source])
|
||
# انتقال داده های الستیک به یک لیست نامپای برای سرعت بیشتر جستجو در داده ها
|
||
np_sections_arr = np.array(all_sections_arr)
|
||
|
||
selected_sections = []
|
||
index = -1
|
||
x = 0
|
||
try:
|
||
|
||
for i, line in enumerate(data15k):
|
||
# if i == 813:
|
||
# pass
|
||
id = line['id']
|
||
law_id = line["law_id"]
|
||
content = line['content']
|
||
|
||
try:
|
||
# جستجوی شناسه سکشن جاری در داده الستیک
|
||
foun_item = np.where(np_sections_arr[:, 0] == id)[0][0]
|
||
local_id = np_sections_arr[foun_item][1]["id"]
|
||
if not local_id == id:
|
||
pass
|
||
# دریافت فیلد html
|
||
# اگر این فیلد پر باشد، به این معناست که سکشن جاری، دارای جدول است و نباید به فایل اصلی اضافه شود
|
||
html = np_sections_arr[foun_item][1]["child_order"]
|
||
has_html = False
|
||
if html:
|
||
has_html = True
|
||
# پیدا کردن ترتیب اولویت برای این سکشن بر اساس داده الستیک
|
||
child_order = str(int(np_sections_arr[foun_item][1]["child_order"]))
|
||
selected_sections.append({
|
||
"id": id,
|
||
"law_id": law_id,
|
||
"content": content,
|
||
"child_order": child_order,
|
||
"has_html": has_html
|
||
})
|
||
except Exception as e:
|
||
result = -1
|
||
|
||
print(i+1)
|
||
|
||
|
||
|
||
except Exception as inst:
|
||
print(type(inst)) # the exception type
|
||
print(inst.args) # arguments stored in .args
|
||
print(inst) # __str__ allows args to be printed directly,
|
||
# but may be overridden in exception subclasses
|
||
print("Exception:=> %s -> %.2f " % (id , counter / total))
|
||
|
||
print(len(selected_sections))
|
||
path = "./data/main_sections_15k.json" # os.getcwd() +
|
||
write_to_json(selected_sections, path)
|
||
|
||
end_time = time.time()
|
||
print(f"elapsed time: {end_time-start_time} seconds.")
|
||
print(" *** finished! *** ")
|