""" این کد برای انتخاب سکشن های اصلی که بند و عنوان و موخره و امضا نیستند و معتبر هستند و فقط از مصوبات مجلس هستند ایجاد شده است که شامل حدود 9 هزار سکشن می شود. """ from html import escape from lxml import etree from datetime import datetime from elasticsearch import Elasticsearch from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer from threading import Thread import torch import time from concurrent.futures import ThreadPoolExecutor import concurrent import threading import json import numpy as np from funcs import write_to_json, write_to_excel import os index_name_i = 'mj_qa_section-v02'# semantic_search-v10 es = Elasticsearch( "http://127.0.0.1:6900", basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm") ) counter = 0 total = 0 remained = 0 id = '' keywords_count = 15 body_query = "{'query': {'match_all': {}}}" def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs): """ Helper to iterate ALL values from a single index Yields all the documents. """ global counter global total is_first = True while True: # Scroll next if is_first: # Initialize scroll result = es.search( index=index, scroll="2m", **kwargs, size=pagesize, body={ "query": { "bool": { "must": [ { "bool": { "must_not": [ { "wildcard": { "other_info.full_path": "بند*" } }, { "match": { "other_info.full_path": "موخره" } }, { "match": { "other_info.full_path": "امضاء" } }, { "match": { "other_info.full_path": "عنوان" } } ] } }, { "bool": { "filter": { "bool": { "must": [ { "term": { "qanon_etebar": "معتبر" } }, { "term": { "title_type": "عادی" } }, { "term": { "ts_ref.keyword": "مجلس شورای اسلامی" } }, { "term": { "sub_type": "عادی" } } ] } } } } ] } }, "sort": { "sort_date_timestamp": { "order": "desc" } }, "track_total_hits": True, "aggs": { "total_collapse": { "cardinality": { "field": "qanon_id" } } } } ) total = result["hits"]["total"]["value"] print("total = %d" % total) is_first = False else: result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout) scroll_id = result["_scroll_id"] hits = result["hits"]["hits"] counter += len(hits) print("progress -> %.2f %%" % ((counter / total) * 100)) # Stop after no more docs if not hits: break # Yield each entry yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits) def add_section(section): data = ({ "id": section["id"], "qanon_id": section["qanon_id"], "content": section["content"], "main_topic": section["tcode_main_old"][0], "all_topics": section["tcode_main_old"], "ts_year": section["ts_year"], "state_etebar": section["state_etebar"], "ners": section["ners_v1"] }) return data if __name__ == "__main__": base_address = os.getcwd() # debugger #base_address = "/home/gpu/tnlp/jokar/llama" # terminal # json_address_15k_sections = base_address + "/data/qa_sections_15k.json" start_time = time.time() all_sections = es_iterate_all_documents(es, index_name_i) all_sections_arr = [] for mentry in all_sections: section_id = mentry["id"] source = mentry["source"] all_sections_arr.append([section_id, source]) np_sections_arr = np.array(all_sections_arr) selected_sections = [] index = -1 x = 0 try: np_topics = [] for i, dataline in enumerate(np_sections_arr): # if i == 813: # pass line = dataline[1] id = line['id'] content = line['content'] state_etebar = line["state_etebar"] if not state_etebar == "معتبر": continue if not content: continue if content.__contains__("به‌ شرح پیوست")\ or content.startswith("تبصره"): continue if content[len(content)-1] == ":": continue if len(content.split()) < 30 or len(content.split()) > 150: continue try: main_topic = line["tcode_main_old"][0]# tcode_main_old except: x += 1 continue if len(np_topics) > 0: try: # result = np.where(np_topics[:, 0] == main_topic)[0] np_topics2 = np.array(np_topics) result = np.where(np_topics2[:, 0] == main_topic)[0][0] except Exception as e: result = -1 if result != -1: repeat_factor = (np_topics[result])[1] if repeat_factor == 10: continue else: (np_topics[result])[1] += 1 selected_sections.append(add_section(line)) #np_topics.append([main_topic, repeat_factor+1]) #repeat_factor+1 else: np_topics.append([main_topic, 1]) #repeat_factor+1 selected_sections.append(add_section(line)) else: selected_sections.append(add_section(line)) np_topics.append([main_topic, 1]) print(np_topics) print(i+1) except Exception as inst: print(type(inst)) # the exception type print(inst.args) # arguments stored in .args print(inst) # __str__ allows args to be printed directly, # but may be overridden in exception subclasses print("Exception:=> %s -> %.2f " % (id , counter / total)) print(len(selected_sections)) path = "./data/selected_sections.json" # os.getcwd() + write_to_json(selected_sections, path) excel_path = "./data/selected_sections.xlsx" write_to_excel(selected_sections, excel_path) end_time = time.time() print(f"elapsed time: {end_time-start_time}") print(" *** finished! *** ")