236 lines
6.8 KiB
Python
236 lines
6.8 KiB
Python
"""
|
||
این کد برای اضافه کردن فیلد child_order
|
||
به داده های فایل سکشن های 15 هزارتایی اصلی ایجاد شده است.
|
||
"""
|
||
|
||
from html import escape
|
||
from lxml import etree
|
||
from datetime import datetime
|
||
from elasticsearch import Elasticsearch
|
||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
|
||
from threading import Thread
|
||
import torch
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
import concurrent
|
||
import threading
|
||
import json
|
||
import numpy as np
|
||
|
||
from funcs import write_to_json, read_from_json
|
||
import os
|
||
|
||
|
||
# Target Elasticsearch index holding the per-section QA documents.
index_name_i = 'mj_qa_section-v02'# semantic_search-v10


# NOTE(review): host and credentials are hard-coded — move to environment
# variables or a config file before sharing/deploying this script.
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

# Progress bookkeeping shared with es_iterate_all_documents() via `global`.
counter = 0   # documents fetched so far across all scroll pages
total = 0     # total matching documents, set by the initial search
remained = 0  # unused in this script — presumably a leftover; confirm before removing
id = ''       # last processed section id (NOTE: shadows the `id` builtin)
keywords_count = 15  # unused in this script — confirm before removing

# Unused match-all query string (NOTE: single-quoted, so not valid JSON).
body_query = "{'query': {'match_all': {}}}"
|
||
|
||
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """Yield ALL matching documents of *index*, one dict per hit.

    Uses the Elasticsearch scroll API: a first ``search`` opens a scroll
    context, then repeated ``scroll`` calls page through the results until
    an empty page is returned.

    The embedded query keeps only sections that
      * do NOT match "موخره" / "امضاء" / "عنوان" in ``other_info.full_path``,
      * and pass the term filters on ``qanon_etebar``, ``title_type``,
        ``ts_ref.keyword`` and ``sub_type``.

    Args:
        es: Elasticsearch client (anything exposing ``search``/``scroll``).
        index: name of the index to iterate.
        pagesize: number of hits fetched per scroll page.
        scroll_timeout: keep-alive for the scroll context between pages.
        **kwargs: forwarded verbatim to the initial ``es.search`` call.

    Yields:
        ``{"source": hit["_source"], "id": hit["_id"]}`` for every hit.

    Side effects:
        Updates the module-level ``counter`` and ``total`` globals used for
        progress reporting, and prints progress to stdout.
    """
    global counter
    global total
    scroll_id = None
    is_first = True
    while True:
        if is_first:  # initialize the scroll context
            result = es.search(
                index=index,
                # BUGFIX: the original hard-coded scroll="2m" here, silently
                # ignoring the scroll_timeout parameter for the first page.
                scroll=scroll_timeout,
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must": [
                                {
                                    "bool": {
                                        "must_not": [
                                            {"match": {"other_info.full_path": "موخره"}},
                                            {"match": {"other_info.full_path": "امضاء"}},
                                            {"match": {"other_info.full_path": "عنوان"}}
                                        ]
                                    }
                                },
                                {
                                    "bool": {
                                        "filter": {
                                            "bool": {
                                                "must": [
                                                    {"term": {"qanon_etebar": "معتبر"}},
                                                    {"term": {"title_type": "عادی"}},
                                                    {"term": {"ts_ref.keyword": "مجلس شورای اسلامی"}},
                                                    {"term": {"sub_type": "عادی"}}
                                                ]
                                            }
                                        }
                                    }
                                }
                            ]
                        }
                    },
                    "sort": {
                        "sort_date_timestamp": {"order": "desc"}
                    },
                    "track_total_hits": True,
                    "aggs": {
                        "total_collapse": {
                            "cardinality": {"field": "qanon_id"}
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        # BUGFIX: guard the progress report — the original raised
        # ZeroDivisionError when the index had no matching documents.
        if total:
            print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
    # Best effort: free the server-side scroll context instead of letting it
    # live until scroll_timeout expires.
    try:
        es.clear_scroll(scroll_id=scroll_id)
    except Exception:
        pass
|
||
|
||
|
||
def add_section(section):
    """Project an Elasticsearch section document onto the flat record schema.

    Args:
        section: a section ``_source`` dict; must contain the keys read below
            (``tcode_main_old`` is a non-empty list — its first entry becomes
            ``main_topic``, ``ners_v1`` is exposed as ``ners``).

    Returns:
        A new dict with the renamed/flattened fields.
    """
    topics = section["tcode_main_old"]
    return {
        "id": section["id"],
        "qanon_id": section["qanon_id"],
        "content": section["content"],
        "main_topic": topics[0],
        "all_topics": topics,
        "ts_year": section["ts_year"],
        "state_etebar": section["state_etebar"],
        "ners": section["ners_v1"],
    }
|
||
|
||
if __name__ == "__main__":
|
||
|
||
start_time = time.time()
|
||
|
||
base_address = os.getcwd() # debugger
|
||
#base_address = "/home/gpu/tnlp/jokar/llama" # terminal
|
||
|
||
json_address_15k_sections = base_address + "/data/sections_15k.json"
|
||
|
||
data15k = read_from_json(json_address_15k_sections)
|
||
|
||
all_sections = es_iterate_all_documents(es, index_name_i)
|
||
|
||
all_sections_arr = []
|
||
for mentry in all_sections:
|
||
section_id = mentry["id"]
|
||
source = mentry["source"]
|
||
all_sections_arr.append([section_id, source])
|
||
# انتقال داده های الستیک به یک لیست نامپای برای سرعت بیشتر جستجو در داده ها
|
||
np_sections_arr = np.array(all_sections_arr)
|
||
|
||
selected_sections = []
|
||
index = -1
|
||
x = 0
|
||
try:
|
||
|
||
for i, line in enumerate(data15k):
|
||
# if i == 813:
|
||
# pass
|
||
id = line['id']
|
||
law_id = line["law_id"]
|
||
content = line['content']
|
||
|
||
try:
|
||
# جستجوی شناسه سکشن جاری در داده الستیک
|
||
foun_item = np.where(np_sections_arr[:, 0] == id)[0][0]
|
||
local_id = np_sections_arr[foun_item][1]["id"]
|
||
if not local_id == id:
|
||
pass
|
||
# دریافت فیلد html
|
||
# اگر این فیلد پر باشد، به این معناست که سکشن جاری، دارای جدول است و نباید به فایل اصلی اضافه شود
|
||
html = np_sections_arr[foun_item][1]["child_order"]
|
||
has_html = False
|
||
if html:
|
||
has_html = True
|
||
# پیدا کردن ترتیب اولویت برای این سکشن بر اساس داده الستیک
|
||
child_order = str(int(np_sections_arr[foun_item][1]["child_order"]))
|
||
selected_sections.append({
|
||
"id": id,
|
||
"law_id": law_id,
|
||
"content": content,
|
||
"child_order": child_order,
|
||
"has_html": has_html
|
||
})
|
||
except Exception as e:
|
||
result = -1
|
||
|
||
print(i+1)
|
||
|
||
|
||
|
||
except Exception as inst:
|
||
print(type(inst)) # the exception type
|
||
print(inst.args) # arguments stored in .args
|
||
print(inst) # __str__ allows args to be printed directly,
|
||
# but may be overridden in exception subclasses
|
||
print("Exception:=> %s -> %.2f " % (id , counter / total))
|
||
|
||
print(len(selected_sections))
|
||
path = "./data/main_sections_15k.json" # os.getcwd() +
|
||
write_to_json(selected_sections, path)
|
||
|
||
end_time = time.time()
|
||
print(f"elapsed time: {end_time-start_time} seconds.")
|
||
print(" *** finished! *** ")
|