236 lines
6.8 KiB
Python
236 lines
6.8 KiB
Python
|
"""
|
|||
|
این کد برای اضافه کردن فیلد child_order
|
|||
|
به داده های فایل سکشن های 15 هزارتایی اصلی ایجاد شده است.
|
|||
|
"""
|
|||
|
|
|||
|
from html import escape
|
|||
|
from lxml import etree
|
|||
|
from datetime import datetime
|
|||
|
from elasticsearch import Elasticsearch
|
|||
|
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
|
|||
|
from threading import Thread
|
|||
|
import torch
|
|||
|
import time
|
|||
|
from concurrent.futures import ThreadPoolExecutor
|
|||
|
import concurrent
|
|||
|
import threading
|
|||
|
import json
|
|||
|
import numpy as np
|
|||
|
|
|||
|
from funcs import write_to_json, read_from_json
|
|||
|
import os
|
|||
|
|
|||
|
|
|||
|
# Elasticsearch index holding the QA section documents to scan.
index_name_i = 'mj_qa_section-v02'# semantic_search-v10


# NOTE(review): credentials are hardcoded in source — move them to an
# environment variable or a config file outside version control.
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

# Number of documents fetched so far; mutated by es_iterate_all_documents
# and read by __main__'s exception handler for progress reporting.
counter = 0
# Total hit count reported by the initial scroll search.
total = 0
remained = 0
# Holds the id of the section currently being processed in __main__.
# NOTE(review): shadows the builtin `id`; renaming would touch several
# call sites, so it is only flagged here.
id = ''
keywords_count = 15

# NOTE(review): appears unused in this file, and it is a *string* that looks
# like a query dict — verify before passing it to any search call.
body_query = "{'query': {'match_all': {}}}"
|
|||
|
|
|||
|
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """Iterate over ALL matching documents of ``index`` via the scroll API.

    Yields ``{"source": <_source>, "id": <_id>}`` for every hit of the
    hard-coded filter below (documents whose ``other_info.full_path`` is not
    one of the excluded section kinds, and that pass the four ``term``
    filters), sorted by ``sort_date_timestamp`` descending.

    Parameters
    ----------
    es : Elasticsearch
        Connected client.
    index : str
        Name of the index to scan.
    pagesize : int
        Hits fetched per scroll page.
    scroll_timeout : str
        Keep-alive window for the scroll context, e.g. ``"25m"``.
    **kwargs
        Extra keyword arguments forwarded to the initial ``es.search`` call.

    Side effects: updates the module-level ``counter`` and ``total`` globals
    (read by ``__main__`` for progress reporting) and prints progress lines.
    """
    global counter
    global total
    is_first = True
    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            result = es.search(
                index=index,
                # BUGFIX: the original hard-coded scroll="2m" here, silently
                # ignoring the scroll_timeout parameter for the first page.
                scroll=scroll_timeout,
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must": [
                                {
                                    "bool": {
                                        "must_not": [
                                            {
                                                "match": {
                                                    "other_info.full_path": "موخره"
                                                }
                                            },
                                            {
                                                "match": {
                                                    "other_info.full_path": "امضاء"
                                                }
                                            },
                                            {
                                                "match": {
                                                    "other_info.full_path": "عنوان"
                                                }
                                            }
                                        ]
                                    }
                                },
                                {
                                    "bool": {
                                        "filter": {
                                            "bool": {
                                                "must": [
                                                    {
                                                        "term": {
                                                            "qanon_etebar": "معتبر"
                                                        }
                                                    },
                                                    {
                                                        "term": {
                                                            "title_type": "عادی"
                                                        }
                                                    },
                                                    {
                                                        "term": {
                                                            "ts_ref.keyword": "مجلس شورای اسلامی"
                                                        }
                                                    },
                                                    {
                                                        "term": {
                                                            "sub_type": "عادی"
                                                        }
                                                    }
                                                ]
                                            }
                                        }
                                    }
                                }
                            ]
                        }
                    },
                    "sort": {
                        "sort_date_timestamp": {
                            "order": "desc"
                        }
                    },
                    "track_total_hits": True,
                    "aggs": {
                        "total_collapse": {
                            "cardinality": {
                                "field": "qanon_id"
                            }
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        # BUGFIX: guard against an empty index, which previously raised
        # ZeroDivisionError on the progress line.
        if total:
            print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
|
|||
|
|
|||
|
|
|||
|
def add_section(section):
    """Project an Elasticsearch section document onto the flat dict shape
    used downstream (id, law id, content, topics, year, validity, NERs).

    The first entry of ``tcode_main_old`` is exposed as ``main_topic``;
    the full list is kept under ``all_topics``.
    """
    topics = section["tcode_main_old"]
    return {
        "id": section["id"],
        "qanon_id": section["qanon_id"],
        "content": section["content"],
        "main_topic": topics[0],
        "all_topics": topics,
        "ts_year": section["ts_year"],
        "state_etebar": section["state_etebar"],
        "ners": section["ners_v1"],
    }
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Driver: enrich the 15k-section JSON file with the `child_order` field
    # taken from the Elasticsearch index, and write the result back to disk.

    start_time = time.time()

    base_address = os.getcwd()  # debugger
    #base_address = "/home/gpu/tnlp/jokar/llama" # terminal

    json_address_15k_sections = base_address + "/data/sections_15k.json"

    data15k = read_from_json(json_address_15k_sections)

    # Pull every matching section from Elasticsearch (generator).
    all_sections = es_iterate_all_documents(es, index_name_i)

    all_sections_arr = []
    for mentry in all_sections:
        section_id = mentry["id"]
        source = mentry["source"]
        all_sections_arr.append([section_id, source])
    # Move the Elasticsearch rows into a numpy array for faster id lookups.
    # NOTE(review): this is an object array of [id, source-dict] pairs, so
    # np.where below still scans linearly; a plain dict keyed by id would be
    # both simpler and O(1) — left unchanged here.
    np_sections_arr = np.array(all_sections_arr)

    selected_sections = []
    index = -1
    x = 0
    try:

        for i, line in enumerate(data15k):
            # if i == 813:
            #     pass
            id = line['id']
            law_id = line["law_id"]
            content = line['content']

            try:
                # Look up the current section's id in the Elasticsearch data;
                # raises IndexError (caught below) when the id is absent.
                foun_item = np.where(np_sections_arr[:, 0] == id)[0][0]
                local_id = np_sections_arr[foun_item][1]["id"]
                if not local_id == id:
                    pass
                # Fetch the html field: if it is non-empty the section contains
                # a table and must not be added to the main file.
                # NOTE(review): the code actually reads "child_order" here, not
                # an "html" field as the comment claims — so has_html merely
                # mirrors child_order being truthy. Suspected bug; confirm the
                # intended field name against the index mapping.
                html = np_sections_arr[foun_item][1]["child_order"]
                has_html = False
                if html:
                    has_html = True
                # Find the ordering priority for this section from the
                # Elasticsearch data.
                child_order = str(int(np_sections_arr[foun_item][1]["child_order"]))
                selected_sections.append({
                    "id": id,
                    "law_id": law_id,
                    "content": content,
                    "child_order": child_order,
                    "has_html": has_html
                })
            except Exception as e:
                # NOTE(review): silently skips any section whose id is missing
                # or whose child_order is not int-convertible; nothing is
                # logged, only `result = -1` is set (and never read).
                result = -1

            print(i+1)

    except Exception as inst:
        print(type(inst))  # the exception type
        print(inst.args)  # arguments stored in .args
        print(inst)  # __str__ allows args to be printed directly,
        # but may be overridden in exception subclasses
        print("Exception:=> %s -> %.2f " % (id, counter / total))

    print(len(selected_sections))
    path = "./data/main_sections_15k.json"  # os.getcwd() +
    write_to_json(selected_sections, path)

    end_time = time.time()
    print(f"elapsed time: {end_time-start_time} seconds.")
    print(" *** finished! *** ")
|