"""
|
|
این فایل با نرمالایزر پارسیور کار می کند
|
|
"""
|
|
from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
from threading import Thread
import torch
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import threading
import json
import os.path
import os

from general_functions import normalize_content
from funcs import write_to_json, read_from_json

# lock = threading.Lock()
# lock1 = threading.Lock()
# from cleantext import clean
# import re

from normalizer import Normalizer
from tokenizer import *

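# Parsivar-style text normalizer (see the module docstring), with date normalization enabled.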
_normalizer = Normalizer(date_normalizing_needed=True)

address = os.getcwd()

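# Load the Dorna-Llama3-8B-Instruct model and tokenizer only when a GPU is available;
# the model is not used further in this script, so CPU-only runs simply skip it.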
if torch.cuda.is_available():
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     torch_dtype=torch.float16,
    #     device_map="auto",
    # )

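# Elasticsearch index that holds the per-section documents to be exported.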
index_name_i = 'mj_qa_section-v02'

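# Client for the local Elasticsearch cluster; the port and basic-auth credentials
# are specific to this deployment.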
es = Elasticsearch(
    "http://127.0.0.1:6900",
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

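# Progress counters shared with es_iterate_all_documents() via `global`.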
counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15

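# The iterator below pages through the whole index with the Elasticsearch scroll API:
# the first es.search() call opens a scroll context of `pagesize` hits, and each
# subsequent es.scroll() call fetches the next page until no hits remain.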
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Helper to iterate over ALL documents of a single index.
    Yields every document as {"source": ..., "id": ...}.
    """
    global counter
    global total
    global remained
    is_first = True

    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            # result = es.search(index=index, scroll="12m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(index=index, scroll=scroll_timeout, **kwargs, size=pagesize)
            total = result["hits"]["total"]['value']
            remained = total
            print('total = %d' % total)
            is_first = False
        else:
            # result = es.scroll(body={
            #     "scroll_id": scroll_id,
            #     "scroll": scroll_timeout
            # })
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %% , count: %d" % ((counter / total) * 100, counter))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)

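# Export every section document from the index into one JSON file. The _source
# field names read below are assumed to match the keys written to sections_all.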
sections_all = []
for mentry in es_iterate_all_documents(es, index_name_i):
    entry = mentry['source']
    id = mentry['id']    # Elasticsearch document _id
    qid = entry['id']    # id of the record this section belongs to

    # These fields are assumed to be present in each document's _source.
    content = entry['content']
    level = entry['level']
    child_order = entry['child-order']
    parent = entry['parent-id']
    ners = entry['ners']

    sections_all.append({
        "id": id,
        "q-id": qid,
        "content": content,
        "level": level,
        "child-order": child_order,
        "parent-id": parent,
        "ners": ners,
    })

write_to_json(sections_all, "/data/sections_all.json")

print(" Finished!!! ")