"""Export every document of an Elasticsearch index to a JSON file.

This file works with the Parsivar normalizer.

On execution the module scrolls through the ``mj_qa_section-v02`` index,
collects one record per hit and writes the resulting list to
``/data/sections_all.json``.
"""
from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
from threading import Thread
import torch
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import threading
import json
import os.path
import os
from general_functions import normalize_content
from funcs import write_to_json, read_from_json
from normalizer import Normalizer
from tokenizer import *

_normalizer = Normalizer(date_normalizing_needed=True)
address = os.getcwd()

# NOTE(review): the model and tokenizer are loaded but never used by the code
# in this file; the load is kept so the module's side effects stay unchanged.
if torch.cuda.is_available():
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.bfloat16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

index_name_i = 'mj_qa_section-v02'

# SECURITY NOTE(review): credentials are hard-coded in source. They are left
# untouched here to preserve behavior, but should be moved to configuration
# or environment variables and rotated.
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm"),
)

# Progress state shared with es_iterate_all_documents().
counter = 0
total = 0
remained = 0
id = ''  # kept for backward compatibility; note that it shadows the builtin
keywords_count = 15

# Fields copied from each hit's ``_source`` into the exported record, in the
# order they appear in the output dictionary.
_COPIED_FIELDS = ("content", "level", "child-order", "parent-id", "ners")


def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """Iterate over ALL documents of a single index using the scroll API.

    Updates the module-level ``counter``, ``total`` and ``remained`` globals
    and prints a progress line after every fetched page.

    Args:
        es: Elasticsearch client used for ``search``/``scroll`` calls.
        index: Name of the index to read.
        pagesize: Number of hits requested per page.
        scroll_timeout: Keep-alive of the scroll context; applied to the
            initial search as well as to every follow-up scroll request.
        **kwargs: Extra keyword arguments forwarded to ``es.search``.

    Yields:
        dict: ``{"source": <hit _source>, "id": <hit _id>}`` for every hit.
    """
    global counter
    global total
    global remained

    scroll_id = None
    try:
        # Open the scroll. The original hard-coded "12m" here, which silently
        # ignored the caller's ``scroll_timeout``.
        result = es.search(index=index, scroll=scroll_timeout, size=pagesize, **kwargs)
        # NOTE: unless ``track_total_hits=True`` is passed through kwargs,
        # Elasticsearch may report a lower bound here (capped at 10,000), in
        # which case the printed percentage can exceed 100.
        total = result["hits"]["total"]['value']
        remained = total
        print('total = %d' % total)

        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)

            # Guard against an empty index: total == 0 used to raise
            # ZeroDivisionError before the empty-page check was reached.
            percent = (counter / total) * 100 if total else 100.0
            print("progress -> %.2f %% , count: %d" % (percent, counter))

            # Stop after no more docs.
            if not hits:
                break

            # Yield each entry.
            yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)

            # Fetch the next page.
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context, also when the consumer stops
        # early or a request fails. Cleanup is best-effort: a failure here
        # must not mask the real outcome of the iteration.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                print("warning: could not clear scroll context: %s" % exc)


sections_all = []

for mentry in es_iterate_all_documents(es, index_name_i):
    entry = mentry['source']
    # The original assigned the hit's ``_id`` (mentry['id']) and then
    # immediately overwrote it with entry['id'], so both "id" and "q-id" end
    # up holding the source's own id. That outcome is preserved here.
    # NOTE(review): confirm whether "id" was meant to be the Elasticsearch _id.
    section_id = entry['id']
    qid = section_id

    record = {"id": section_id, "q-id": qid}
    # The original referenced the undefined names content / level /
    # child_order / parent / ners, raising NameError on the first document.
    # NOTE(review): assumes the ``_source`` fields are named exactly like the
    # output keys; missing fields become None. Confirm against the mapping.
    record.update((field, entry.get(field)) for field in _COPIED_FIELDS)
    sections_all.append(record)

write_to_json(sections_all, "/data/sections_all.json")

print(" Finished!!! ")