# llama/make_all_data.py

"""
This file works with the Parsivar normalizer.
"""
from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
from threading import Thread
import torch
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import threading
import json
import os.path
import os
from general_functions import normalize_content
from funcs import write_to_json, read_from_json
#lock = threading.Lock()
#lock1 = threading.Lock()
#from cleantext import clean
#import re
from normalizer import Normalizer
from tokenizer import *
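# Parsivar-style normalizer, configured to also normalize dates in the text.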
_normalizer = Normalizer(date_normalizing_needed=True)
address = os.getcwd()
if torch.cuda.is_available():
    # Load the Dorna (Llama-3 8B Instruct) model and its tokenizer onto the GPU.
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     torch_dtype=torch.float16,
    #     device_map="auto",
    # )
index_name_i = 'mj_qa_section-v02'
es = Elasticsearch(
    "http://127.0.0.1:6900",
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm"),
)
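# Optional connectivity check (sketch): `es.ping()` returns True when the node
# configured above is reachable with these credentials.
#     if not es.ping():
#         raise RuntimeError("Elasticsearch node is not reachable")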
counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15
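# Module-level progress state: `total` and `remained` are set from the first scroll
# page and `counter` accumulates the number of documents seen so far; they are
# updated inside es_iterate_all_documents().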
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Helper to iterate over ALL documents of a single index.
    Yields each document as {"source": ..., "id": ...}.
    """
    global counter
    global total
    global remained
    is_first = True
    while True:
        # Fetch the next scroll page
        if is_first:  # Initialize the scroll
            # result = es.search(index=index, scroll="12m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(index=index, scroll=scroll_timeout, **kwargs, size=pagesize)
            total = result["hits"]["total"]["value"]
            remained = total
            print('total = %d' % total)
            is_first = False
        else:
            # result = es.scroll(body={
            #     "scroll_id": scroll_id,
            #     "scroll": scroll_timeout
            # })
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %% , count: %d" % ((counter / total) * 100, counter))
        # Stop when no more documents are returned
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)
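# Usage sketch: the **kwargs passthrough lets a caller narrow the scroll to a
# subset of documents. The `query=` keyword below assumes an elasticsearch-py
# client (7.15+ / 8.x) that accepts top-level search parameters as keyword
# arguments, and the "level" field is assumed to exist in the indexed sections.
#
#     for doc in es_iterate_all_documents(es, index_name_i, query={"match": {"level": 1}}):
#         print(doc["id"])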
sections_all = []
for mentry in es_iterate_all_documents(es, index_name_i):
    entry = mentry['source']
    # `id` is the Elasticsearch document id; `qid` is the section's own id field.
    id = mentry['id']
    qid = entry['id']
    # Section fields are read from the indexed document; the source field names
    # below are assumed to mirror the keys written out further down.
    content = entry['content']
    level = entry['level']
    child_order = entry['child-order']
    parent = entry['parent-id']
    ners = entry['ners']
    sections_all.append({
        "id": id,
        "q-id": qid,
        "content": content,
        "level": level,
        "child-order": child_order,
        "parent-id": parent,
        "ners": ners,
    })
write_to_json(sections_all, "/data/sections_all.json")
print("Finished!")