# llama/elastic_keywords_llama3_1.py

import os
from datetime import datetime
from html import escape
from threading import Thread

import torch
from elasticsearch import Elasticsearch
from lxml import etree
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
#from cleantext import clean
#import re


# The rest of this script calls `model` and `tokenizer` for every document.
# Previously they were only defined when CUDA was present, so on a machine
# without a GPU the script scrolled the entire index and reported a NameError
# per document. Stop immediately with an explicit message instead.
if not torch.cuda.is_available():
    raise SystemExit(
        "CUDA is not available: the keyword-extraction model cannot be loaded."
    )

model_id = "PartAI/Dorna-Llama3-8B-Instruct"
# device_map="auto" lets the library decide where the weights are placed;
# the weights are loaded in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)


# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )


# Index that is both read (scrolled) and updated with the extracted keywords.
index_name_i = 'semantic_search-v10'


# Connection settings can be overridden through environment variables so that
# credentials do not have to live in the source tree.
# NOTE(review): the literal fallbacks below keep existing deployments working,
# but a password committed to source control should be rotated and the
# fallback removed once ELASTICSEARCH_PASSWORD is set everywhere.
es = Elasticsearch(
    os.environ.get("ELASTICSEARCH_URL", "http://127.0.0.1:6900"),
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=(
        os.environ.get("ELASTICSEARCH_USER", "elastic"),
        os.environ.get("ELASTICSEARCH_PASSWORD", "SG*7eGwg+KG2_*-1_mMm"),
    ),
)

# Progress state shared with es_iterate_all_documents via `global`.
counter = 0  # number of hits fetched so far
total = 0    # total hit count reported by the first search response
# NOTE: this name shadows the builtin id(); it is kept so that any code
# reading the module-level name keeps working.
id = ''
# Initial value for the number of keywords requested from the model.
keywords_count = 15

# Fixed conversation prefix sent before every per-document request.
# Turn 1 is a Persian instruction ("please answer only in Persian").
# Turn 2 is a worked example: a sample passage labelled "متن شماره 1"
# followed by the keywords extracted from it.
_LANGUAGE_INSTRUCTION = 'لطفا فقط فارسی جواب بدهید'

_WORKED_EXAMPLE = '''

"متن شماره 1" :
برای تأمین استقلال اقتصادی جامعه و ریشه ‌كن كردن فقر و محرومیت و برآوردن نیازهای انسان در جریان رشد، با حفظ آزادگی او، اقتصاد جمهوری اسلامی ایران بر اساس ضوابط زیر استوار می ‌شود:
- تنظیم برنامه اقتصادی كشور به صورتی كه شكل و محتوا و ساعات كار چنان باشد كه هر فرد علاوه بر تلاش شغلی ، فرصت و توان كافی برای خودسازی معنوی، سیاسی و اجتماعی و شركت فعال در رهبری كشور و افزایش مهارت و ابتكار داشته باشد.
- رعایت آزادی انتخاب شغل و عدم اجبار افراد به كاری معین و جلوگیری از بهره‌ كشی از كار دیگری.
- منع اضرار به غیر و انحصار و احتكار و ربا و دیگر معاملات باطل و حرام.

کلیدواژه های اصلی  در "متن شماره 1" عبارت اند از:

تأمین استقلال اقتصادی
فقر و محرومیت
برآوردن نیازهای انسان
حفظ آزادگی
برنامه اقتصادی كشور
خودسازی معنوی
آزادی انتخاب شغل
منع اضرار به غیر
منع احتکار
منع انحصار
منع ربا
منع معاملات باطل و حرام

'''


def _user_turn(content):
    """Wrap *content* as a chat message with the 'user' role."""
    return {"role": "user", "content": content}


messages = [
    _user_turn(_LANGUAGE_INSTRUCTION),
    _user_turn(_WORKED_EXAMPLE),
]
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Yield every document of a single index by paging with the scroll API.

    Args:
        es: client object exposing ``search``, ``scroll`` and ``clear_scroll``.
        index: name of the index to read.
        pagesize: number of hits requested per page (sent as ``size``).
        scroll_timeout: value sent as ``scroll`` on the initial search and on
            every follow-up scroll request.
        **kwargs: forwarded unchanged to ``es.search``.

    Yields:
        ``{"source": hit["_source"], "id": hit["_id"]}`` for each hit.

    Side effects:
        Resets and then updates the module-level ``counter`` (hits fetched so
        far) and ``total`` (hit count reported by the first response), prints
        progress after every page, and clears the scroll context when the
        iteration ends or the generator is closed.
    """
    global counter
    global total
    # Start from a clean slate so a second run does not report progress
    # relative to the counts of a previous one.
    counter = 0
    total = 0
    scroll_id = None
    try:
        # Initial request opens the scroll; honour scroll_timeout here too
        # (it used to be hard-coded to "12m" for this first call only).
        result = es.search(index=index, scroll=scroll_timeout, size=pagesize, **kwargs)
        total = result["hits"]["total"]['value']
        print('total = %d' % total)
        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # An empty index reports total == 0; avoid dividing by it.
            percent = (counter / total) * 100 if total else 0.0
            print("progress -> %.2f %% , counte: %d" % (percent, counter))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)
            # Scroll next
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context instead of leaving it open
        # until the timeout expires. Cleanup is best effort: a failure here
        # must not hide the outcome of the iteration itself.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                print("could not clear scroll context: %s" % exc)

# Values of other_info.full_path whose documents are skipped outright.
# The Persian literals translate to "title", "epilogue" and "signature".
_SKIPPED_SECTIONS = ('عنوان', 'موخره', 'امضاء')

# Per-document request appended after the fixed `messages` prefix. The first
# placeholder is the number of keywords to ask for, the second is the document
# text. The leading spaces on the last two lines are part of the prompt as it
# has always been sent and are preserved on purpose.
_KEYWORD_PROMPT = (
    'با توجه به کلیدواژه های استخراج شده از "متن شماره 1" از متن زیر حداقل {} کلیدواژه اصلی را استخراج کن به طوریکه تعداد توکن های هر کلیدواژه حداقل بین 1 تا 5 توکن باشد:\n'
    '                    {}\n'
    '                    '
)


def _progress_ratio():
    """Return counter / total, or 0.0 while total is still zero."""
    return counter / total if total else 0.0


def _keyword_quota(text_len):
    """Return how many keywords to request for a text of *text_len* characters.

    The quota is 15 keywords per 1000 characters, rounded down, with a floor
    of one. Returns None when the raw quota is below 0.3, in which case the
    document is skipped.
    """
    quota = (text_len / 1000) * 15
    if quota < 0.3:
        return None
    return max(int(quota), 1)


def _extract_keywords(text, quota, terminators):
    """Ask the model for at least *quota* keywords of *text*; return its reply."""
    request = {"role": "user", "content": _KEYWORD_PROMPT.format(quota, text)}
    # Build a fresh conversation instead of appending to the shared `messages`
    # list: if generation or the index update fails, nothing is left behind
    # to pollute the prompts of later documents.
    conversation = messages + [request]

    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.85,
    )
    # Drop the prompt tokens; keep only what the model generated.
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


# Id of the document currently being processed; reported if the run aborts.
doc_id = ''

try:
    # Generation settings that do not change between documents.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    for mentry in es_iterate_all_documents(es, index_name_i):
        entry = mentry['source']
        doc_id = mentry['id']
        print("%s -> %.2f " % (doc_id, _progress_ratio()))

        try:
            # Treat a missing or null field the same as an empty one.
            text = entry.get('content') or ''
            full_path = (entry.get('other_info') or {}).get('full_path', '')
            if not text or full_path in _SKIPPED_SECTIONS:
                continue

            quota = _keyword_quota(len(text))
            if quota is None:
                continue

            keywords = _extract_keywords(text, quota, terminators)
            es.update(
                index=index_name_i,
                id=doc_id,
                doc={"content_keywords-llama3-str": keywords},
            )
        except Exception as inst:
            # Best effort: report the failure and continue with the next document.
            print(type(inst))    # the exception type
            print(inst.args)     # arguments stored in .args
            print("Exception: " + str(inst))
            print("failed document id: %s" % doc_id)

except Exception as inst:
    print(type(inst))    # the exception type
    print(inst.args)     # arguments stored in .args
    print(inst)          # __str__ allows args to be printed directly,
                         # but may be overridden in exception subclasses
    # total is still 0 if the very first search failed, so use the guarded ratio.
    print("%s -> %.2f " % (doc_id, _progress_ratio()))