# llama/elastic_keywords.py

import os
from datetime import datetime
from html import escape

import hazm
from elasticsearch import Elasticsearch
from keybert import KeyBERT
from lxml import etree
from normalizer import cleaning
#from cleantext import clean
#import re


#def cleanhtml(raw_html):
#    cleanr = re.compile('<.*?>')
#    cleantext = re.sub(cleanr, '', raw_html)
#    return cleantext

#normalizer = hazm.Normalizer()
#wierd_pattern = re.compile("["
#        u"\U0001F600-\U0001F64F"  # emoticons
#        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#        u"\U0001F680-\U0001F6FF"  # transport & map symbols
#        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#        u"\U00002702-\U000027B0"
#        u"\U000024C2-\U0001F251"
#        u"\U0001f926-\U0001f937"
#        u'\U00010000-\U0010ffff'
#        u"\u200d"
#        u"\u2640-\u2642"
#        u"\u2600-\u2B55"
#        u"\u23cf"
#        u"\u23e9"
#        u"\u231a"
#        u"\u3030"
#        u"\ufe0f"
#        u"\u2069"
#        u"\u2066"
#        # u"\u200c"
#        u"\u2068"
#        u"\u2067"
#        "]+", flags=re.UNICODE)
#def cleaning(text):
#    text = text.strip()
#    text = clean(text,
#                 extra_spaces = True,
#                 lowercase = True
#                 )
#    # cleaning htmls
#    text = cleanhtml(text)
#
#    # normalizing
#    text = normalizer.normalize(text)
    # removing weird patterns
#    text = wierd_pattern.sub(r'', text)
    # removing extra spaces, hashtags
#    text = re.sub("#", "", text)
#    text = re.sub("\s+", " ", text)
#    return text


# Index whose documents are read and updated in place by this script.
index_name_i = 'semantic_search-v10'

# Path handed to KeyBERT as its embedding model.
model_id = "../../MLM/MODELS/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt"
kw_model = KeyBERT(model=model_id)

# SECURITY: the connection URL and credentials used to be hard-coded only.
# They can now be overridden through ES_URL / ES_USER / ES_PASSWORD; the
# literals below remain as fallbacks so existing runs keep working.
# TODO(review): rotate this password and drop the fallback from the source.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://127.0.0.1:6900"),
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "SG*7eGwg+KG2_*-1_mMm"),
    ),
)

# hazm stop words, passed through the same `cleaning` step as the imports
# provide, so they are in the cleaned form.
stop_words = [cleaning(element) for element in hazm.stopwords_list()]

# Progress state shared between es_iterate_all_documents and the main loop.
counter = 0   # number of hits fetched so far
total = 0     # total hit count reported by the first search response
id = ''       # id of the document currently being processed (shadows builtin;
              # kept because the main loop and its error handler use this name)
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Iterate over ALL documents of a single index using the scroll API.

    Args:
        es: Elasticsearch client; must provide ``search``, ``scroll`` and
            ``clear_scroll``.
        index: Name of the index to read.
        pagesize: Number of hits requested per page (sent as ``size``).
        scroll_timeout: Keep-alive for the scroll context. It is used for the
            initial search as well as for every follow-up scroll request.
        **kwargs: Extra keyword arguments forwarded to the initial
            ``es.search`` call.

    Yields:
        dict: ``{"source": <hit _source>, "id": <hit _id>}`` for every hit.

    Side effects:
        Updates the module-level ``counter`` (hits fetched so far, reset at
        the start of each iteration) and ``total`` (hit count reported by the
        first response), and prints progress after every page.
        The scroll context is cleared when iteration finishes, fails, or the
        generator is closed early.

    NOTE(review): ``total`` is taken from ``hits.total.value``; Elasticsearch
    may report a capped value unless ``track_total_hits`` is requested —
    confirm if the percentage looks wrong on large indices.
    """
    global counter
    global total
    counter = 0          # progress is per iteration, not cumulative across calls
    scroll_id = None
    try:
        # Initialize the scroll; honour the caller-supplied keep-alive.
        result = es.search(index=index, scroll=scroll_timeout, size=pagesize, **kwargs)
        total = result["hits"]["total"]['value']
        print('total = %d' % total)
        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # An empty index reports total == 0; avoid dividing by zero.
            percent = (counter / total) * 100 if total else 100.0
            print("progress -> %.2f %% , counte: %d" % (percent, counter))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)
            # Scroll next
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context instead of waiting for the
        # keep-alive to expire. Best effort: a failure here must not hide the
        # error (if any) that ended the iteration.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                print("failed to clear scroll context: %s" % exc)

# Walk every document of the index and write the keyword field back to it.
# A failure on a single document is reported and skipped; a failure of the
# iteration itself (e.g. the search/scroll request) ends the run.
try:
    for mentry in es_iterate_all_documents(es, index_name_i):
        entry = mentry['source']
        id = mentry['id']

        # Cleaned document text; read here but not used by the update below.
        text = entry.get('clean_content', '')
        # Fraction of documents fetched so far (0..1); guarded so that a zero
        # total can never raise here.
        print("%s -> %.2f " % (id, counter / total if total else 0.0))
        try:
            # NOTE(review): keywords is always the empty string, so every
            # document is updated with an empty value. The extraction step
            # appears to be missing — confirm before running on real data.
            keywords = ''
            es.update(
                index=index_name_i,
                id=id,
                doc={"content_keywords-llama3-str": str(keywords)},
            )
        except Exception as inst:
            print(type(inst))    # the exception type
            print(inst.args)     # arguments stored in .args
            print("Exception: " + str(inst))

except Exception as inst:
    print(type(inst))    # the exception type
    print(inst.args)     # arguments stored in .args
    print(inst)          # __str__ allows args to be printed directly,
                         # but may be overridden in exception subclasses
    # total is still 0 when the failure happened before the first response
    # was read (e.g. connection error); do not divide by zero while reporting.
    print("%s -> %.2f " % (id, counter / total if total else 0.0))