# llama/test_elastic.py

#!/home/NLP/LLM/.env/bin/python
import os
from datetime import datetime
from html import escape

from elasticsearch import Elasticsearch
from lxml import etree

# Name of the index whose documents are scanned by this script.
index_name_i = 'semantic_search-v09'


# Connection settings. Each value can be overridden through the environment
# (ES_URL, ES_USER, ES_PASSWORD) so the script can target another cluster
# without editing the source; the defaults are the values used so far.
# NOTE(review): the fallback password is a credential committed to source
# control -- rotate it and remove the default once ES_PASSWORD is provisioned.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://127.0.0.1:6900"),
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "SG*7eGwg+KG2_*-1_mMm"),
    ),
)

# Progress state shared between es_iterate_all_documents() and the main loop.
counter = 0  # documents fetched so far
total = 0    # hit count reported by the first search response
id = ''      # id of the document currently being processed

def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Iterate over every document of a single index using the scroll API.

    Progress is printed after each page and mirrored in the module-level
    ``counter`` (documents fetched so far in this iteration) and ``total``
    (hit count reported by the first response), which callers may read to
    report progress themselves.

    Args:
        es: Client object providing ``search``, ``scroll`` and
            ``clear_scroll``.
        index: Name of the index to read.
        pagesize: Number of documents requested per page.
        scroll_timeout: How long the scroll context is kept alive between two
            requests (e.g. ``"12m"``); used for the initial search and for
            every follow-up scroll request.
        **kwargs: Extra keyword arguments forwarded to ``es.search``.

    Yields:
        dict: ``{"source": <hit "_source">, "id": <hit "_id">}`` for each hit.
    """
    global counter
    global total
    # Start from zero so the printed percentage relates to this query only.
    counter = 0
    scroll_id = None
    try:
        # The first page opens the scroll context.
        result = es.search(index=index, scroll=scroll_timeout, **kwargs, size=pagesize)
        total = result["hits"]["total"]['value']
        print('total = %d' % total)
        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # An empty index reports total == 0; avoid dividing by zero.
            percent = (counter / total) * 100 if total else 100.0
            print("progress -> %.2f %% , counte: %d" % (percent, counter))
            # Stop after no more docs
            if not hits:
                break
            yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the scroll context as soon as iteration ends, fails, or the
        # generator is closed early, instead of waiting for the timeout.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                # Best-effort cleanup: never mask the error that ended the scan.
                print("could not clear scroll context: %s" % exc)

# Walk every document of the index and print its id together with the overall
# progress ratio (documents fetched / total hits).
# An earlier revision (since disabled) also exported each document's title and
# content as JSON lines to ./elastic-dataset.jsonl.
doc_id = ''  # id of the last document seen; reported if the scan fails
try:
    for mentry in es_iterate_all_documents(es, index_name_i):
        doc_id = mentry['id']
        # `counter` and `total` are updated by es_iterate_all_documents().
        ratio = counter / total if total else 0.0
        print("%s -> %.2f " % (doc_id, ratio))

except Exception as inst:
    print(type(inst))    # the exception type
    print(inst.args)     # arguments stored in .args
    print(inst)          # __str__ allows args to be printed directly,
                         # but may be overridden in exception subclasses
    # `total` is still 0 when the failure happened before the first response
    # (e.g. connection refused); dividing by it would raise a
    # ZeroDivisionError here and hide the original error.
    ratio = counter / total if total else 0.0
    print("%s -> %.2f " % (doc_id, ratio))