llama/test_elastic.py
2025-07-13 19:05:59 +03:30

94 lines
3.0 KiB
Python

#!/home/NLP/LLM/.env/bin/python
import os
from datetime import datetime
from html import escape

from elasticsearch import Elasticsearch
from lxml import etree
# Name of the Elasticsearch index whose documents are iterated below.
index_name_i = 'semantic_search-v09'

# SECURITY: the connection settings were previously available only as
# hard-coded literals. They can now be overridden through the environment
# (ES_URL / ES_USERNAME / ES_PASSWORD); the original literals remain as
# fallbacks so existing runs behave exactly as before.
# TODO: rotate this password and remove the literal fallback from the source.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://127.0.0.1:6900"),
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=(
        os.environ.get("ES_USERNAME", "elastic"),
        os.environ.get("ES_PASSWORD", "SG*7eGwg+KG2_*-1_mMm"),
    ),
)

# Progress state shared with es_iterate_all_documents() through `global`.
counter = 0  # number of hits fetched so far
total = 0  # total hit count reported by the first search response
# ID of the document currently being processed; printed when a failure occurs.
# NOTE: shadows the builtin `id`; the name is kept because the driver loop
# further down assigns to it.
id = ''
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Helper to iterate ALL values from a single index using the scroll API.

    Updates the module-level ``counter`` (hits fetched so far) and ``total``
    (hit count taken from the first response) and prints a progress line for
    every page fetched.

    Args:
        es: Elasticsearch client; ``search``, ``scroll`` and ``clear_scroll``
            are called on it.
        index: Name of the index to read.
        pagesize: Number of hits requested per page.
        scroll_timeout: Keep-alive for the scroll context. Applied to the
            initial search as well as to every follow-up scroll request.
        **kwargs: Extra keyword arguments forwarded to the initial
            ``es.search`` call.

    Yields:
        dict: ``{"source": <hit _source>, "id": <hit _id>}`` per document.
    """
    global counter
    global total
    is_first = True
    scroll_id = None
    try:
        while True:
            if is_first:
                # Initialize the scroll. The keep-alive honours the
                # `scroll_timeout` argument instead of a fixed "12m".
                result = es.search(
                    index=index, scroll=scroll_timeout, size=pagesize, **kwargs
                )
                total = result["hits"]["total"]['value']
                print('total = %d' % total)
                is_first = False
            else:
                # Fetch the next page of the open scroll.
                result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # An empty index reports total == 0; avoid dividing by zero.
            percent = (counter / total) * 100 if total else 0.0
            print("progress -> %.2f %% , counte: %d" % (percent, counter))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            for hit in hits:
                yield {"source": hit['_source'], "id": hit['_id']}
    finally:
        # Release the server-side scroll context, also when the consumer stops
        # early or an error occurs. Cleanup is best effort so that it never
        # masks the exception that is already propagating.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                print("failed to clear scroll context: %s" % exc)
# Driver: walk every document of the index and report progress per document.
try:
    # NOTE: an export of each document's title/content to
    # ./elastic-dataset.jsonl (with optional text normalization) was sketched
    # here earlier and is currently disabled; the loop only iterates and prints.
    for mentry in es_iterate_all_documents(es, index_name_i):
        entry = mentry['source']  # document body; only needed by the disabled export
        id = mentry['id']
        # `total` is set by the iterator; guard anyway so that a zero total
        # can never raise ZeroDivisionError here.
        ratio = counter / total if total else 0.0
        print("%s -> %.2f " % (id, ratio))
except Exception as inst:
    print(type(inst))  # the exception type
    print(inst.args)  # arguments stored in .args
    print(inst)  # __str__ allows args to be printed directly,
    # but may be overridden in exception subclasses
    # If the very first search failed, `total` is still 0; dividing by it would
    # raise inside this handler and hide the original error.
    ratio = counter / total if total else 0.0
    print("%s -> %.2f " % (id, ratio))