# llama/elastic_keywords.py

from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
import hazm
from normalizer import cleaning
#from cleantext import clean
#import re
from keybert import KeyBERT
#def cleanhtml(raw_html):
# cleanr = re.compile('<.*?>')
# cleantext = re.sub(cleanr, '', raw_html)
# return cleantext
#normalizer = hazm.Normalizer()
#weird_pattern = re.compile("["
# u"\U0001F600-\U0001F64F" # emoticons
# u"\U0001F300-\U0001F5FF" # symbols & pictographs
# u"\U0001F680-\U0001F6FF" # transport & map symbols
# u"\U0001F1E0-\U0001F1FF" # flags (iOS)
# u"\U00002702-\U000027B0"
# u"\U000024C2-\U0001F251"
# u"\U0001f926-\U0001f937"
# u'\U00010000-\U0010ffff'
# u"\u200d"
# u"\u2640-\u2642"
# u"\u2600-\u2B55"
# u"\u23cf"
# u"\u23e9"
# u"\u231a"
# u"\u3030"
# u"\ufe0f"
# u"\u2069"
# u"\u2066"
# # u"\u200c"
# u"\u2068"
# u"\u2067"
# "]+", flags=re.UNICODE)
#def cleaning(text):
# text = text.strip()
# text = clean(text,
# extra_spaces = True,
# lowercase = True
# )
# # cleaning htmls
# text = cleanhtml(text)
#
# # normalizing
# text = normalizer.normalize(text)
# # removing weird patterns
# text = weird_pattern.sub(r'', text)
# # removing extra spaces, hashtags
# text = re.sub("#", "", text)
# text = re.sub("\s+", " ", text)
# return text
index_name_i = 'semantic_search-v10'
# Path to a locally fine-tuned HooshvareLab Persian BERT checkpoint, used as the
# embedding model behind KeyBERT
model_id = "../../MLM/MODELS/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt"
kw_model = KeyBERT(model=model_id)
es = Elasticsearch(
    "http://127.0.0.1:6900",
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm"),
)
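
# Optional sanity check (sketch, not in the original flow): confirm the cluster
# is reachable with the configured credentials before starting the scroll, e.g.:
#
#     print(es.info())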
# Normalize hazm's stop word list with the shared cleaning() helper
stop_words = hazm.stopwords_list()
stop_words = [cleaning(element) for element in stop_words]

# Progress counters shared (via global) with es_iterate_all_documents
counter = 0
total = 0
doc_id = ''
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Helper that iterates over ALL documents of a single index using the
    scroll API and yields them one by one.
    """
    global counter
    global total
    is_first = True
    while True:
        # Scroll next
        if is_first:
            # Initialize the scroll with the first page of results
            # result = es.search(index=index, scroll=scroll_timeout, **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(index=index, scroll=scroll_timeout, **kwargs, size=pagesize)
            total = result["hits"]["total"]["value"]
            print('total = %d' % total)
            is_first = False
        else:
            # Fetch the next page for the existing scroll context
            # result = es.scroll(body={
            #     "scroll_id": scroll_id,
            #     "scroll": scroll_timeout
            # })
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%, count: %d" % ((counter / total) * 100, counter))
        # Stop after no more docs
        if not hits:
            break
        # Yield each hit as a small dict holding its _source and _id
        yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)
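
# NOTE (sketch, not part of the original script): the scroll context above is
# simply left to expire after `scroll_timeout`. To release it as soon as
# iteration finishes, the helper would need to expose the scroll_id so the
# caller could do something like:
#
#     es.clear_scroll(scroll_id=scroll_id)
#
# This is only an illustration; es_iterate_all_documents does not return the
# scroll_id as written.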
try:
    # els_file = open('./elastic-dataset.jsonl', 'w', encoding='utf-8')
    for mentry in es_iterate_all_documents(es, index_name_i):
        entry = mentry['source']
        doc_id = mentry['id']
        # title = entry.get('title', '').replace('"', "'").replace('\n', ' ').replace('\r', '')
        text = entry.get('clean_content', '')
        # lkeys = entry.get('content_keywords', '')
        print("%s -> %.2f " % (doc_id, counter / total))
        try:
            # print(lkeys)
            # print(keywords)
            # print('*' * 20)
            keywords = ''
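            # NOTE (sketch, assumption): kw_model, text and the cleaned
            # stop_words list prepared above are never used, so keyword
            # extraction was presumably intended here, e.g. with KeyBERT:
            #
            #     keywords = kw_model.extract_keywords(
            #         text,
            #         keyphrase_ngram_range=(1, 2),
            #         stop_words=stop_words,
            #         top_n=10,
            #     )
            #
            # As written, the script stores an empty string in the
            # "content_keywords-llama3-str" field instead.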
            resp = es.update(index=index_name_i, id=doc_id, doc={"content_keywords-llama3-str": str(keywords)})
        except Exception as inst:
            print(type(inst))   # the exception type
            print(inst.args)    # arguments stored in .args
            print("Exception: " + str(inst))
            # print(inst)
            # print(doc_id)
except Exception as inst:
    print(type(inst))   # the exception type
    print(inst.args)    # arguments stored in .args
    print(inst)         # __str__ allows args to be printed directly,
                        # but may be overridden in exception subclasses
    print("%s -> %.2f " % (doc_id, counter / total))