145 lines
4.3 KiB
Python
145 lines
4.3 KiB
Python
from html import escape
|
|
from lxml import etree
|
|
from datetime import datetime
|
|
from elasticsearch import Elasticsearch
|
|
import hazm
|
|
from normalizer import cleaning
|
|
#from cleantext import clean
|
|
#import re
|
|
from keybert import KeyBERT
|
|
|
|
|
|
|
|
#def cleanhtml(raw_html):
|
|
# cleanr = re.compile('<.*?>')
|
|
# cleantext = re.sub(cleanr, '', raw_html)
|
|
# return cleantext
|
|
|
|
#normalizer = hazm.Normalizer()
|
|
#wierd_pattern = re.compile("["
|
|
# u"\U0001F600-\U0001F64F" # emoticons
|
|
# u"\U0001F300-\U0001F5FF" # symbols & pictographs
|
|
# u"\U0001F680-\U0001F6FF" # transport & map symbols
|
|
# u"\U0001F1E0-\U0001F1FF" # flags (iOS)
|
|
# u"\U00002702-\U000027B0"
|
|
# u"\U000024C2-\U0001F251"
|
|
# u"\U0001f926-\U0001f937"
|
|
# u'\U00010000-\U0010ffff'
|
|
# u"\u200d"
|
|
# u"\u2640-\u2642"
|
|
# u"\u2600-\u2B55"
|
|
# u"\u23cf"
|
|
# u"\u23e9"
|
|
# u"\u231a"
|
|
# u"\u3030"
|
|
# u"\ufe0f"
|
|
# u"\u2069"
|
|
# u"\u2066"
|
|
# # u"\u200c"
|
|
# u"\u2068"
|
|
# u"\u2067"
|
|
# "]+", flags=re.UNICODE)
|
|
#def cleaning(text):
|
|
# text = text.strip()
|
|
# text = clean(text,
|
|
# extra_spaces = True,
|
|
# lowercase = True
|
|
# )
|
|
# # cleaning htmls
|
|
# text = cleanhtml(text)
|
|
#
|
|
# # normalizing
|
|
# text = normalizer.normalize(text)
|
|
# removing weird patterns
|
|
# text = wierd_pattern.sub(r'', text)
|
|
# removing extra spaces, hashtags
|
|
# text = re.sub("#", "", text)
|
|
# text = re.sub("\s+", " ", text)
|
|
# return text
|
|
|
|
|
|
import os  # local to this setup section; only used for the overrides below

# Index that is scanned and updated by this script.
index_name_i = 'semantic_search-v10'

# Path handed to KeyBERT as its embedding model.
model_id = "../../MLM/MODELS/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt"
kw_model = KeyBERT(model=model_id)

# Connection settings can be overridden through the environment so that
# credentials do not have to live in the source file. The fallbacks are the
# values this script has always used, so behaviour is unchanged when the
# variables are not set.
# NOTE(review): the fallback password is still committed here; consider
# rotating it and dropping the default once ES_PASSWORD is provisioned.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://127.0.0.1:6900"),
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "SG*7eGwg+KG2_*-1_mMm"),
    ),
)

# hazm's stop-word list, passed through the same `cleaning` routine that is
# imported from `normalizer`.
stop_words = hazm.stopwords_list()
stop_words = [cleaning(element) for element in stop_words]

# Progress state shared with es_iterate_all_documents through `global`.
counter = 0
total = 0
# Id of the document currently being processed (read by the main loop).
id = ''
|
|
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """Yield every document of a single index through the scroll API.

    Args:
        es: Client object exposing ``search``, ``scroll`` and ``clear_scroll``.
        index: Name of the index to read.
        pagesize: Number of hits requested per page.
        scroll_timeout: Keep-alive of the scroll context. It is applied to the
            initial search as well as to every follow-up scroll request.
        **kwargs: Extra keyword arguments forwarded to the initial
            ``es.search`` call.

    Yields:
        dict: ``{"source": hit["_source"], "id": hit["_id"]}`` for each hit.

    Side effects:
        Adds the number of fetched hits to the module-level ``counter``,
        stores the reported hit count in the module-level ``total`` and
        prints a progress line for every page.
    """
    global counter
    global total

    scroll_id = None
    try:
        # Open the scroll context with the caller-supplied keep-alive.
        result = es.search(index=index, scroll=scroll_timeout, size=pagesize, **kwargs)
        total = result["hits"]["total"]['value']
        print('total = %d' % total)

        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)

            # An empty index reports total == 0; avoid dividing by it.
            percent = (counter / total) * 100 if total else 0.0
            print("progress -> %.2f %% , counte: %d" % (percent, counter))

            # Stop after no more docs
            if not hits:
                break

            # Yield each entry
            yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)

            # Fetch the next page
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context instead of letting it linger
        # until the keep-alive expires. Cleanup is best effort: a failure here
        # must not hide an error raised while iterating.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                print("could not clear scroll context: %s" % exc)
|
|
|
|
# Id of the document currently being processed. It is bound before the loop
# so the final handler can report it even when the very first request fails.
doc_id = ''
try:
    for mentry in es_iterate_all_documents(es, index_name_i):
        entry = mentry['source']
        doc_id = mentry['id']

        # NOTE(review): `text` is read but not used, and `keywords` below is
        # always the empty string, so every document gets an empty
        # "content_keywords-llama3-str" field. Confirm this is intended.
        text = entry.get('clean_content', '')

        # `total` is guarded so a zero hit count cannot abort the run.
        print("%s -> %.2f " % (doc_id, counter / total if total else 0.0))
        try:
            keywords = ''
            es.update(
                index=index_name_i,
                id=doc_id,
                doc={"content_keywords-llama3-str": str(keywords)},
            )
        except Exception as inst:
            # Best effort per document: report the failure and keep going.
            print(type(inst))  # the exception type
            print(inst.args)  # arguments stored in .args
            print("Exception: " + str(inst))
except Exception as inst:
    print(type(inst))  # the exception type
    print(inst.args)  # arguments stored in .args
    print(inst)  # __str__ allows args to be printed directly,
    # but may be overridden in exception subclasses

    # `total` is still 0 when the initial search itself failed; dividing by
    # it here would raise inside the handler and mask the original error.
    print("%s -> %.2f " % (doc_id, counter / total if total else 0.0))
|
|
|