from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
import hazm
from normalizer import cleaning
#from cleantext import clean
#import re
from keybert import KeyBERT

# Previous in-file cleaning helpers, kept for reference (cleaning is now
# imported from the normalizer module above).
#def cleanhtml(raw_html):
#    cleanr = re.compile('<.*?>')
#    cleantext = re.sub(cleanr, '', raw_html)
#    return cleantext

#normalizer = hazm.Normalizer()

#wierd_pattern = re.compile("["
#    u"\U0001F600-\U0001F64F"  # emoticons
#    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#    u"\U0001F680-\U0001F6FF"  # transport & map symbols
#    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#    u"\U00002702-\U000027B0"
#    u"\U000024C2-\U0001F251"
#    u"\U0001f926-\U0001f937"
#    u'\U00010000-\U0010ffff'
#    u"\u200d"
#    u"\u2640-\u2642"
#    u"\u2600-\u2B55"
#    u"\u23cf"
#    u"\u23e9"
#    u"\u231a"
#    u"\u3030"
#    u"\ufe0f"
#    u"\u2069"
#    u"\u2066"
#    # u"\u200c"
#    u"\u2068"
#    u"\u2067"
#    "]+", flags=re.UNICODE)

#def cleaning(text):
#    text = text.strip()
#    text = clean(text,
#                 extra_spaces=True,
#                 lowercase=True)
#    # cleaning htmls
#    text = cleanhtml(text)
#    # normalizing
#    text = normalizer.normalize(text)
#    # removing weird patterns
#    text = wierd_pattern.sub(r'', text)
#    # removing extra spaces, hashtags
#    text = re.sub("#", "", text)
#    text = re.sub("\s+", " ", text)
#    return text

# Target index and local KeyBERT model (a fine-tuned Persian BERT checkpoint).
index_name_i = 'semantic_search-v10'
model_id = "../../MLM/MODELS/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt"
kw_model = KeyBERT(model=model_id)

es = Elasticsearch(
    "http://127.0.0.1:6900",
    # ca_certs="/path/to/http_ca.crt",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

# Hazm stopword list, normalized with the same cleaning function used for the documents.
stop_words = hazm.stopwords_list()
stop_words = [cleaning(element) for element in stop_words]

counter = 0
total = 0
id = ''


def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="12m", **kwargs):
    """
    Helper to iterate over ALL documents of a single index.
    Yields each document as {"source": ..., "id": ...}.
    """
    global counter
    global total
    is_first = True
    while True:
        if is_first:
            # Initialize the scroll
            # result = es.search(index=index, scroll="12m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(index=index, scroll="12m", **kwargs, size=pagesize)
            total = result["hits"]["total"]['value']
            print('total = %d' % total)
            is_first = False
        else:
            # Fetch the next scroll page
            # result = es.scroll(body={
            #     "scroll_id": scroll_id,
            #     "scroll": scroll_timeout
            # })
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %% , counter: %d" % ((counter / total) * 100, counter))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit['_source'], "id": hit['_id']} for hit in hits)


try:
    #els_file = open('./elastic-dataset.jsonl', 'w', encoding='utf-8')
    for mentry in es_iterate_all_documents(es, index_name_i):
        entry = mentry['source']
        id = mentry['id']
        #title = entry.get('title', '').replace('"', "'").replace('\n', ' ').replace('\r', '')
        text = entry.get('clean_content', '')
        #lkeys = entry.get('content_keywords', '')
        print("%s -> %.2f " % (id, counter / total))
        try:
            #print(lkeys)
            #print(keywords)
            #print('*'*20)
            keywords = ''  # currently written back as an empty string
            resp = es.update(index=index_name_i, id=id,
                             doc={"content_keywords-llama3-str": str(keywords)})
        except Exception as inst:
            print(type(inst))   # the exception type
            print(inst.args)    # arguments stored in .args
            print("Exception: " + str(inst))
            # print(inst)
            # print(id)
except Exception as inst:
    print(type(inst))   # the exception type
    print(inst.args)    # arguments stored in .args
    print(inst)         # __str__ allows args to be printed directly,
                        # but may be overridden in exception subclasses
    print("%s -> %.2f " % (id, counter / total))