# File-viewer listing metadata: 88 lines, 2.3 KiB, Python
from dotenv import load_dotenv
|
|
import os
|
|
from llm_helper import AsyncCore
|
|
from es_helper import ElasticHelper
|
|
from base_model import MnMeet
|
|
import time, traceback, uuid, orjson, re
|
|
from datetime import datetime, timezone
|
|
from elasticsearch.helpers import scan
|
|
from typing import Union
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import List
|
|
|
|
# Load variables from a local .env file (python-dotenv) into the process
# environment before they are read below.
load_dotenv()

# Connection settings taken from the environment. os.getenv returns None for
# any variable that is not set, so these may be None at runtime.
ES_URL = os.getenv("ES_URL")
ES_USER_NAME = os.getenv("ES_USER_NAME")
ES_PASSWORD = os.getenv("ES_PASSWORD")
LLM_URL = os.getenv("LLM_URL")
|
|
|
|
def save_orjson(path: str | Path, data) -> None:
    """Serialize *data* with orjson and write it to *path* as indented JSON.

    The payload is produced with ``orjson.OPT_INDENT_2`` and
    ``orjson.OPT_NON_STR_KEYS``.

    Args:
        path: Destination file. It is opened in binary write mode, so any
            existing content is replaced.
        data: Object to serialize.

    Raises:
        TypeError: If orjson cannot encode *data* (orjson's
            ``JSONEncodeError`` is a ``TypeError`` subclass).
        OSError: If the file cannot be opened or written.
    """
    # Encode first: opening with "wb" truncates the target immediately, so an
    # encoding failure after the open would leave an empty file behind.
    payload = orjson.dumps(
        data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS
    )
    with open(path, "wb") as f:
        f.write(payload)
|
|
|
|
def load_orjson(path: str | Path):
    """Read the file at *path* and return its content parsed by orjson.

    Args:
        path: Location of the JSON file, as a string or ``Path``.

    Returns:
        Whatever ``orjson.loads`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    path = Path(path)
    # Opened in binary mode: the raw bytes are handed straight to orjson.
    with path.open("rb") as f:
        return orjson.loads(f.read())
|
|
|
|
# --------------------------- flow
|
|
# Index names referenced by this script.
term_index_name = "mn_term"
meet_index_name = "mn_meet"
ment_index_name = "mn_meet_entity"
sections_index_name = ""  # intentionally empty in this script

# 25-character dash rule; presumably a visual separator for console output.
dash = "-" * 25
|
|
|
|
# Project helper wrapping the Elasticsearch connection, configured with the
# URL and credentials read from the environment above.
es_helper = ElasticHelper(
    es_url=ES_URL,
    es_user=ES_USER_NAME,
    es_pass=ES_PASSWORD,
)
|
|
|
|
############ DELETE INDEXES
|
|
# es_helper.deleteIndex(index_name=term_index_name)
|
|
# es_helper.deleteIndex(index_name=meet_index_name)
|
|
# es_helper.deleteIndex(index_name=ment_index_name)
|
|
|
|
############ CREATE INDEXES
|
|
# es_helper.createIndexIfNotExist(index_name_o=term_index_name)
|
|
# es_helper.createIndexIfNotExist(index_name_o=meet_index_name)
|
|
# es_helper.createIndexIfNotExist(index_name_o=ment_index_name)
|
|
|
|
|
|
# Client object exposed by the helper; it is what gets passed to `scan` below.
es = es_helper.es
|
|
# fields = list(MnMeet.model_fields.keys())
|
|
# Document fields requested through `_source` in the scan query below.
fields = [
    "id",
    "sanad_id",
    "main_type",
    "title",
    "author",
    "content",
]
|
|
# old_data = es_helper.search(
|
|
# index=old_index_name, _source=fields, query={"match_all": {}}, size=3
|
|
# )
|
|
# old_data = old_data["hits"]["hits"] # don't use in scan
|
|
################### for all data
|
|
# Pull every hit for the query from the meetings index and materialize the
# results as a list, because both the count and the full set are used below.
# NOTE(review): a `term` query matches the stored value exactly; this assumes
# `main_type` is indexed as an exact-value (keyword) field - confirm against
# the index mapping.
old_data = list(
    scan(
        es,
        index=meet_index_name,
        query={
            "_source": fields,
            "query": {"term": {"main_type": "جلسه علمی"}},
            # Alternative query that selects all documents:
            # {"match_all": {}},
        },
    )
)
|
|
# Report how many hits were collected, then dump them to a local JSON file.
print(f'--- old_data {len(old_data)}')
save_orjson(
    data=old_data,
    path='./data_content_1.json',
)