DataCleaning/monir/main.py
2025-11-27 20:31:12 +00:00

88 lines
2.3 KiB
Python

from dotenv import load_dotenv
import os
from llm_helper import AsyncCore
from es_helper import ElasticHelper
from base_model import MnMeet
import time, traceback, uuid, orjson, re
from datetime import datetime, timezone
from elasticsearch.helpers import scan
from typing import Union
from pathlib import Path
from collections import defaultdict
from typing import List
load_dotenv()
# Connection settings taken from the process environment (load_dotenv() is
# called just before this point). Each value is None when the variable is unset.
ES_URL = os.environ.get("ES_URL")
ES_USER_NAME = os.environ.get("ES_USER_NAME")
ES_PASSWORD = os.environ.get("ES_PASSWORD")
LLM_URL = os.environ.get("LLM_URL")
def save_orjson(path, data):
    """Serialize *data* to JSON with orjson and write the bytes to *path*.

    The output is pretty-printed with two-space indentation
    (``OPT_INDENT_2``) and dict keys that are not strings are allowed
    (``OPT_NON_STR_KEYS``).

    Args:
        path: Destination handed to ``open(path, "wb")``.
        data: Object to serialize.

    Raises:
        orjson.JSONEncodeError: If *data* cannot be serialized; in that case
            the destination file is not opened or modified.
        OSError: If the destination cannot be opened or written.
    """
    # Serialize BEFORE opening: "wb" truncates the target immediately, so
    # encoding inside the ``with`` block would leave an empty file behind
    # (destroying any previous content) whenever serialization fails.
    payload = orjson.dumps(
        data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS
    )
    with open(path, "wb") as f:
        f.write(payload)
def load_orjson(path: str | Path):
    """Read the file at *path* and return its content parsed by orjson.

    Args:
        path: Location of the JSON file, given as a string or ``Path``.

    Returns:
        The object produced by ``orjson.loads`` for the file's bytes.
    """
    # The file is read as raw bytes (binary mode) and passed to orjson as-is.
    raw = Path(path).read_bytes()
    return orjson.loads(raw)
# --------------------------- flow
# Names of the Elasticsearch indexes this script refers to.
term_index_name = "mn_term"
meet_index_name = "mn_meet"
ment_index_name = "mn_meet_entity"
sections_index_name = ""  # left empty in this script

# 25-character dashed separator string.
dash = 25 * "-"

# Project helper wrapping the Elasticsearch connection, configured with the
# URL and credentials read from the environment.
es_helper = ElasticHelper(es_url=ES_URL, es_user=ES_USER_NAME, es_pass=ES_PASSWORD)
############ DELETE INDEXES
# Disabled maintenance step: uncomment to drop the indexes.
# es_helper.deleteIndex(index_name=term_index_name)
# es_helper.deleteIndex(index_name=meet_index_name)
# es_helper.deleteIndex(index_name=ment_index_name)
############ CREATE INDEXES
# Disabled maintenance step: uncomment to create the indexes when missing.
# es_helper.createIndexIfNotExist(index_name_o=term_index_name)
# es_helper.createIndexIfNotExist(index_name_o=meet_index_name)
# es_helper.createIndexIfNotExist(index_name_o=ment_index_name)

# Client object exposed by the helper; handed to ``scan`` further down.
es = es_helper.es

# Document fields requested via ``_source``.
# Alternative (all model fields): fields = list(MnMeet.model_fields.keys())
fields = [
    "id", "sanad_id", "main_type",
    "title", "author", "content",
]
# Disabled alternative: fetch a small sample through the search API.
# NOTE(review): ``old_index_name`` is not defined in this file; define it
# before re-enabling this snippet.
# old_data = es_helper.search(
#     index=old_index_name, _source=fields, query={"match_all": {}}, size=3
# )
# old_data = old_data["hits"]["hits"]  # only for search results, not for scan
################### for all data
# Request body for the scan: restrict the returned fields and keep only
# documents whose ``main_type`` equals the given term.
_meet_query = {
    "_source": fields,
    # To fetch every document instead, use: "query": {"match_all": {}}
    "query": {"term": {"main_type": "جلسه علمی"}},
}

# Materialize every matching hit in memory, report the count, then dump the
# whole list to a JSON file in the current working directory.
old_data = list(scan(es, index=meet_index_name, query=_meet_query))
print(f'--- old_data {len(old_data)}')
save_orjson(path='./data_content_1.json', data=old_data)