first step

init_mahdi 2025-11-27 20:31:12 +00:00
commit 1472bf0e9f
11 changed files with 2286 additions and 0 deletions

4
monir/.env Normal file

@@ -0,0 +1,4 @@
ES_URL = 'http://192.168.23.60:9200'
ES_USER_NAME = 'elastic'
ES_PASSWORD = '1234'
LLM_URL = 'http://2.188.15.102:8001/v1/'
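
These four settings are consumed at startup via python-dotenv; monir/main.py below reads them exactly this way (a minimal sketch, assuming the .env file sits in the working directory):

from dotenv import load_dotenv
import os

load_dotenv()  # picks up monir/.env from the current working directory
ES_URL = os.getenv("ES_URL")              # Elasticsearch endpoint
ES_USER_NAME = os.getenv("ES_USER_NAME")  # Elasticsearch credentials
ES_PASSWORD = os.getenv("ES_PASSWORD")
LLM_URL = os.getenv("LLM_URL")            # OpenAI-compatible LLM endpoint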

0
monir/__init__.py Normal file

3 binary files not shown.

339
monir/base_model.py Normal file

@@ -0,0 +1,339 @@
from pydantic import BaseModel, Field
from typing import Optional, List, Dict, Union, Any
# Required field:
#   str = Field(..., description=...)
# Optional field:
#   Optional[str] = Field("", description=...)
class Properties(BaseModel):
"""
Standard Form Of Foreign Key To Other Index
"""
    id: str = Field(..., description="unique identifier")
    title: Optional[str] = Field("", description="title")
    author: Optional[str] = Field("", description="creator")
    sub_type: Optional[str] = Field("", description="secondary type")
class TreeInfo(BaseModel):
title: str
parent_id: str = Field(..., description="")
child_order: int
level: int
full_path: Optional[str] = Field("", description="")
path_headings: str
class NlpParses(BaseModel):
main_type: str
nlp_type: str
begin: int
end: int
text: str
referes: str
dependency_infos: str # nested
class Embeddings(BaseModel):
    type: Optional[str] = Field("", description="")
    dims: Optional[int] = Field(1024, description="")
    index: Optional[bool] = Field(True, description="")
    similarity: Optional[str] = Field("", description="")
class FileLinks(BaseModel):
title: str
link: str
type: str
description: str
class UserLogs(BaseModel):
id: str
user_id: int
username: str
time_edit: int
property: str
class MnSection(BaseModel):
id: str
old_id: str
main_type: str
html: str
content: str
meet_info: Properties
term_info: Properties
tree_info: TreeInfo
content_len: int
word_len: int
tags: str
keywords: str
sort_date_timestamp: int
nlp_parses: NlpParses
embeddings: Embeddings
file_links: FileLinks
time_edit: int
user_edit: int
user_logs: UserLogs
class MnTerm(BaseModel):
id: str
author: str
sub_type: Optional[str] = ""
title: Optional[str] = ""
    begin_date: Optional[int] = None
    end_date: Optional[int] = None
    begin_year: Optional[int] = None
main_type: Optional[str] = ""
content: Optional[str] = ""
tags: Optional[str] = ""
keywords: Optional[str] = ""
class MnMeetEntity(BaseModel):
id: str
main_type: str
sub_type: str
title: str
content: str
permit_tags: str
search_state: str
user_create: str
time_create: int
time_edit: int
file_links: FileLinks
meet_info: Properties
term_info: Properties
class Subjects(BaseModel):
id: int
title: str
class ReportInfo(BaseModel):
films: int
sounds: int
photos: int
class MnMeet(BaseModel):
"""
    Monir Meet standard field format
"""
id: str
sanad_id: str
main_type: str
sub_type: Optional[str] = ""
person_code: Optional[str] = ""
research_code: str
meet_code: str
old_meet_id: int
title: str
meet_no: int
author: str
    term_info: Optional[Properties] = Field(None, description="foreign key to the term index")
subtitle: str
subjects: Optional[List[Subjects]] = []
allwords: str
tags: Optional[List[str]] = []
keywords: str
verb: str
    sanad_year: Optional[int] = None
    sanad_date: Optional[int] = None
amplify: str
ralation: str
city: str
place: str
address: str
audience: str
attendees: str
report_info: ReportInfo
    mindex: Optional[str] = Field("", description="mindex: table of contents")
    mintro: Optional[str] = Field("", description="mintro: summary")
content: str
completion: Optional[str] = Field("", description="type: completion")
sort_date_timestamp: int
permit_tags: str
resource_info: str
class MnSanadLink(BaseModel):
text: Optional[str] = ""
link: str
in_search: bool
title: str
class TreeProperties(BaseModel):
    parent_id: str
child_order: int
level: int
full_path: str
title: str
path_headings: str
class NlpParses(BaseModel):
main_type: str
nlp_type: str
begin: int
end: int
text: str
referes: str
dependency_infos: Dict
class MNSection(BaseModel):
    main_type: str
    id: str
    html: str
    content: str
    meet_info: Properties
    term_info: Properties
    tree_info: TreeProperties
    content_len: int
    word_len: int
    tags: str
    keywords: str
    sort_date_timestamp: int
    nlp_parses: NlpParses
    embeddings: Embeddings
    file_links: FileLinks
    time_edit: int
    user_edit: int
    user_logs: UserLogs
class MnSanadVersionInfo(BaseModel):
timestamp: int
number: int
title: str
class SoundLinks(BaseModel):
link: str
title: str
class MnSanad(BaseModel):
    id: Optional[str] = ""
    sort_date_timestamp: Optional[int] = None
    title: Optional[str] = ""
    subtitle: Optional[str] = ""
    research_code: Optional[int] = None
    content: Optional[str] = ""
    version_info: Optional[MnSanadVersionInfo] = None
    meet_lid: Optional[int] = None
    meet_id: Optional[int] = None
    meet_no: Optional[int] = None
    meet_code: Optional[int] = None
    allwords: Optional[str] = ""
    keywords: Optional[str] = ""
    person_code: Optional[str] = ""
    subject: Optional[List] = []
    city: Optional[str] = ""
    author: str
    begin_year: Optional[int] = None
    begin_date: Optional[int] = None
    end_date: Optional[int] = None
    branch: Optional[str] = ""
    ralation: Optional[str] = ""
    research_id: Optional[int] = None
    mintro: Optional[str] = ""
    mindex: Optional[str] = ""
    RowNum: Optional[int] = None
    resource_info: Optional[str] = ""
    in_tadvin: Optional[bool] = None
    format: Optional[str] = ""
    verb: Optional[str] = ""
    address: Optional[str] = ""
    attendees: Optional[str] = ""
    amplify: Optional[str] = ""
    audience: Optional[str] = ""
    place: Optional[str] = ""
    permit_tags: Optional[str] = ""
    photos: Optional[int] = None
    tags: Optional[List[str]] = []
    films: Optional[int] = None
    sounds: Optional[int] = None
    file_links: Optional[List[MnSanadLink]] = []
    sound_links: Optional[List[SoundLinks]] = []
    video_links: Optional[List[SoundLinks]] = []
    photo_links: Optional[List[SoundLinks]] = []
    # model_config = {"exclude_": True}  # original config key is garbled; intent unclear
class MnSanad2Meet(BaseModel):
id: str
    sort_date_timestamp: Optional[int] = None
title: str
subtitle: str
research_code: str
format: str
content: str
# version_info: Optional[List|dict] = ""
meet_lid: Optional[str] = ""
meet_id: Optional[int] = ""
meet_no: Optional[int] = ""
meet_code: Optional[str] = ""
allwords: Optional[str] = ""
keywords: Optional[str] = ""
person_code: Optional[str] = ""
subject: Optional[List] = []
city: Optional[str] = ""
author: str
begin_year: Optional[int] = ""
begin_date: Optional[str] = ""
end_date: Optional[str] = ""
branch: str
ralation: Optional[str] = ""
research_id: int
mintro: Optional[str] = ""
mindex: Optional[str] = ""
# RowNum: Optional[int]= ""
resource_info: Optional[str] = ""
# in_tadvin: Optional[bool] = ""
verb: Optional[str] = ""
address: Optional[str] = ""
attendees: Optional[str] = ""
amplify: Optional[str] = ""
audience: Optional[str] = ""
place: Optional[str] = ""
permit_tags: Optional[str] = ""
photos: Optional[int] = 0
tags: Optional[List[str]] = []
films: Optional[int] = 0
sounds: Optional[int] = 0
    file_links: Optional[List[MnSanadLink]] = []
    sound_links: Optional[List[SoundLinks]] = []
    video_links: Optional[List[SoundLinks]] = []
    photo_links: Optional[List[SoundLinks]] = []
# model_config = {
# "exclude_""": True
# }
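
A minimal validation sketch for these models, assuming Pydantic v2 (consistent with the model_fields / model_config usage elsewhere in this commit). The raw_hit values are hypothetical; its keys are the required (non-Optional) fields of MnSanad2Meet:

raw_hit = {  # hypothetical _source of an Elasticsearch hit
    "id": "sanad-001",
    "title": "...",
    "subtitle": "",
    "research_code": "RC-01",
    "format": "text",
    "content": "...",
    "author": "unknown",
    "branch": "",
    "research_id": 0,
}
doc = MnSanad2Meet.model_validate(raw_hit)  # optional fields fall back to their defaults
payload = doc.model_dump()                  # plain dict, ready to index back into Elasticsearch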

349
monir/doc_type.py Normal file

@@ -0,0 +1,349 @@
type_count = [
{
"key": "جلسه علمی",
"doc_count": 7332
},
{
"key": "منبر",
"doc_count": 3970
},
{
"key": "درس خارج",
"doc_count": 2450
},
{
"key": "تدریس",
"doc_count": 1401
},
{
"key": "سخنرانی",
"doc_count": 1221
},
{
"key": "-",
"doc_count": 992
},
{
"key": "مشاوره",
"doc_count": 858
},
{
"key": "مدیریت",
"doc_count": 652
},
{
"key": "مصاحبه",
"doc_count": 628
},
{
"key": "مباحثه و گفتگو",
"doc_count": 587
},
{
"key": "جزوه",
"doc_count": 583
},
{
"key": "مناظره و گفتگو",
"doc_count": 496
},
{
"key": "گزارش",
"doc_count": 395
},
{
"key": "--",
"doc_count": 250
},
{
"key": "جدول",
"doc_count": 208
},
{
"key": "کتاب",
"doc_count": 173
},
{
"key": "صورت جلسه",
"doc_count": 126
},
{
"key": "فهرست",
"doc_count": 124
},
{
"key": "ارائه",
"doc_count": 90
},
{
"key": "بازدید (دیدار)",
"doc_count": 83
},
{
"key": "مقاله",
"doc_count": 81
},
{
"key": "طرح",
"doc_count": 74
},
{
"key": "کلیپ",
"doc_count": 68
},
{
"key": "گزارش رصد",
"doc_count": 64
},
{
"key": "نامه",
"doc_count": 48
},
{
"key": "نمودار",
"doc_count": 45
},
{
"key": "آیین نامه",
"doc_count": 36
},
{
"key": "تقریر",
"doc_count": 35
},
{
"key": "خلاصه",
"doc_count": 25
},
{
"key": "پیش نویس",
"doc_count": 25
},
{
"key": "نقد",
"doc_count": 23
},
{
"key": "چکیده",
"doc_count": 16
},
{
"key": "یادبود",
"doc_count": 13
},
{
"key": "فرم",
"doc_count": 12
},
{
"key": "فرم فیش",
"doc_count": 11
},
{
"key": "مناجات",
"doc_count": 10
},
{
"key": "دعا و مناجات",
"doc_count": 8
},
{
"key": "فیش",
"doc_count": 6
},
{
"key": "پرسش و پاسخ",
"doc_count": 6
},
{
"key": "بیانیه",
"doc_count": 5
},
{
"key": "اجلاسیه",
"doc_count": 4
},
{
"key": "پایان نامه",
"doc_count": 4
},
{
"key": "",
"doc_count": 3
},
{
"key": "آئین نامه",
"doc_count": 3
},
{
"key": "رزومه",
"doc_count": 3
},
{
"key": "قرارداد",
"doc_count": 3
},
{
"key": "مصوبه",
"doc_count": 3
},
{
"key": "نمونه سوال",
"doc_count": 3
},
{
"key": "همایش",
"doc_count": 3
},
{
"key": "پژوهش",
"doc_count": 3
},
{
"key": "چارت",
"doc_count": 3
},
{
"key": "کنفرانس",
"doc_count": 3
},
{
"key": "تلخیص",
"doc_count": 2
},
{
"key": "قرائت زیارت",
"doc_count": 2
},
{
"key": "لیست",
"doc_count": 2
    },
    {
"key": "متن جلسه",
"doc_count": 2
},
{
"key": "مستند تلوزیونی",
"doc_count": 2
},
{
"key": "نشست علمی",
"doc_count": 2
},
{
"key": "کتاب داخلی",
"doc_count": 2
},
{
"key": "گزارش جلسه",
"doc_count": 2
},
{
"key": "برنامه",
"doc_count": 1
},
{
"key": "بروشور",
"doc_count": 1
},
{
"key": "بزرگداشت",
"doc_count": 1
},
{
"key": "جزو",
"doc_count": 1
},
{
"key": "خطبه عقد(دائم)",
"doc_count": 1
},
{
"key": "روضه",
"doc_count": 1
},
{
"key": "زندگی نامه",
"doc_count": 1
},
{
"key": "زیارتنامه",
"doc_count": 1
},
{
"key": "سائر",
"doc_count": 1
},
{
"key": "سالگرد",
"doc_count": 1
},
{
"key": "سایر",
"doc_count": 1
},
{
"key": "طرج",
"doc_count": 1
},
{
"key": "عقد",
"doc_count": 1
},
{
"key": "ماتریس",
"doc_count": 1
},
{
"key": "مدل",
"doc_count": 1
},
{
"key": "مراحل دستیابی و به کارگیری الگوی پیشرفت اسلامی ـ ",
"doc_count": 1
},
{
"key": "مقالات",
"doc_count": 1
},
{
"key": "مقاله و ارائه",
"doc_count": 1
},
{
"key": "نماه",
"doc_count": 1
},
{
"key": "نمایه",
"doc_count": 1
},
{
"key": "هرم",
"doc_count": 1
},
{
"key": "پرسشنامه",
"doc_count": 1
},
{
"key": "پروژه",
"doc_count": 1
},
{
"key": "پیش نشست",
"doc_count": 1
},
{
"key": "کاربرگ",
"doc_count": 1
},
{
"key": "کتاب سایت",
"doc_count": 1
},
{
"key": "کمیسیون خبرگان",
"doc_count": 1
}
]
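
type_count reads like a frozen terms aggregation over main_type; a small summary sketch, assuming nothing beyond the structure above:

total_docs = sum(t["doc_count"] for t in type_count)
top5 = sorted(type_count, key=lambda t: t["doc_count"], reverse=True)[:5]
print(f"{len(type_count)} distinct types, {total_docs} documents in total")
for t in top5:
    print(f'{t["key"]}: {t["doc_count"]}')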

1134
monir/es_helper.py Normal file

File diff suppressed because it is too large

368
monir/llm_helper.py Normal file

@@ -0,0 +1,368 @@
from typing import List, Optional
from pathlib import Path
import os, orjson, time, json, re, asyncio, traceback
from openai import AsyncOpenAI
# --------------------------------------------------------------------
# ------------------------------ API processing ------------------------------
class AsyncCore:
def __init__(
self,
model_name,
task_name,
data_path,
output_schema,
api_url,
reasoning_effort='low',
top_p=1,
temperature=0.0,
max_token=128000,
output_path=None,
ai_code_version=None,
        request_timeout=30,  # seconds
api_key="EMPTY",
save_number=2,
):
self.save_number = save_number
# json file of data
self.data_path = data_path
self.task_name = task_name
if output_path is None:
output_path = f"./{task_name}"
self.output_path = Path(output_path)
self._temp_path = self.output_path / "batch_data"
self._temp_processed_id_path = self._temp_path / "processed_id.json"
# Create output directory and subdirectories if they don't exist
self.output_path.mkdir(parents=True, exist_ok=True)
self._temp_path.mkdir(parents=True, exist_ok=True)
# self._temp_processed_id_path.mkdir(parents=True, exist_ok=True)
self.request_timeout = request_timeout
self.model_name = model_name
self.api_key = api_key
self.output_schema = output_schema
self.api_url = api_url
self.reasoning_effort = reasoning_effort
self.top_p = top_p
self.temperature = temperature
self.max_token = max_token
if ai_code_version is None:
ai_code_version = f"{model_name}_{reasoning_effort}"
self.ai_code_version = ai_code_version
self.PRIMARY_KEY = {"system_prompt", "user_prompt", "id"}
try:
self.data = self.__data_process()
print(f"📦 Loaded {len(self.data)} words")
except Exception as e:
raise ValueError(
f"Data loading/validation failed: {e}\n{traceback.format_exc()}"
)
def __validate_item(self, item, idx):
# Mandatory fields
for key in self.PRIMARY_KEY:
if key not in item:
raise ValueError(f"Missing mandatory key '{key}' in item #{idx}")
if not isinstance(item[key], str):
raise TypeError(
f"Item #{idx}: '{key}' must be a string, got {type(item[key]).__name__}"
)
# Optional field: assistant_prompt
if "assistant_prompt" not in item or item["assistant_prompt"] is None:
item["assistant_prompt"] = None
else:
if not isinstance(item["assistant_prompt"], str):
raise TypeError(
f"Item #{idx}: 'assistant_prompt' must be a string or absent, got {type(item['assistant_prompt']).__name__}"
)
return item # now normalized
def __data_process(self):
raw_data = self.__load_orjson(self.data_path)
if not isinstance(raw_data, list):
raise ValueError("Data must be a list of dictionaries.")
processed_data = []
for idx, item in enumerate(raw_data):
if not isinstance(item, dict):
raise ValueError(f"Item #{idx} is not a dictionary.")
validated_item = self.__validate_item(item, idx)
processed_data.append(validated_item)
return processed_data
def __get_max_number_file(self, directory):
        # Pattern to match filenames like output_1.json, output_25.json, etc.
pattern = re.compile(r"output_(\d+)\.json$")
max_num = 0
for filename in os.listdir(directory):
match = pattern.match(filename)
if match:
num = int(match.group(1))
if num > max_num:
max_num = num
return max_num + 1
def __load_orjson(self, path: str | Path):
path = Path(path)
        with path.open("rb") as f:  # orjson needs the file opened in binary mode
return orjson.loads(f.read())
def __save_orjson(self, path, data):
with open(path, "wb") as f:
f.write(
orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
)
def merge_json_dir(self, input_path, output_path):
directory = Path(input_path)
if not directory.is_dir():
raise ValueError(f"Not valid PATH: {input_path}")
        seen_ids = set()  # fast lookup of ids we have already seen
        unique_data = []  # unique items only
failed_files = []
json_files = list(directory.glob("*.json"))
if not json_files:
print("⚠️ NO JSON File Found In This PATH")
return
for json_file in json_files:
try:
data = self.__load_orjson(json_file)
                if not data:  # empty or None
failed_files.append(json_file.name)
continue
if isinstance(data, list) and isinstance(data[0], dict):
for item in data:
item_id = item.get("id")
if item_id is None:
                            # no id: we assume only items with a valid id matter, so skip it
continue
if item_id not in seen_ids:
seen_ids.add(item_id)
unique_data.append(item)
else:
raise ValueError(f"no list available in this json -> {json_file}")
except (
json.JSONDecodeError,
ValueError,
OSError,
KeyError,
TypeError,
) as e:
# print(f"❌ Failed in process '{json_file.name}': {e}")
failed_files.append(json_file.name)
# گزارش خطاها
if failed_files:
print("\n❌ We lose this file:")
for name in failed_files:
print(f" - {name}")
else:
print("\n✅ All JSON added")
        # save the merged output
try:
self.__save_orjson(data=unique_data, path=output_path)
print(
f"\n💾 Final file saved: {output_path} (Total unique items: {len(unique_data)})"
)
except Exception as e:
print(f"❌ Error in saving final file: {e}")
    def make_new_processed_ids_from_file(self, json_in, out_path):
        data = self.__load_orjson(json_in)
        final_ids = []
        for d in data:
            if d["id"]:
                final_ids.append(d["id"])
        final_ids = list(set(final_ids))
        print(f"-- len ids {len(final_ids)}")
        self.__save_orjson(data=final_ids, path=out_path)
# ------------------------------ Main ------------------------------
async def __process_item(self, client, item):
try:
messages = [
{"role": "system", "content": item["system_prompt"]},
{"role": "user", "content": item["user_prompt"]},
]
if item.get("assistant_prompt"):
messages.append(
{"role": "assistant", "content": item["assistant_prompt"]}
)
response = await client.chat.completions.parse(
model=self.model_name,
messages=messages,
temperature=self.temperature,
top_p=self.top_p,
reasoning_effort=self.reasoning_effort,
max_tokens=self.max_token,
stop=None,
response_format=self.output_schema,
)
parsed = (
response.choices[0].message.parsed
if response and response.choices and response.choices[0].message.parsed
else {"raw_text": str(response)}
)
parsed = self.output_schema.model_validate(parsed)
parsed = dict(parsed)
parsed["ai_code_version"] = self.ai_code_version
parsed["id"] = item["id"]
return parsed, 200
except asyncio.TimeoutError:
print(f"⏳ Timeout on item {item['id']}")
return None, 408
        except Exception as e:
            print(f"⚠️ Error in __process_item {item['id']}: {e}")
            traceback.print_exc()
            return None, 400
    def async_eval(self, processed_id: Optional[List] = None):
        try:
            asyncio.run(self.__async_eval(processed_id or []))
except KeyboardInterrupt:
print("\n🛑 Interrupted by user.")
traceback.print_exc()
async def __async_eval(self, processed_id: List):
"""
اجرای اصلی تکهستهای و async برای تولید خروجی نهایی.
"""
print("🔹 Starting async data processing...")
        # ------------------ Step 1: recover previously processed ids ------------------
if not processed_id:
try:
processed_id = self.__load_orjson(self._temp_processed_id_path)
print(
f"📂 Loaded existing processed_id from {self._temp_processed_id_path}"
)
except Exception:
print("⚠️ No valid processed_id found. Starting fresh.")
processed_id = []
        # ------------------ Step 2: prepare the data ------------------
all_processed_id = set(processed_id)
all_results = []
total_time = []
data = [item for item in self.data if item.get("id") not in all_processed_id]
print(
f" Total items: {len(self.data)} - {len(all_processed_id)} = {len(data)}"
)
        # nothing left to process
if not data:
print("✅ Nothing new to process. All items are already done.")
return
        # ------------------ Step 3: start processing ------------------
print(f"🤖 Model: {self.model_name} | Reasoning: {self.reasoning_effort}")
async with AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) as client:
semaphore = asyncio.Semaphore(5)
async def limited_process(item):
async with semaphore:
return await self.__process_item(client, item)
tasks = [asyncio.create_task(limited_process(item)) for item in data]
total_i = 0
            # ✅ process tasks in completion order (not list order)
for i, task in enumerate(asyncio.as_completed(tasks), start=1):
start = time.time()
try:
                    parsed, status_code = await asyncio.wait_for(
                        task, timeout=self.request_timeout
                    )  # ⏱ cap the wait at request_timeout
except asyncio.TimeoutError:
print(f"⏳ Task {i} timed out completely")
parsed, status_code = None, 408
total_time.append(time.time() - start)
if status_code == 200:
all_results.append(parsed)
all_processed_id.add(parsed.get("id"))
else:
print(f"⚠️ Skipped item {parsed.get('id')} (status={status_code})")
total_i += 1
                # ✅ checkpoint every save_number items
if total_i >= self.save_number:
print(f"total_i {total_i}")
print(f"self.save_number {self.save_number}")
total_i = 0
self.__save_orjson(
data=list(all_processed_id),
path=self._temp_processed_id_path,
)
print(f"💾 Auto-saved processed ids: {len(all_processed_id)}")
number = self.__get_max_number_file(self._temp_path)
print(f"number {number}")
temp_output_path = self._temp_path / f"output_{number}.json"
self.__save_orjson(data=list(all_results), path=temp_output_path)
print(f"💾 Auto-saved partial data: {len(all_results)}")
all_results.clear()
            # ✅ after all tasks finish, save any remaining data
if total_i > 0 or len(all_results) > 0:
print("💾 Final save of remaining data...")
self.__save_orjson(
data=list(all_processed_id),
path=self._temp_processed_id_path,
)
print(f"💾 Auto-saved processed ids: {len(all_processed_id)}")
number = self.__get_max_number_file(self._temp_path)
print(f"number {number}")
temp_output_path = self._temp_path / f"output_{number}.json"
self.__save_orjson(data=list(all_results), path=temp_output_path)
print(f"💾 Auto-saved partial data: {len(all_results)}")
all_results.clear()
        # ------------------ Step 4: save the output ------------------
final_data_path = self.output_path / f"final_data_{self.task_name}.json"
processed_id_path = self.output_path / "processed_id.json"
self.merge_json_dir(input_path=self._temp_path, output_path=final_data_path)
all_results = self.__load_orjson(final_data_path)
        # make_new_processed_ids_from_file()
self.__save_orjson(data=list(all_processed_id), path=processed_id_path)
self.__save_orjson(data=all_results, path=final_data_path)
avg_time = (sum(total_time) / len(total_time)) if total_time else 0
print(
f"\n✅ Processing completed!\n"
f"📊 Total-Data: {len(data)} | "
f"⭕ Ignored-Data: {len(processed_id)} | "
f"📦 Proccessed-Data: {len(all_results)} | "
f"❌ Loss-Data: {len(data)-len(all_results)} | "
f"🕒 Avg Time: {avg_time:.2f}'s per item | "
f"🕒 Total Time: {sum(total_time):.4f}'s | "
f"💾 Results saved to: {final_data_path}"
)
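
A minimal driver sketch for AsyncCore. The Summary schema, model name, and input file are hypothetical; the constructor arguments and the async_eval entry point come from the class above:

from pydantic import BaseModel
from llm_helper import AsyncCore

class Summary(BaseModel):  # hypothetical output schema
    summary: str

core = AsyncCore(
    model_name="my-model",                    # hypothetical model name
    task_name="summarize",
    data_path="./summary_input.json",         # list of {id, system_prompt, user_prompt} items
    output_schema=Summary,
    api_url="http://2.188.15.102:8001/v1/",   # LLM_URL from monir/.env
)
core.async_eval()  # resumes from batch_data/processed_id.json when present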

88
monir/main.py Normal file

@@ -0,0 +1,88 @@
from dotenv import load_dotenv
import os
from llm_helper import AsyncCore
from es_helper import ElasticHelper
from base_model import MnMeet
import time, traceback, uuid, orjson, re
from datetime import datetime, timezone
from elasticsearch.helpers import scan
from typing import List, Union
from pathlib import Path
from collections import defaultdict
load_dotenv()
ES_URL = os.getenv("ES_URL")
ES_USER_NAME = os.getenv("ES_USER_NAME")
ES_PASSWORD = os.getenv("ES_PASSWORD")
LLM_URL = os.getenv("LLM_URL")
def save_orjson(path, data):
with open(path, "wb") as f:
f.write(
orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
)
def load_orjson(path: str | Path):
path = Path(path)
    with path.open("rb") as f:  # orjson needs the file opened in binary mode
return orjson.loads(f.read())
# --------------------------- flow
term_index_name = "mn_term"
meet_index_name = "mn_meet"
ment_index_name = "mn_meet_entity"
sections_index_name = ""
dash = "-" * 25
es_helper = ElasticHelper(
es_url=ES_URL,
es_user=ES_USER_NAME,
es_pass=ES_PASSWORD,
)
############ DELETE INDEXES
# es_helper.deleteIndex(index_name=term_index_name)
# es_helper.deleteIndex(index_name=meet_index_name)
# es_helper.deleteIndex(index_name=ment_index_name)
############ CREATE INDEXES
# es_helper.createIndexIfNotExist(index_name_o=term_index_name)
# es_helper.createIndexIfNotExist(index_name_o=meet_index_name)
# es_helper.createIndexIfNotExist(index_name_o=ment_index_name)
es = es_helper.es
# fields = list(MnMeet.model_fields.keys())
fields = [
"id",
"sanad_id",
"main_type",
"title",
"author",
"content",
]
# old_data = es_helper.search(
# index=old_index_name, _source=fields, query={"match_all": {}}, size=3
# )
# old_data = old_data["hits"]["hits"] # don't use in scan
################### for all data
old_data = list(
    scan(
        es,
        index=meet_index_name,
        query={
            "_source": fields,
            "query": {"term": {"main_type": "جلسه علمی"}},
            # "query": {"match_all": {}},
        },
    )
)
print(f'--- old_data {len(old_data)}')
save_orjson(
data=old_data,
path='./data_content_1.json'
)
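
The saved hits keep their Elasticsearch envelopes (_id, _source); a hedged sketch of reshaping data_content_1.json into the {id, system_prompt, user_prompt} items that AsyncCore validates against PRIMARY_KEY (the prompt strings are hypothetical placeholders):

hits = load_orjson("./data_content_1.json")
items = [
    {
        "id": h["_id"],
        "system_prompt": "You are a careful summarizer.",  # hypothetical prompt
        "user_prompt": h["_source"].get("content", ""),
    }
    for h in hits
]
save_orjson(path="./summary_input.json", data=items)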

4
monir/requirements.txt Normal file

@@ -0,0 +1,4 @@
python-dotenv
openai
elasticsearch==8.13.0
orjson