data model latest updates

This commit is contained in:
ajokar 2026-02-23 11:49:43 +00:00
parent dc2bc292b0
commit 700571aa6a
3 changed files with 309 additions and 134 deletions

View File

@ -1,3 +1,4 @@
from datetime import datetime
import sqlite3
import json
@ -80,11 +81,19 @@ def get_all_data():
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
# اجرای کوئری برای دریافت همه داده‌ها
cursor.execute('SELECT * FROM speeches')
result = cursor.fetchall()
# دریافت نام ستون‌ها
columns = [desc[0] for desc in cursor.description]
conn.close()
return result
# تبدیل داده‌ها به دیکشنری (کلید-مقدار)
data_as_dict = [dict(zip(columns, row)) for row in result]
return data_as_dict
# متد ایجاد جدول چت
def create_chat_table():
@ -201,30 +210,207 @@ def get_chats_by_user_id(user_id):
return chats
def insert_error(query, error_message):
conn = sqlite3.connect('./db/nahj.db') # اتصال به دیتابیس
cursor = conn.cursor()
try:
# درج داده در جدول `error`
cursor.execute('''
INSERT INTO error (query, error_message)
VALUES (?, ?)
''', (query, error_message))
conn.commit() # ذخیره تغییرات
print("Error inserted successfully.")
except Exception as e:
print(f"Failed to insert error: {e}")
finally:
conn.close() # بستن اتصال
# اضافه کردن یک رکورد جدید
def add_credit(remained_credit, date=None):
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
if date is None:
date = datetime.now().strftime('%Y-%m-%d')
try:
cursor.execute('''
INSERT INTO credit (remained_credit, date)
VALUES (?, ?)
''', (remained_credit, date))
conn.commit()
print("Record added successfully.")
except Exception as e:
print(f"Failed to add record: {e}")
finally:
conn.close()
# بازخوانی آخرین رکورد
def get_last_credit():
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
try:
cursor.execute('''
SELECT * FROM credit
ORDER BY id DESC
LIMIT 1
''')
last_record = cursor.fetchone()
conn.close()
if last_record:
return {"id": last_record[0], "remained_credit": last_record[1], "date": last_record[2]}
else:
return None
except Exception as e:
print(f"Failed to fetch last record: {e}")
conn.close()
return None
def insert_request(update_id, username=None, text=None, answer=None, message_id=None, user_id=None, is_bot=None, date_value=None, chat_id=None, req_type=None, first_name=None, last_name=None, is_active=None):
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
if date_value:
date_value = datetime.fromtimestamp(date_value).strftime('%Y-%m-%d %H:%M:%S')
try:
# درج رکورد جدید در جدول `requests`
cursor.execute('''
INSERT INTO requests (update_id, username, text, answer, message_id, user_id, is_bot, date, chat_id, type, first_name, last_name, is_active)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (update_id, username, text, answer, message_id, user_id, is_bot, date_value, chat_id, req_type, first_name, last_name, is_active))
conn.commit()
# print("Record inserted successfully.")
except Exception as e:
print(f"Failed to insert record: {e}")
finally:
conn.close()
def update_request(update_id, username=None, text=None, answer=None, message_id=None, user_id=None, is_bot=None,
date_value=None, chat_id=None, req_type=None, first_name=None, last_name=None, is_active=None):
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
try:
# لیست ستون‌هایی که به‌روزرسانی می‌شوند
update_fields = []
update_values = []
if username is not None:
update_fields.append("username = ?")
update_values.append(username)
if text is not None:
update_fields.append("text = ?")
update_values.append(text)
if answer is not None:
update_fields.append("answer = ?")
update_values.append(answer)
if message_id is not None:
update_fields.append("message_id = ?")
update_values.append(message_id)
if user_id is not None:
update_fields.append("user_id = ?")
update_values.append(user_id)
if is_bot is not None:
update_fields.append("is_bot = ?")
update_values.append(is_bot)
if date_value is not None:
update_fields.append("date = ?")
update_values.append(date_value)
if chat_id is not None:
update_fields.append("chat_id = ?")
update_values.append(chat_id)
if req_type is not None:
update_fields.append("type = ?")
update_values.append(req_type)
if first_name is not None:
update_fields.append("first_name = ?")
update_values.append(first_name)
if last_name is not None:
update_fields.append("last_name = ?")
update_values.append(last_name)
if is_active is not None:
update_fields.append("is_active = ?")
update_values.append(is_active)
# افزودن مقدار `update_id` برای شرط WHERE
update_values.append(update_id)
# اجرای کوئری آپدیت
cursor.execute(f'''
UPDATE requests
SET {', '.join(update_fields)}
WHERE update_id = ?
''', update_values)
conn.commit()
# print("Record updated successfully.")
except Exception as e:
print(f"Failed to update record: {e}")
finally:
conn.close()
def get_last_request():
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
try:
# بازخوانی آخرین رکورد بر اساس بیشترین مقدار update_id
cursor.execute('''
SELECT * FROM requests
ORDER BY update_id DESC
LIMIT 1
''')
last_record = cursor.fetchone()
conn.close()
if last_record:
return {
"update_id": last_record[0],
"username": last_record[1],
"text": last_record[2],
"answer": last_record[3],
"message_id": last_record[4],
"user_id": last_record[5],
"is_bot": last_record[6],
"date": last_record[7],
"chat_id": last_record[8],
"type": last_record[9],
"first_name": last_record[10],
"last_name": last_record[11],
"is_active": last_record[12]
}
else:
return None # اگر جدولی خالی باشد
except Exception as e:
print(f"Failed to fetch last record: {e}")
conn.close()
return None
def create_tables():
create_speechs_table()
create_chat_table()
# مثال استفاده
if __name__ == "__main__":
pass
# ایجاد جدول (فقط بار اول اجرا می‌شود)
#create_tables()
# درج داده‌ها
with open('./data-faiss/faiss_index_nahj_metadata.json', 'r', encoding='utf-8') as file:
data = json.load(file)
# with open('./data-faiss/faiss_index_nahj_metadata.json', 'r', encoding='utf-8') as file:
# data = json.load(file)
insert_data(data)
# خواندن داده بر اساس id
record = get_data_by_id("wisdom774")
print("Record by ID:", record)
# خواندن داده بر اساس part_id
records_by_part = get_data_by_part_id("sp13")
print("Records by Part ID:", records_by_part)
# خواندن داده بر اساس context_id
records_by_context = get_data_by_context_id("sn1")
print("Records by Context ID:", records_by_context)
# insert_data(data)

View File

@ -6,6 +6,7 @@ import logging
# import uvicorn
import time
import nahj_engine as nahj_chat
import data_model as dm
# ===========================
# پیکربندی اولیه
# ===========================
@ -62,33 +63,30 @@ class BaleBot:
return None
async def get_latest_req_id(self):
latest_req_id = 0
# should write in DATABASE
with open('./bale_bot/requests.json', 'r', encoding='utf-8') as file:
prev_reqs = json.load(file)
if prev_reqs:
latest_req_id = prev_reqs[-1]['update_id']
latest_request = dm.get_last_request()
latest_req_id = latest_request['update_id']
if not latest_req_id:
latest_req_id = 0
return latest_req_id + 1
async def save_entery(self, update):
all_reqs = []
prev_reqs = []
async def save_entery(self, update_item):
is_active = True
answer = ''
message = update_item['message']
fromm = message['from']
chat = message['chat']
try:
# should read from DATABASE
with open('./bale_bot/requests.json', 'r', encoding='utf-8') as file:
prev_reqs = json.load(file)
all_reqs.extend(prev_reqs)
dm.insert_request(update_item['update_id'],fromm['username'],message['text'], answer, message['message_id'],fromm['id'],fromm['is_bot'],message['date'],chat['id'],chat['type'],fromm['first_name'],fromm['last_name'], is_active)
except:
prev_reqs = []
all_reqs.extend(update)
# should write in DATABASE
with open('./bale_bot/requests.json', 'w', encoding='utf-8') as file:
data = json.dumps(all_reqs, ensure_ascii=False, indent=2)
file.write(data)
return True
return update_item['update_id']
return update_item['update_id']
async def update_request(self, update_id, answer):
dm.update_request(update_id= update_id, answer= answer)
async def split_text_into_chunks(self, text, max_length=4000):
"""
@ -270,10 +268,10 @@ class BaleBot:
reply_chuncs = await self.split_text_into_chunks(reply)
for i, paragraph in enumerate(reply_chuncs):
resultt = await self.send_message(chat_id, paragraph, keyboard)
# print(f"sent to bale server result {i+1}: {resultt}")
await self.send_message(chat_id, paragraph, keyboard)
await self.save_chat_data(text, reply, first_name, username)
# await self.save_chat_data(text, reply, first_name, username)
return reply
async def main():
@ -285,10 +283,12 @@ async def main():
# print(f"reading bale-bot server. len update: {len(update)}")
if update:
print(f'{len(update)} requests recognized!')
await bale_bot.save_entery(update)
for i, item in enumerate(update, 1):
print(f'handle input {i}/{len(update)}')
await bale_bot.handle_update(item)
update_id = await bale_bot.save_entery(item)
answer = await bale_bot.handle_update(item)
if answer:
await bale_bot.update_request(update_id, answer)
time.sleep(1)
# ===========================

View File

@ -28,7 +28,8 @@ today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime
# -------------------
# مدل‌ها و مسیر داده
# -------------------
MODEL_PATH = "/home/sabr/MODELS/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2" # مسیر مدل FastText فارسی
# MODEL_PATH = "/home/sabr/MODELS/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2"
MODEL_PATH = "./models/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2"
RERANKER_MODEL = "BAAI/bge-reranker-v2-m3"
# RERANKER_MODEL = "/home/gpu/HFHOME/hub/models/BAAI/bge-reranker-v2-m3"
FAISS_INDEX_PATH = "./data-faiss/faiss_index_nahj.index"
@ -40,26 +41,43 @@ def get_key():
key = 'aa-Fu5oeQv8jx8NCWV39WenJ7Yy1mbcFJ4P20CLQURkql2Eleta' # nahj_rag api key
return key
def read_records():
metadata = dm.get_all_data()
ids, part_ids, context_ids, large_titles, normalized_sentences, titles, urls, arabic_texts, Interpretation_links, types = [], [], [], [], [], [], [], [], [], []
for item in metadata:
ids.append(item['id'])
context_ids.append(item['context_id'])
part_ids.append(item['part_id'])
titles.append(item['title'])
large_titles.append(item['large_title'])
normalized_sentences.append(item['normalized_sentence'])
urls.append(item['url'])
types.append(item['types'])
arabic_texts.append(item['arabic_text'])
Interpretation_links.append(item['interpretation_links'])
# ids.append(item["id"])
# part_ids.append(item["part_id"])
# context_ids.append(item["context_id"])
# large_titles.append(item["large_title"])
# normalized_sentences.append(item["normalized_text"])
# titles.append(item["title"])
# urls.append(item["url"])
# arabic_texts.append(item["arabic_text"])
# Interpretation_links.append(item["Interpretation_link"])
# types.append(item["type"])
return ids, part_ids,context_ids,large_titles,normalized_sentences, titles, urls, arabic_texts, Interpretation_links, types
def load_faiss_index(index_path: str, metadata_path: str):
index = faiss.read_index(index_path)
# with open(metadata_path, "r", encoding="utf-8") as file:
# metadata = json.load(file)
metadata = dm.get_all_data()
ids, part_ids, context_ids, large_titles, normalized_sentences, titles, urls, arabic_texts, Interpretation_links, types = [], [], [], [], [], [], [], [], [], [], []
for item in metadata:
ids.append(item["id"])
part_ids.append(item["part_id"])
context_ids.append(item["context_id"])
large_titles.append(item["large_title"])
normalized_sentences.append(item["normalized_text"])
titles.append(item["title"])
urls.append(item["url"])
arabic_texts.append(item["arabic_text"])
Interpretation_links.append(item["Interpretation_link"])
types.append(item["type"])
ids, part_ids,context_ids,large_titles,normalized_sentences, titles, urls, arabic_texts, Interpretation_links, types = read_records()
return ids, part_ids,context_ids,large_titles,normalized_sentences, titles, urls, arabic_texts, Interpretation_links, types, index
def get_client():
url = "https://api.avalai.ir/v1"
@ -72,7 +90,7 @@ def get_client():
def llm_free_request(llm_messages, model="gemini-2.5-flash-lite"):
print(f'llm_free_request func ...')
print(f'using model: {model}')
client = get_client()
try:
response = client.chat.completions.create(
messages=llm_messages,
@ -81,10 +99,10 @@ def llm_free_request(llm_messages, model="gemini-2.5-flash-lite"):
answer = response.choices[0].message.content
except Exception as error:
with open('./nahj-answer/error-in-llm-free-req.txt', mode='a+', encoding='utf-8') as file:
error_message = f'\n\nquery: {query.strip()}\nerror:{error} \n-------------------------------\n'
file.write(error_message)
return 'متاسفانه خطایی رخ داده است.'
dm.insert_error(query= query, error_message= error)
answer = 'متاسفانه خطایی رخ داده است.'
return answer
def llm_request(query, model="gemini-2.5-flash-lite"):
@ -92,7 +110,7 @@ def llm_request(query, model="gemini-2.5-flash-lite"):
if query == '':
return 'لطفا متن سوال را وارد نمائید'
determine_refrence = """شناسه هر سخنرانی در ابتدای آن و با فرمت "id: {idvalue}" آمده است..."""
# determine_refrence = """شناسه هر سخنرانی در ابتدای آن و با فرمت "id: {idvalue}" آمده است..."""
try:
messages.append({"role": "user", "content": query})
response = client.chat.completions.create(
@ -103,10 +121,10 @@ def llm_request(query, model="gemini-2.5-flash-lite"):
messages.append({"role": "assistant", "content": answer})
except Exception as error:
# should write in DATABASE
with open('./nahj-answer/error-in-llm.txt', mode='a+', encoding='utf-8') as file:
error_message = f'\n\nquery: {query.strip()}\nerror:{error} \n-------------------------------\n'
file.write(error_message)
return 'با عرض پوزش؛ متاسفانه خطایی رخ داده است.'
dm.insert_error(query= query, error_message= error)
answer = 'با عرض پوزش؛ متاسفانه خطایی رخ داده است.'
return answer
class QueryAnalysisPipeline:
@ -193,36 +211,43 @@ class QueryAnalysisPipeline:
# -----------------------------
class HybridRetrieverReranker:
__slots__ = (
"device", "content_list", "ids","context_ids","large_titles","normalized_sentences","titles","urls","arabic_texts","Interpretation_links","types", "N", "embedder", "faiss_index",
"vectorizer", "tfidf_matrix", "tokenizer", "reranker", "dense_alpha"
"device", "ids","part_ids","context_ids","large_titles","sentences","titles","urls","arabic_texts","Interpretation_links","types", "N", "embedder", "faiss_index", "vectorizer", "tfidf_matrix", "tokenizer", "reranker", "dense_alpha"
)
# ids, dates, titles, content_list, urls,
def __init__(self, ids: List[str], dates: List[str], titles: List[str], content_list: List[str],urls: List[str] , faiss_index,
# ids, part_ids,context_ids,large_titles,sentences , titles, urls, arabic_texts, Interpretation_links, types,
def __init__(self, ids: List[str], part_ids: List[str], context_ids: List[str], large_titles: List[str], sentences: List[str], titles: List[str], urls: List[str], arabic_texts: List[str], Interpretation_links: List[str], types: List[str], faiss_index,
dense_alpha: float = 0.6, device: str = None):
if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
self.device = device
self.content_list = content_list
self.ids = ids
self.dates = dates
self.part_ids = part_ids
self.context_ids = context_ids
self.large_titles = large_titles
self.sentences = sentences
self.titles = titles
self.urls = urls
self.arabic_texts = arabic_texts
self.Interpretation_links = Interpretation_links
self.types = types
self.faiss_index = faiss_index
self.N = len(content_list)
self.N = len(sentences)
# --- Dense Embedder ---
print("Loading SentenceTransformer model ...")
self.embedder = SentenceTransformer(MODEL_PATH, device=self.device)
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device= self.device)
# embedder.save(MODEL_PATH)
# --- Sparse (TF-IDF) ---
self.vectorizer = TfidfVectorizer(
analyzer="word",
ngram_range=(1, 2),
token_pattern=r"(?u)\b[\w\u0600-\u06FF]{2,}\b",
)
self.tfidf_matrix = self.vectorizer.fit_transform(self.content_list)
self.tfidf_matrix = self.vectorizer.fit_transform(self.sentences)
# --- Reranker ---
# self.tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL, use_fast=True, local_files_only= True)
@ -334,8 +359,8 @@ class HybridRetrieverReranker:
)[:final_k]
return reranked
def get_passages(self, cand_idx, content_list):
passages = [content_list[idx] for idx in cand_idx]
def get_passages(self, cand_idx, sentences_list):
passages = [sentences_list[idx] for idx in cand_idx]
return passages
# --- Search ---
@ -351,29 +376,22 @@ class HybridRetrieverReranker:
# print(item)
pre_rerank_k = final_k
cand_idx = self.fuse(d_idx, d_scores, s_idx, s_scores, pre_rerank_k)
# for item in d_scores:
# print(item)
# print('&&&&&&&&&&&&&&&&&&&&&&&&&')
passages = self.get_passages(cand_idx, sentence_list)
# for itemm in s_scores:
# print(itemm)
# print('&&&&&&&&&&&&&&&&&&&&&&&&&')
# print('&&&&&&&&&&&&&&&&&&&&&&&&&')
# passages = self.get_passages(cand_idx, sentence_list)
# reranked = self.rerank(query, cand_idx, passages, final_k)
# return [
# {"idx": i, "content": self.sentence_list[i], "rerank_score": score}
# for i, score in reranked
# ]
with open("./data/passages.json", 'w',encoding='utf-8') as file:
json.dump(passages, file, ensure_ascii=False, indent=4)
# return [
# {"idx": i, "content": self.sentence_list[i], "rerank_score": score}
# for i, score in reranked
# ]
return [
{"idx": i, "content": self.sentence_list[i]}
{"idx": i, "content": self.sentences[i]}
for i in cand_idx
]
@ -381,7 +399,7 @@ def single_query(query: str):
query = cleaning(query)
retrived_sections = pipe.search(query, sentences, topk_dense=100, topk_sparse=100, pre_rerank_k=100, final_k=17)
retrived_sections = pipe.search(query, sentences, topk_dense=100, topk_sparse=100, pre_rerank_k=100, final_k=10)
retrived_sections_list = []
final_similars_text = ''
@ -399,7 +417,7 @@ def single_query(query: str):
final_similars_text += ''.join(result)
id_list += f"{ids[row['idx']]}\n"
retrived_sections_list.append(row)
return idlist, final_similars_text, retrived_sections_list
return id_list, final_similars_text, retrived_sections_list
def find_refrences(llm_answer: str) -> List[str]:
"""
@ -451,7 +469,7 @@ client = get_client()
models = [ "gemini-2.5-flash-lite", "gpt-4o-mini","deepseek-reasoner"]
def save_result(chat_obj: object) -> bool:
# index result in elastic
# index result in DATABASE
pass
def run_chatbot(query:str, chat_id:str):
@ -532,7 +550,7 @@ def run_chatbot(query:str, chat_id:str):
'status' : True, # or False # bool
}
save_result(chat_obj)
# save_result(chat_obj)
status_text ='پاسخ با موفقیت ایجاد شد'
return chat_obj, status_text
@ -547,11 +565,11 @@ def credit_refresh():
"Content-Type": "application/json",
"Authorization": f"Bearer {get_key()}"
}
remained_credit = requests.get(url, headers=headers)
remained_credit = str((requests.get(url, headers=headers)).json()['remaining_irt'])
with open('./llm-answer/credit.txt','w') as file:
file.write(str(remained_credit.json()['remaining_irt']))
return str(remained_credit.json()['remaining_irt'])
dm.add_credit(remained_credit= remained_credit)
return remained_credit
# تعریف مدل داده‌ها برای درخواست‌های API
# class Query(BaseModel):
# query: str
@ -559,7 +577,7 @@ def credit_refresh():
date = str((datetime.datetime.now())).replace(' ','-').replace(':','').replace('.','-')
chat_id = f'{date}-{random.randint(100000, 999999)}'
print('#'*29)
print(' - NAHJ ENGINE IS READY! - ')
print(' - NAHJ ENGINE IS READY! - ')
print('#'*29)
# مسیر API برای اجرا کردن run_chatbot
# @chatbot.post("/run_chatbot")
@ -600,16 +618,15 @@ def get_passages_by_paragraphs(retrived_sections_list):
"""
بازسازی متن های مشابه بر اساس پاراگراف آنها
"""
with open('./data-faiss/faiss_index_nahj_metadata.json', 'r', encoding='utf-8') as file:
data = json.load(file)
data = dm.get_all_data()
final_passages = ''
for item in retrived_sections_list:
title = item['large_title']
filtered_data = {}
for row in data:
if row['part_id'] == item['part_id']:
filtered_data[row['id']] = row
title = row['large_title']
# مرتب سازی بر اساس ترتیب جمله در پاراگراف
sorted_data = dict(sorted(filtered_data.items()))
@ -621,7 +638,7 @@ def get_passages_by_paragraphs(retrived_sections_list):
final_passages += ''.join(f'{paragraph.strip()}\n\n')
return final_passages
pass
def bale_search(query):
start = datetime.datetime.now()
@ -634,20 +651,6 @@ def bale_search(query):
# پاسخ حداکثر 300 کلمه باشد.
refrences = ''
# recognized_refrences = find_refrences(llm_answer)
# llm_answer = replace_refrences(llm_answer, recognized_refrences)
# with open('./nahj-answer/result-khamenei.txt', mode='a+', encoding='utf-8') as file:
# result_message = f'متن پرامپت: {query.strip()}\n\nپاسخ: {llm_answer} \n----------------------------------------------------------\n'
# file.write(result_message)
# with open('./nahj-answer/passages-khamenei.txt', mode='a+', encoding='utf-8') as file:
# result_message = f'متن پرامپت: {query.strip()}\nمواد مشابه: {result_passages} \n----------------------------------------------------------\n'
# file.write(result_message)
# with open('./nahj-answer/chat-leader.txt', mode='w', encoding='utf-8') as file:
# result_message = f'{query.strip()}\n\nپاسخ:\n {llm_answer} \n----------------------------------------------------------\n'
# file.write(result_message)
print('---------------------------------------------')
print(f'full duration: {(datetime.datetime.now() - start).total_seconds()}')
@ -672,20 +675,6 @@ def bale_chat(query):
print(f'llm duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')
refrences = ''
# recognized_refrences = find_refrences(llm_answer)
# llm_answer = replace_refrences(llm_answer, recognized_refrences)
# with open('./nahj-answer/result-khamenei.txt', mode='a+', encoding='utf-8') as file:
# result_message = f'متن پرامپت: {query.strip()}\n\nپاسخ: {llm_answer} \n----------------------------------------------------------\n'
# file.write(result_message)
# with open('./nahj-answer/passages-khamenei.txt', mode='a+', encoding='utf-8') as file:
# result_message = f'متن پرامپت: {query.strip()}\nمواد مشابه: {result_passages} \n----------------------------------------------------------\n'
# file.write(result_message)
# with open('./nahj-answer/chat-leader.txt', mode='w', encoding='utf-8') as file:
# result_message = f'{query.strip()}\n\nپاسخ:\n {llm_answer} \n----------------------------------------------------------\n'
# file.write(result_message)
print('---------------------------------------------')
print(f'full duration: {(datetime.datetime.now() - start).total_seconds()}')