Compare commits: master...init-mahdi (1 commit)
Commit: d7ee4bdea0

.env (new executable file, 8 lines)
@@ -0,0 +1,8 @@
api_key="aa-fdh9d847ANcBxQCBTZD5hrrAdl0UrPEnJOScYmOncrkagYPf"
aval_ai_key = 'aa-4tvAEazUBovEN1i7i7tdl1PR93OaWXs6hMflR4oQbIIA4K7Z'
aval_ai_url = "https://api.avalai.ir/v1"
rerank_batch_size = "256"
es_url = "https://192.168.23.160:9200"
es_username = ""
es_index_name = ""
es_password = ""
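For reference, a minimal sketch of consuming these settings from Python. python-dotenv is an assumption here (the diff itself does not show how the file is read), and note that the handler code below reads a differently named RERANK_BATCH variable:

# Hypothetical loading sketch; python-dotenv is an assumption, not part of this repo.
import os
from dotenv import load_dotenv

load_dotenv(".env")  # reads key=value pairs into the process environment
rerank_batch = int(os.environ.get("rerank_batch_size", 256))
es_url = os.environ.get("es_url", "")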
@@ -1,2 +0,0 @@
# Qavanin Chatbot

@@ -1,155 +0,0 @@
print('importing bale modules ...')
import asyncio
import json
from fastapi import FastAPI, Request
from pydantic import BaseModel
import requests
import logging
import uvicorn
import chatbot_handler as ch

# ===========================
# Initial configuration
# ===========================
TOKEN = '2052165365:Tt7u2qXB9oRTPISeZ0wmoAGpPsIgjq-vAxM'
API_URL = f"https://tapi.bale.ai/bot{TOKEN}/"

# Logger setup
logging.basicConfig(
    filename="./baleqabot/bot.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# ===========================
# Input model classes
# ===========================
class Message(BaseModel):
    chat: dict
    text: str | None = None

class Update(BaseModel):
    message: Message | None = None

# ===========================
# Main Bale bot class
# ===========================
class BaleBot:
    def __init__(self, token: str):
        # self.api_url = f"https://api.bale.ai/bot{token}/"
        self.api_url = API_URL

    async def get_updates(self, offset=None):
        params = {"timeout": 20}
        if offset:
            params["offset"] = offset
        resp = requests.get(f"{self.api_url}getUpdates", params=params)
        return resp.json().get("result", [])

    async def send_message(self, chat_id: int, text: str, keyboard=None):
        payload = {
            "chat_id": chat_id,
            "text": text,
            "parse_mode": "HTML"
        }
        if keyboard:
            payload["reply_markup"] = keyboard
        try:
            response = requests.post(self.api_url + "sendMessage", json=payload)
            return response.json()
        except Exception as e:
            logging.error(f"Error sending message: {e}")
            return None

    async def get_latest_req_id(self):
        latest_req_id = 0
        with open('./baleqabot/requests.json', 'r', encoding='utf-8') as file:
            prev_reqs = json.load(file)

        if prev_reqs:
            latest_req_id = prev_reqs[-1]['update_id']

        return latest_req_id + 1

    async def save_entry(self, update):
        all_reqs = []
        with open('./baleqabot/requests.json', 'r', encoding='utf-8') as file:
            prev_reqs = json.load(file)
            all_reqs.extend(prev_reqs)
            all_reqs.extend(update)

        with open('./baleqabot/requests.json', 'w', encoding='utf-8') as file:
            data = json.dumps(all_reqs, ensure_ascii=False, indent=2)
            file.write(data)

        return True

    async def handle_update(self, update_reqs: dict):
        data = update_reqs[0]
        if "message" not in data:
            return
        message = data["message"]
        chat_id = message["chat"]["id"]
        text = message.get("text", "").strip()

        logging.info(f"Received message from {chat_id}: {text}")

        if text == "/start":
            reply = "سلام، من دستیار هوشمند قوانین هستم. لطفا پرسش خود را وارد نمائید ..."
            # keyboard = {
            #     "keyboard": [["/help", "/status"]],
            #     "resize_keyboard": True,
            #     "one_time_keyboard": True
            # }
            await self.send_message(chat_id, reply)

        elif text == "/chat":
            reply = "لطفا متن پرسش از قوانین را وارد نمائید ..."
            await self.send_message(chat_id, reply)

        # elif text == "/help":
        #     reply = (
        #         "دستورهای موجود:\n"
        #         "/start - شروع ربات\n"
        #         "/chat - گفتگو با دستیار هوشمند قانون\n"
        #         "/status - وضعیت ربات"
        #     )
        #     await self.send_message(chat_id, reply)

        elif text == "/status":
            reply = "ربات فعال است ✅"
            await self.send_message(chat_id, reply)

        else:
            # the module is imported as `ch`; the original called an undefined `chat` alias
            answer = await ch.run_chatbot(text, await ch.create_chat_id())
            if answer:
                reply = answer
            else:
                reply = 'خطا در تولید پاسخ!'

            await self.send_message(chat_id, reply)


async def bale_main():
    print('bale-qabot is ready!')
    while True:
        last_req_id = await bale_bot.get_latest_req_id()
        update = await bale_bot.get_updates(last_req_id)
        if update:
            await bale_bot.save_entry(update)
            await bale_bot.handle_update(update)
            print('ok')


# ===========================
# FastAPI application setup
# ===========================
# app = FastAPI()
bale_bot = BaleBot(TOKEN)

# ===========================
# (local execution)
# ===========================
if __name__ == "__main__":
    # uvicorn.run("chatbot:app", host="https://bale.tavasi.ir/", port=5000)
    asyncio.run(bale_main())
@@ -1,2 +0,0 @@
sudo docker build -t docker.tavasi.ir/tavasi/qachat_base:1.0.0 -f dockerfile_base .
sudo docker build -t docker.tavasi.ir/tavasi/qachat:1.0.0 .

_old/chatbot.py (deleted, 117 lines)
@@ -1,117 +0,0 @@
import json
import chatbot_handler as chatbot_handler
# import bale_qabot
import os
import numpy as np
import torch
import faiss
from typing import List, Tuple
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import datetime
import re
import random
from fastapi.middleware.cors import CORSMiddleware
from embedder_sbert_qavanin_285k import PersianVectorAnalyzer
# from normalizer import cleaning
from fastapi import FastAPI, Header
from pydantic import BaseModel
# LLM libs
from openai import OpenAI
from langchain_openai import ChatOpenAI  # pip install -U langchain_openai
import requests
from FlagEmbedding import FlagReranker  # deldar-reranker-v2
import aiofiles

chatbot = FastAPI()
origins = ["*"]

chatbot.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

print('#' * 19)
print('-Chatbot is Ready-')
print('#' * 19)

# Data model for API requests
class Query(BaseModel):
    query: str

# API routes (health checks and chat)
@chatbot.get("/")
async def simple():
    return "ai rag chat qanon OK"

@chatbot.get("/ping")
async def ping():
    return "ai rag chat qanon OK"


@chatbot.post("/emergency_call")
async def emergency_call(query: Query):
    print('emergency generate answer ...')
    chat_id = await chatbot_handler.create_chat_id()
    print('emergency chat_id ...', chat_id)
    answer = await chatbot_handler.ask_chatbot_avalai(query.query, chat_id)
    print('emergency answer ...', answer)
    await chatbot_handler.credit_refresh()
    print('credit updated')
    return {"answer": answer}

@chatbot.post("/run_chat")
async def run_chat(query: Query):
    print('regular generate answer ...')
    chat_id = await chatbot_handler.create_chat_id()
    answer = await chatbot_handler.ask_chatbot(query.query, chat_id)
    await chatbot_handler.credit_refresh()

    return {"answer": answer}

# uvicorn src.app:app --reload

if __name__ == "__main__":

    # query = 'در قانون حمایت از خانواده و جوانی جمعیت چه خدماتی در نظر گرفته شده است؟'
    while True:
        query = input('enter your question: ')
        if query == '':
            print('لطفا متن سوال را وارد نمائید')
            continue
        start = datetime.datetime.now()
        # result = test_dataset()
        result = chatbot_handler.single_query(query)
        end_retrive = datetime.datetime.now()
        print('-' * 40)
        print(f'retrieve duration: {(end_retrive - start).total_seconds()}')

        prompt = f'برای پرسش "{query}" از میان مواد قانونی "{result}" پاسخ مناسب و دقیق را استخراج کن. درصورتی که مطلبی مرتبط با پرسش در متن پیدا نشد، فقط پاسخ بده: "متاسفانه در منابع، پاسخی پیدا نشد!"'
        llm_answer = chatbot_handler.llm_request(prompt)

        print('-' * 40)
        print(f'llm duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')

        refrences = ''
        recognized_refrences = chatbot_handler.find_refrences(llm_answer)
        llm_answer = chatbot_handler.replace_refrences(llm_answer, recognized_refrences)

        with open('./llm-answer/result.txt', mode='a+', encoding='utf-8') as file:
            result_message = f'متن پرامپت: {query.strip()}\n\nپاسخ: {llm_answer}\n----------------------------------------------------------\n'
            file.write(result_message)

        with open('./llm-answer/passages.txt', mode='a+', encoding='utf-8') as file:
            result_message = f'متن پرامپت: {query.strip()}\n\nمواد مشابه: {result}\n----------------------------------------------------------\n'
            file.write(result_message)

        print('----------------------------------------------------------')
        print(f'full duration: {(datetime.datetime.now() - start).total_seconds()}')
        print('----------------------------------------------------------')
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
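For reference, a hypothetical client call against the /run_chat route defined above; the host and port are assumptions (the dockerfile further down binds uvicorn to 0.0.0.0:80):

# Hypothetical client sketch, not part of the repo.
import requests

resp = requests.post("http://localhost:80/run_chat",
                     json={"query": "در قانون حمایت از خانواده چه خدماتی در نظر گرفته شده است؟"})
print(resp.json()["answer"])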
@@ -1,974 +0,0 @@
import json
import os
import asyncio
import numpy as np
import torch
import faiss
from typing import List, Tuple
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import datetime
import re
import random
from fastapi.middleware.cors import CORSMiddleware
from embedder_sbert_qavanin_285k import PersianVectorAnalyzer
# from normalizer import cleaning
from fastapi import FastAPI, Header
from pydantic import BaseModel
# LLM libs
from openai import OpenAI
from langchain_openai import ChatOpenAI  # pip install -U langchain_openai
import requests
# from FlagEmbedding import FlagReranker  # deldar-reranker-v2
import aiofiles
import oss

# chatbot = FastAPI()
# origins = ["*"]

# chatbot.add_middleware(
#     CORSMiddleware,
#     allow_origins=origins,
#     allow_credentials=True,
#     allow_methods=["*"],
#     allow_headers=["*"],
# )

# -------------------
# Models and data paths
# -------------------
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
RERANKER_MODEL = "BAAI/bge-reranker-v2-m3"
FAISS_INDEX_PATH = "/src/app/qavanin-faiss/faiss_index_qavanin_285k.index"
FAISS_METADATA_PATH = "/src/app/qavanin-faiss/faiss_index_qavanin_285k_metadata.json"

RERANK_BATCH = int(os.environ.get("RERANK_BATCH", 256))
# print(f'RERANK_BATCH: {RERANK_BATCH}')
determine_refrence = '''شناسه هر ماده قانونی در ابتدای آن و با فرمت "id: {idvalue}" آمده است که id-value همان شناسه ماده است. بازای هربخش از پاسخی که تولید می شود، ضروری است شناسه ماده ای که در ایجاد پاسخ از آن استفاده شده، در انتهای پاراگراف یا جمله مربوطه با فرمت {idvalue} اضافه شود. همیشه idvalue با رشته "qs" شروع می شود'''
messages = [
    # {
    #     "role": "system",
    #     "content": "تو یک دستیار خبره در زمینه حقوق و قوانین مرتبط به آن هستی و می توانی متون حقوقی را به صورت دقیق توضیح بدهی . پاسخ ها باید الزاما به زبان فارسی باشد. پاسخ ها فقط از متون قانونی که در پرامپت وجود دارد استخراج شود. پاسخ تولید شده باید کاملا ساده و بدون هیچ مارک داون یا علائم افزوده ای باشد. لحن متن باید رسمی باشد.",
    # },
    {"role": "developer", "content": determine_refrence},
]

models = ["gpt-4o-mini", "gemini-2.5-flash-lite", "deepseek-chat"]
normalizer_obj = PersianVectorAnalyzer()
pipe = None
content_list, ids, prefix_list, faiss_index = [], [], [], []

async def get_key():
    key = 'aa-fdh9d847ANcBxQCBTZD5hrrAdl0UrPEnJOScYmOncrkagYPf'
    return key

def load_faiss_index(index_path: str, metadata_path: str):
    """Load the FAISS index and its metadata (contents, ids, prefixes)."""
    index = faiss.read_index(index_path)

    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    content_list, ids, prefix_list = [], [], []
    for item in metadata:
        content_list.append(item["content"])
        ids.append(item["id"])
        prefix_list.append(item["prefix"])

    return content_list, ids, prefix_list, index

async def get_client():
    url = "https://api.avalai.ir/v1"
    # key = 'aa-4tvAEazUBovEN1i7i7tdl1PR93OaWXs6hMflR4oQbIIA4K7Z'

    client = OpenAI(
        api_key=await get_key(),
        base_url=url,  # base URL
    )

    return client

async def llm_base_request(system_prompt, user_prompt):
    client = await get_client()
    base_messages = []
    try:
        if system_prompt:
            base_messages.append({
                "role": "system",
                "content": system_prompt
            })

        base_messages.append({
            "role": "user",
            "content": user_prompt
        })
        for model in models:
            response = client.chat.completions.create(
                messages=base_messages,
                model=model
            )
            answer = response.choices[0].message.content
            cost = response.estimated_cost['irt']
            break

    except Exception as error:
        # log the failure asynchronously
        async with aiofiles.open('./llm-answer/error-in-llm.txt', mode='a+', encoding='utf-8') as file:
            error_message = f'\n\nquery: {user_prompt.strip()}\nerror:{error} \n------------------------------\n'
            await file.write(error_message)

        return '', 0

    return answer, cost
async def llm_base_request2(system_prompt, user_prompt):
    # made async so get_client() (an async function) can be awaited;
    # the original called it synchronously, which returned a coroutine
    client = await get_client()
    base_messages = []
    try:
        if system_prompt:
            base_messages.append(system_prompt)
        base_messages.append({
            "role": "user",
            "content": user_prompt
        })
        for model in models:
            response = client.chat.completions.create(
                messages=base_messages,
                model=model)
            answer = response.choices[0].message.content
            cost = response.estimated_cost['irt']
            break

    except Exception as error:
        with open('./llm-answer/error-in-llm.txt', mode='a+', encoding='utf-8') as file:
            # the original referenced an undefined `query` here
            error_message = f'\n\nquery: {user_prompt.strip()}\nerror:{error} \n-------------------------------\n'
            file.write(error_message)

        return '', 0

    return answer, cost

async def oss_base_request(sys_prompt, user_prompt):
    base_messages = []
    try:
        if sys_prompt:
            base_messages.append({
                "role": "system",
                "content": sys_prompt
            })

        base_messages.append({
            "role": "user",
            "content": user_prompt
        })
        response = await oss.process_item(base_messages, reasoning_effort='low', temperature=0.1, max_tokens=40)

        if response[0]:
            answer = response[1]
        else:
            answer = ''
        cost = 0

    except Exception as error:
        # log the failure asynchronously
        async with aiofiles.open('./llm-answer/error-in-llm.txt', mode='a+', encoding='utf-8') as file:
            error_message = f'\n\nquery: {user_prompt.strip()}\nerror:{error} \n------------------------------\n'
            await file.write(error_message)

        return '', 0

    return answer, cost
async def oss_request(query):

    if query == '':
        return 'لطفا متن سوال را وارد نمائید', 0

    try:
        messages.append({"role": "user", "content": query})
        print('final prompt request attempt')
        response = await oss.process_item(messages=messages, reasoning_effort='low')  # reasoning_effort='high'
        print(response)
        if response[0]:
            answer = response[1]
        else:
            answer = 'متاسفانه پاسخی دریافت نشد'
        cost_prompt = 0
        # optionally append the answer to the chat history
        # messages.append({"role": "assistant", "content": answer})

        response_dict = {}
        response_dict['output'] = str(response)
        async with aiofiles.open('./llm-answer/messages.json', mode='w', encoding='utf-8') as output:
            await output.write(json.dumps(response_dict, ensure_ascii=False, indent=2))
        print('oss response created')
        async with aiofiles.open('./llm-answer/chat-objs.txt', mode='a+', encoding='utf-8') as file:
            response_value = '0'
            await file.write(response_value)  # estimated_cost

    except Exception as error:
        print('error-in-llm.txt writing ...')
        async with aiofiles.open('./llm-answer/error-in-llm.txt', mode='a+', encoding='utf-8') as file:
            error_message = f'\n\nquery: {query.strip()}\nerror:{error} \n-------------------------------\n'
            await file.write(error_message)

        return 'با عرض پوزش؛ متاسفانه خطایی رخ داده است. لطفا لحظاتی دیگر دوباره تلاش نمائید', 0
    print('================')
    print(f'len messages: {len(messages)}')
    print('================')
    return answer, cost_prompt

async def llm_request(query, model):

    if query == '':
        return 'لطفا متن سوال را وارد نمائید', 0

    client = await get_client()
    try:
        messages.append({"role": "user", "content": query})
        response = client.chat.completions.create(
            messages=messages,
            model=model,
            temperature=0.3)  # "gpt-4o", "gpt-4o-mini", "deepseek-chat", "gemini-2.0-flash", "gemini-2.5-flash-lite"
        # Observed rough cost per request (IRT): gpt-4o: 500, gpt-4o-mini: 34, deepseek-chat: 150,
        # gemini-2.0-flash: error, cf.gemma-3-12b-it: 1, gemini-2.5-flash-lite: 35 (works very well)

        answer = response.choices[0].message.content
        cost_prompt = response.estimated_cost['irt']
        # optionally append the answer to the chat history
        # messages.append({"role": "assistant", "content": answer})

        response_dict = {}
        response_dict['output'] = str(response)
        async with aiofiles.open('./llm-answer/messages.json', mode='w', encoding='utf-8') as output:
            await output.write(json.dumps(response_dict, ensure_ascii=False, indent=2))
        print('llm response created')
        async with aiofiles.open('./llm-answer/chat-objs.txt', mode='a+', encoding='utf-8') as file:
            response_value = f"{response.estimated_cost['irt']}\n-------------------------------\n\n"
            await file.write(response_value)  # estimated_cost

    except Exception as error:
        print('error-in-llm.txt writing ...')
        async with aiofiles.open('./llm-answer/error-in-llm.txt', mode='a+', encoding='utf-8') as file:
            error_message = f'\n\nquery: {query.strip()}\nerror:{error} \n-------------------------------\n'
            await file.write(error_message)

        return 'با عرض پوزش؛ متاسفانه خطایی رخ داده است. لطفا لحظاتی دیگر دوباره تلاش نمائید', 0
    print('================')
    print(f'len messages: {len(messages)}')
    print('================')
    return answer, cost_prompt
class HybridRetrieverReranker:

    __slots__ = (
        "device", "content_list", "ids", "prefix_list", "N", "embedder", "faiss_index",
        "vectorizer", "tfidf_matrix", "tokenizer", "reranker", "dense_alpha"
    )

    def __init__(self, content_list: List[str], ids: List[str], prefix_list: List[str], faiss_index,
                 dense_alpha: float = 0.6, device: str = None):

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        self.content_list = content_list
        self.ids = ids
        self.prefix_list = prefix_list
        self.faiss_index = faiss_index
        self.N = len(content_list)

        # Dense
        self.embedder = SentenceTransformer(EMBED_MODEL, cache_folder='/src/MODELS', device=self.device)
        # self.embedder = SentenceTransformer(EMBED_MODEL, device=self.device)

        # Sparse (kept as before to preserve the output)
        self.vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            token_pattern=r"(?u)\b[\w\u0600-\u06FF]{2,}\b",
        )
        self.tfidf_matrix = self.vectorizer.fit_transform(self.content_list)

        # Reranker
        self.tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL, cache_dir='/src/MODELS', use_fast=True)
        # self.reranker = FlagReranker(RERANKER_MODEL, cache_dir="/src/MODELS", use_fp16=True)
        self.reranker = AutoModelForSequenceClassification.from_pretrained(
            RERANKER_MODEL
        ).to(self.device)

        self.dense_alpha = float(dense_alpha)

    # --- Dense (FAISS) ---
    def dense_retrieve(self, query: str, top_k: int):
        if top_k <= 0:
            return [], np.array([], dtype=np.float32)

        q_emb = self.embedder.encode(query, convert_to_numpy=True).astype(np.float32)
        D, I = self.faiss_index.search(np.expand_dims(q_emb, axis=0), top_k)

        return I[0].tolist(), D[0]

    # --- Sparse ---
    def sparse_retrieve(self, query: str, top_k: int):
        if top_k <= 0:
            return [], np.array([], dtype=np.float32)
        k = min(top_k, self.N)
        q_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(q_vec, self.tfidf_matrix).ravel()
        idx = np.argpartition(-sims, kth=k-1)[:k]
        idx = idx[np.argsort(-sims[idx], kind="mergesort")]
        return idx.tolist(), sims[idx]

    # --- Utils ---
    @staticmethod
    def _minmax_norm(arr: np.ndarray) -> np.ndarray:
        if arr.size == 0:
            return arr
        a_min = arr.min()
        a_max = arr.max()
        rng = a_max - a_min
        if rng < 1e-12:
            return np.zeros_like(arr)
        return (arr - a_min) / rng

    def fuse(self, d_idx, d_scores, s_idx, s_scores, top_k=50, k_rrf=60):
        """
        Merge the results of the dense and sparse retrievers with Reciprocal Rank Fusion (RRF).

        Args:
            d_idx (list or np.ndarray): result indices from the dense retriever
            d_scores (list or np.ndarray): dense retriever scores
            s_idx (list or np.ndarray): result indices from the sparse retriever
            s_scores (list or np.ndarray): sparse retriever scores
            top_k (int): number of final results
            k_rrf (int): RRF constant damping the influence of lower ranks

        Returns:
            list: fused indices ordered by score
        """
        combined = {}

        # dense retriever
        for rank, idx in enumerate(d_idx):
            score = 1.0 / (k_rrf + rank)
            combined[idx] = combined.get(idx, 0) + score

        # sparse retriever
        for rank, idx in enumerate(s_idx):
            score = 1.0 / (k_rrf + rank)
            combined[idx] = combined.get(idx, 0) + score

        # final sort
        sorted_items = sorted(combined.items(), key=lambda x: x[1], reverse=True)
        cand_idx = [item[0] for item in sorted_items[:top_k]]

        return cand_idx

    def rerank2(self, query: str, candidate_indices: List[int], passages: List[str], final_k: int = 4):
        # NOTE: compute_score exists on the FlagReranker variant (commented out in __init__),
        # not on the AutoModelForSequenceClassification reranker currently loaded.
        z_results = [[query, sentence] for sentence in passages]
        # normalize=True maps the scores into 0-1 by applying a sigmoid
        scores = self.reranker.compute_score(z_results, normalize=True)
        s_results = sorted(zip(scores, z_results, candidate_indices), key=lambda x: x[0], reverse=True)
        s_results2 = s_results[:final_k]
        results = [[i[0], i[1][1], i[2]] for i in s_results2]
        print('%' * 50)
        print(results)
        with open('./llm-answer/reranker-result.txt', mode='a+', encoding='utf-8') as file:
            for item in results:
                file.write(f'{item}\n')
        print('%' * 50)
        return results

    def rerank(self, query: str, candidate_indices: List[int], passages: List[str], final_k: int) -> List[Tuple[int, float]]:
        """
        Rerank candidate passages using a cross-encoder (e.g., MonoT5, MiniLM, etc.).

        Args:
            query (str): the user query
            candidate_indices (List[int]): candidate indices (from the retriever)
            passages (List[str]): the candidate sentences/paragraphs
            final_k (int): number of final results

        Returns:
            List[Tuple[int, float]]: (index, score) pairs for the best results
        """
        if final_k <= 0 or not candidate_indices:
            return []

        # build (query, passage) pairs
        texts = [query] * len(candidate_indices)
        pairs = passages

        scores: List[float] = []

        def _iter_batches(max_bs: int):
            bs = max_bs
            while bs >= 16:  # minimum batch size
                try:
                    with torch.inference_mode():
                        for start in range(0, len(pairs), bs):
                            batch_texts = texts[start:start + bs]
                            batch_pairs = pairs[start:start + bs]
                            inputs = self.tokenizer(
                                batch_texts,
                                batch_pairs,
                                padding=True,
                                truncation=True,
                                max_length=512,
                                return_tensors="pt",
                            ).to(self.device)

                            logits = self.reranker(**inputs).logits.view(-1)
                            scores.extend(logits.detach().cpu().tolist())
                    return True
                except torch.cuda.OutOfMemoryError:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    scores.clear()  # drop partial results before retrying with a smaller batch
                    bs //= 2
            return False

        # run the reranking
        success = _iter_batches(max_bs=64)
        if not success:
            raise RuntimeError("Reranker failed due to CUDA OOM, even with small batch size.")

        # sort the results by score
        reranked = sorted(
            zip(candidate_indices, scores),
            key=lambda x: x[1],
            reverse=True
        )[:final_k]

        return reranked
    def get_passages(self, cand_idx, content_list):
        passages = []
        for idx in cand_idx:
            passages.append(content_list[idx])

        return passages

    # --- Search (unchanged) ---
    def search(self, query: str, content_list, topk_dense=50, topk_sparse=50,
               pre_rerank_k=50, final_k=10):
        start_time = datetime.datetime.now()

        # from the embedder model
        d_idx, d_scores = self.dense_retrieve(query, topk_dense)
        dense_retrieve_end = datetime.datetime.now()
        # print(f'dense_retrieve_duration: {(dense_retrieve_end - start_time).total_seconds()}')

        # from the tfidf_matrix
        s_idx, s_scores = self.sparse_retrieve(query, topk_sparse)
        sparse_retrieve_end = datetime.datetime.now()
        # print(f'sparse_retrieve_duration: {(sparse_retrieve_end - dense_retrieve_end).total_seconds()}')
        cand_idx = self.fuse(d_idx, d_scores, s_idx, s_scores, pre_rerank_k)
        fuse_end = datetime.datetime.now()
        # print(f'fuse_duration: {(fuse_end - sparse_retrieve_end).total_seconds()}')
        passages = self.get_passages(cand_idx, content_list)
        get_passages_end = datetime.datetime.now()
        # print(f'get_passages_duration: {(get_passages_end - fuse_end).total_seconds()}')
        reranked = self.rerank(query, cand_idx, passages, final_k)  # rerank2
        rerank_end = datetime.datetime.now()
        # print(f'rerank_duration: {(rerank_end - get_passages_end).total_seconds()}')
        return [{"idx": i, "content": self.content_list[i], "prefix": self.prefix_list[i], "rerank_score": score}
                for i, score in reranked]


async def single_query(query: str):

    # query = cleaning(query)
    retrived_sections_ids = []

    retrived_sections = pipe.search(query, content_list, topk_dense=100, topk_sparse=100, pre_rerank_k=100, final_k=10)
    final_similars = ''
    for i, row in enumerate(retrived_sections, 1):
        id_value = '{' + str(ids[row['idx']]) + '}'
        result = f"id: {id_value} \n{row['prefix']} {row['content']}\n"
        retrived_sections_ids.append(ids[row['idx']])
        final_similars += ''.join(result)

    return final_similars, retrived_sections_ids


async def find_refrences(llm_answer: str) -> List[str]:
    """
    Identify the section ids the language model used to build its answer.

    Args:
        llm_answer (str): the text produced by the language model

    Returns:
        refrence_ids (List[str]): the list of recognized ids
    """
    pattern = r"\{[^\}]+\}"
    # pattern = r"(?:\{([^\}]+)\}|【([^】]+)】)"
    refrence_ids = re.findall(pattern, llm_answer)
    refrence_ids = [item.lstrip('{').rstrip('}') for item in refrence_ids]

    return refrence_ids
async def replace_refrences(llm_answer: str, refrences_list: List[str]) -> str:
    """
    Replace the section ids used in the LLM answer with numbered reference links.

    Args:
        llm_answer (str): the text produced by the language model
        refrences_list (List[str]): the section ids used in the LLM answer
    Returns:
        llm_answer (str): the rebuilt answer with the used section ids rewritten as links
    """
    # refrences = ''
    for index, ref in enumerate(refrences_list, 1):
        new_ref = '{' + str(ref) + '}'
        llm_answer = llm_answer.replace(new_ref, f' [«{str(index)}»](https://majles.tavasi.ir/entity/detail/view/qsection/{ref}) ')
        # id = ref.lstrip('{').rstrip('}')
        # refrences += ''.join(f'[{index}] https://majles.tavasi.ir/entity/detail/view/qsection/{id}\n')

    # llm_answer = f'{llm_answer}\n\nمنابع پاسخ:\n{refrences.strip()}'
    return llm_answer.strip()


def initial_model():
    global pipe
    global content_list, ids, prefix_list, faiss_index

    if not pipe:
        # load basic items
        content_list, ids, prefix_list, faiss_index = load_faiss_index(FAISS_INDEX_PATH, FAISS_METADATA_PATH)
        pipe = HybridRetrieverReranker(content_list, ids, prefix_list, faiss_index, dense_alpha=0.6)
        # query preprocess and normalize


def save_result(chat_obj: object) -> bool:
    # index result in elastic
    pass


async def get_title_user_prompt(query: str):
    """
    Get a query and prepare a prompt that generates a title for it.
    """
    title_prompt = f'برای متن {query} یک عنوان با معنا که بین 3 تا 6 کلمه داشته باشد، در قالب یک رشته متن ایجاد کن. سبک و لحن عنوان، حقوقی و کاملا رسمی باشد. عنوان تولید شده کاملا ساده و بدون هیچ مارک داون یا علائم افزوده ای باشد. غیر از عنوان، به هیچ وجه توضیح اضافه ای در قبل یا بعد آن اضافه نکن.'
    return title_prompt


async def get_title_system_prompt():
    """
    Return the system prompt used for title generation.
    """
    title_system_prompt = 'تو یک دستیار حقوقی هستی و می توانی متون و سوالات حقوقی را به زبان ساده و دقیق توضیح بدهی.'
    return title_system_prompt
async def ask_chatbot_avalai(query: str, chat_id: str):
    print('ask avalai func')
    prompt_status = True
    llm_model = ''
    llm_answer = ''
    cost_prompt = 0
    cost_title = 0
    status_text = 'لطفا متن سوال را وارد نمائید'
    if query == '':
        prompt_status = False

    # proceed only when the prompt status is valid
    if prompt_status:

        before_title_time = datetime.datetime.now()
        title_system_prompt = await get_title_system_prompt()
        title_user_prompt = await get_title_user_prompt(query)
        title, cost_title = await llm_base_request(title_system_prompt, title_user_prompt)
        # title, cost_title = await oss_base_request(title_system_prompt, title_user_prompt)
        if not title:
            title = query

        title_prompt_duration = (datetime.datetime.now() - before_title_time).total_seconds()

        if title == '':
            title = query.split()[0:10]

        start_time = datetime.datetime.now()
        result_passages_text, result_passages_ids = await single_query(query)
        end_retrive = datetime.datetime.now()
        print('-' * 40)
        print(f'title_prompt_duration: {title_prompt_duration}')
        retrive_duration = (end_retrive - start_time).total_seconds()
        print(f'retrieve duration: {str(retrive_duration)}')

        prompt = f'''برای پرسش "{query}" از میان متون قانونی زیر، پاسخ مناسب و دقیق را استخراج کن.
متون قانونی:
"{result_passages_text}"
'''

        for model in models:
            before_prompt_credit = await credit_refresh()
            llm_model = model
            print(f'using model: {model}')
            try:
                llm_answer, cost_prompt = await llm_request(prompt, model)
                # llm_answer, cost_prompt = await oss_request(prompt)
                break
            except Exception as error:
                print(f'error in ask-chatbot-avalai model:{model}')
                after_prompt_credit = await credit_refresh()
                prompt_cost = int(before_prompt_credit) - int(after_prompt_credit)
                error = f'model: {model} \n{error}\n\n'
                print('+++++++++++++++++')
                print(f'llm-error.txt writing error: {error}')
                print('+++++++++++++++++')
                async with aiofiles.open('./llm-answer/llm-error.txt', mode='a+', encoding='utf-8') as file:
                    await file.write(error)
                prompt_status = False
                status_text = 'با عرض پوزش، سرویس موقتا در دسترس نیست. لطفا دقایقی دیگر دوباره تلاش نمائید!'
                continue

    # when the prompt status is invalid, return an object with the values below
    else:
        chat_obj = {
            'id': chat_id,                                    # str
            'title': '',                                      # str
            'user_id': '',
            'user_query': query,                              # str
            'model_key': llm_model,                           # str
            'retrived_passage': '',                           # str
            'retrived_ref_ids': '',                           # list[obj]
            'prompt_type': 'question-answer',                 # str
            'retrived_duration': '',                          # str
            'llm_duration': '0',                              # str
            'full_duration': '0',                             # str
            'cost_prompt': str(cost_prompt),                  # str
            'cost_title': str(cost_title),                    # str
            'cost_total': str(cost_prompt + cost_title),      # str
            'time_create': str(datetime.datetime.now()),      # str; start_time is unset on this path
            'used_ref_ids': [],                               # list[str]
            'prompt_answer': '',                              # str
            'status_text': status_text,
            'status': prompt_status,                          # bool
        }

        # return the constructed object
        return chat_obj, status_text

    llm_answer_duration = (datetime.datetime.now() - end_retrive).total_seconds()
    print(f'llm answer duration: {str(llm_answer_duration)}')

    used_refrences_in_answer = await find_refrences(llm_answer)
    llm_answer = await replace_refrences(llm_answer, used_refrences_in_answer)

    full_prompt_duration = (datetime.datetime.now() - start_time).total_seconds()
    print(f'full prompt duration: {full_prompt_duration}')
    print('~' * 40)

    status_text = 'پاسخ با موفقیت ایجاد شد'

    print(f'cost_prompt: {cost_prompt}')
    print(f'cost_title: {cost_title}')
    chat_obj = {
        'id': chat_id,                                    # str
        'title': title,                                   # str
        'user_id': '',
        'user_query': query,                              # str
        'model_key': llm_model,                           # str
        'retrived_passage': result_passages_text,         # str
        'retrived_ref_ids': result_passages_ids,          # list[obj]
        'prompt_type': 'question-answer',                 # str
        'retrived_duration': retrive_duration,            # str
        'llm_duration': llm_answer_duration,              # str
        'full_duration': full_prompt_duration,            # str
        'cost_prompt': str(cost_prompt),                  # str
        'cost_title': str(cost_title),                    # str
        'cost_total': str(cost_prompt + cost_title),      # str
        'time_create': str(start_time),                   # str
        'used_ref_ids': used_refrences_in_answer,         # list[str]
        'prompt_answer': llm_answer,                      # str
        'status_text': status_text,                       # str
        'status': True,                                   # bool
    }
    prev_chat_data = []
    number = 1
    try:
        async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='r', encoding='utf-8') as file:
            content = await file.read()
            prev_chat_data = json.loads(content)
            prev_chat_data.append(chat_obj)
    except Exception:
        number += 1
        prev_chat_data.append(chat_obj)

    async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='w', encoding='utf-8') as output:
        await output.write(json.dumps(prev_chat_data, ensure_ascii=False, indent=2))

    async with aiofiles.open(f'./llm-answer/chat-messages-answer{number}.txt', mode='a+', encoding='utf-8') as output:
        await output.write(f'{chat_obj}\n+++++++++++++++++++++++++++\n')

    # save_result(chat_obj)

    # build the object returned to the front end
    # chat_obj.pop('retrived_passage')
    # chat_obj.pop('prompt_type')

    print('~' * 40)

    return chat_obj
async def ask_chatbot(query: str, chat_id: str):
    print('ask oss func')
    prompt_status = True
    llm_model = 'gpt.oss.120b'
    llm_answer = ''
    cost_prompt = 0
    cost_title = 0
    status_text = 'لطفا متن سوال را وارد نمائید'
    if query == '':
        prompt_status = False

    # proceed only when the prompt status is valid
    if prompt_status:

        before_title_time = datetime.datetime.now()
        title_system_prompt = await get_title_system_prompt()
        title_user_prompt = await get_title_user_prompt(query)
        title = ''
        # title, cost_title = await llm_base_request(title_system_prompt, title_user_prompt)
        # title, cost_title = await oss_base_request(title_system_prompt, title_user_prompt)
        if not title:
            title = query

        title_prompt_duration = (datetime.datetime.now() - before_title_time).total_seconds()
        print('-' * 40)
        print(f'title_prompt_duration: {title_prompt_duration}')

        if title == '':
            title = query.split()[0:10]

        start_time = datetime.datetime.now()
        result_passages_text, result_passages_ids = await single_query(query)
        end_retrive = datetime.datetime.now()
        retrive_duration = (end_retrive - start_time).total_seconds()
        print(f'retrieve duration: {str(retrive_duration)}')

        prompt = f'''برای پرسش "{query}" از میان متون قانونی زیر، پاسخ مناسب و دقیق را استخراج کن.
متون قانونی:
"{result_passages_text}"
'''

        # for model in models:
        #     before_prompt_credit = credit_refresh()
        try:
            # llm_model = model
            # print(f'using model: {llm_model}')
            # llm_answer, cost_prompt = await llm_request(prompt, model)
            llm_answer, cost_prompt = await oss_request(prompt)

        except Exception as error:
            # after_prompt_credit = credit_refresh()
            # prompt_cost = int(before_prompt_credit) - int(after_prompt_credit)
            error = f'model: gpt.oss.120b \n{error}\n\n'
            print('+++++++++++++++++')
            print(f'llm-error.txt writing error: {error}')
            print('+++++++++++++++++')
            async with aiofiles.open('./llm-answer/llm-error.txt', mode='a+', encoding='utf-8') as file:
                await file.write(error)
            prompt_status = False
            status_text = 'با عرض پوزش، سرویس موقتا در دسترس نیست. لطفا دقایقی دیگر دوباره تلاش نمائید!'

    # when the prompt status is invalid, return an object with the values below
    else:
        chat_obj = {
            'id': chat_id,                                    # str
            'title': '',                                      # str
            'user_id': '',
            'user_query': query,                              # str
            'model_key': llm_model,                           # str
            'retrived_passage': '',                           # str
            'retrived_ref_ids': '',                           # list[obj]
            'prompt_type': 'question-answer',                 # str
            'retrived_duration': '',                          # str
            'llm_duration': '0',                              # str
            'full_duration': '0',                             # str
            'cost_prompt': str(cost_prompt),                  # str
            'cost_title': str(cost_title),                    # str
            'cost_total': str(cost_prompt + cost_title),      # str
            'time_create': str(datetime.datetime.now()),      # str; start_time is unset on this path
            'used_ref_ids': [],                               # list[str]
            'prompt_answer': '',                              # str
            'status_text': status_text,
            'status': prompt_status,                          # bool
        }

        # return the constructed object
        return chat_obj, status_text

    llm_answer_duration = (datetime.datetime.now() - end_retrive).total_seconds()
    print(f'llm answer duration: {str(llm_answer_duration)}')

    llm_answer = llm_answer.replace('【', '{')
    llm_answer = llm_answer.replace('】', '}')
    used_refrences_in_answer = await find_refrences(llm_answer)
    llm_answer = await replace_refrences(llm_answer, used_refrences_in_answer)

    full_prompt_duration = (datetime.datetime.now() - start_time).total_seconds()
    print(f'full prompt duration: {full_prompt_duration}')
    print('~' * 40)

    status_text = 'پاسخ با موفقیت ایجاد شد'

    print(f'cost_prompt: {cost_prompt}')
    print(f'cost_title: {cost_title}')
    chat_obj = {
        'id': chat_id,                                    # str
        'title': title,                                   # str
        'user_id': '',
        'user_query': query,                              # str
        'model_key': llm_model,                           # str
        'retrived_passage': result_passages_text,         # str
        'retrived_ref_ids': result_passages_ids,          # list[obj]
        'prompt_type': 'question-answer',                 # str
        'retrived_duration': retrive_duration,            # str
        'llm_duration': llm_answer_duration,              # str
        'full_duration': full_prompt_duration,            # str
        'cost_prompt': str(cost_prompt),                  # str
        'cost_title': str(cost_title),                    # str
        'cost_total': str(cost_prompt + cost_title),      # str
        'time_create': str(start_time),                   # str
        'used_ref_ids': used_refrences_in_answer,         # list[str]
        'prompt_answer': llm_answer,                      # str
        'status_text': status_text,                       # str
        'status': True,                                   # bool
    }

    prev_chat_data = []
    number = 1
    try:
        async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='r', encoding='utf-8') as file:
            content = await file.read()
            prev_chat_data = json.loads(content)
            prev_chat_data.append(chat_obj)
    except Exception:
        number += 1
        prev_chat_data.append(chat_obj)

    async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='w', encoding='utf-8') as output:
        await output.write(json.dumps(prev_chat_data, ensure_ascii=False, indent=2))

    # async with aiofiles.open(f'./llm-answer/chat-messages-answer{number}.txt', mode='a+', encoding='utf-8') as output:
    #     await output.write(f'{chat_obj}\n+++++++++++++++++++++++++++\n')

    full_prompt_duration = (datetime.datetime.now() - start_time).total_seconds()
    print(f'aiofiles duration: {full_prompt_duration}')
    print('~' * 40)

    # save_result(chat_obj)

    # build the object returned to the front end
    # chat_obj.pop('retrived_passage')
    # chat_obj.pop('prompt_type')

    print('~' * 40)

    return chat_obj
async def credit_refresh():
    """
    Return the remaining credit.
    """
    url = "https://api.avalai.ir/user/credit"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {await get_key()}"
    }
    remained_credit = requests.get(url, headers=headers)
    remained_credit_value = str(remained_credit.json()['remaining_irt'])
    print('writing credit')
    async with aiofiles.open('./llm-answer/credit.txt', mode='a+', encoding='utf-8') as file:
        await file.write(f'{remained_credit_value}\n')

    return remained_credit_value


async def create_chat_id():
    date = str(datetime.datetime.now()).replace(' ', '-').replace(':', '').replace('.', '-')

    chat_id = f'{date}-{random.randint(100000, 999999)}'

    return chat_id


# Data model for API requests
class Query(BaseModel):
    query: str


initial_model()
# uvicorn src.app:app --reload

if __name__ == "__main__":

    # query = 'در قانون حمایت از خانواده و جوانی جمعیت چه خدماتی در نظر گرفته شده است؟'
    while True:
        query = input('enter your question: ')
        if query == '':
            print('لطفا متن سوال را وارد نمائید')
            continue
        start = datetime.datetime.now()
        # result = test_dataset()
        # the helpers are async, so this ad-hoc test loop drives them with asyncio.run;
        # models[0] is an arbitrary choice for the missing model argument
        result, result_ids = asyncio.run(single_query(query))
        end_retrive = datetime.datetime.now()
        print('-' * 40)
        print(f'retrieve duration: {(end_retrive - start).total_seconds()}')

        prompt = f'برای پرسش "{query}" از میان مواد قانونی "{result}" پاسخ مناسب و دقیق را استخراج کن. درصورتی که مطلبی مرتبط با پرسش در متن پیدا نشد، فقط پاسخ بده: "متاسفانه در منابع، پاسخی پیدا نشد!"'
        llm_answer, _cost = asyncio.run(llm_request(prompt, models[0]))

        print('-' * 40)
        print(f'llm duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')

        refrences = ''
        recognized_refrences = asyncio.run(find_refrences(llm_answer))
        llm_answer = asyncio.run(replace_refrences(llm_answer, recognized_refrences))

        print('-' * 40)
        print(f'replace_refrences duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')

        with open('./llm-answer/result.txt', mode='a+', encoding='utf-8') as file:
            result_message = f'متن پرامپت: {query.strip()}\n\nپاسخ: {llm_answer}\n----------------------------------------------------------\n'
            file.write(result_message)

        with open('./llm-answer/passages.txt', mode='a+', encoding='utf-8') as file:
            result_message = f'متن پرامپت: {query.strip()}\n\nمواد مشابه: {result}\n----------------------------------------------------------\n'
            file.write(result_message)

        print('-' * 40)
        print(f'file write duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')

        print('----------------------------------------------------------')
        print(f'full duration: {(datetime.datetime.now() - start).total_seconds()}')
        print('----------------------------------------------------------')
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
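The fuse method above scores each candidate purely by its rank in each list. A minimal standalone sketch of the same reciprocal-rank-fusion arithmetic, with made-up toy rankings for illustration:

# Toy RRF walkthrough (illustrative index lists; k_rrf=60 as in fuse above).
dense_idx = [7, 3, 9]    # hypothetical dense ranking, best first
sparse_idx = [3, 5, 7]   # hypothetical sparse ranking, best first

combined = {}
for ranking in (dense_idx, sparse_idx):
    for rank, idx in enumerate(ranking):
        combined[idx] = combined.get(idx, 0) + 1.0 / (60 + rank)

# doc 3: 1/61 + 1/60 = 0.03306 beats doc 7: 1/60 + 1/62 = 0.03280,
# since ranking near the top of both lists outweighs a single first place.
print(sorted(combined.items(), key=lambda x: x[1], reverse=True))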
@@ -1,72 +0,0 @@
import json
import numpy as np
import faiss
import os

def create_faiss_index_from_json(json_file_path, faiss_index_path, metadata_file_path):
    print(f'try to read {json_file_path} ...')
    # --- 1. Load the data from JSON ---
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print('file reading finished')

    # Each value is expected to contain the fields used below:
    # {
    #   "id": "qs...",
    #   "content": "متن جمله",
    #   "embeddings": [0.12, 0.34, ...],
    #   "section-prefix": "..."
    # }

    sentences = []
    titles = []
    embeddings_list = []
    prefix_list = []
    for k, item in data.items():
        sentences.append(item['content'])
        titles.append(item['id'])
        embeddings_list.append(item['embeddings'])
        prefix_list.append(item['section-prefix'])

    embeddings = np.array(embeddings_list).astype('float32')  # shape: (n, d)
    dimension = embeddings.shape[1]

    print(f"Loaded {len(embeddings)} embeddings with dimension {dimension}")

    # --- 2. Build the FAISS index (GPU-aware) ---
    # With CPU only, faiss.IndexFlatL2 is used directly.
    # With a GPU, the index is built on CPU and then moved to the GPU.
    cpu_index = faiss.IndexFlatL2(dimension)  # L2 (Euclidean) distance

    # move the index to GPU if available
    if faiss.get_num_gpus() > 0:
        print("Using GPU for FAISS index...")
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
    else:
        print("GPU not available, using CPU.")
        gpu_index = cpu_index

    # --- 3. Add the vectors to the index ---
    gpu_index.add(embeddings)
    print(f"Total vectors indexed: {gpu_index.ntotal}")

    # --- 4. Save the index to a file ---
    # it must be moved back to CPU before saving
    final_index = faiss.index_gpu_to_cpu(gpu_index) if isinstance(gpu_index, faiss.Index) and faiss.get_num_gpus() > 0 else gpu_index
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    faiss.write_index(final_index, faiss_index_path)
    print(f"FAISS index saved to {faiss_index_path}")

    # --- 5. Save the metadata (to map search results back to texts) ---
    metadata = [{"id": id, "content": c, 'prefix': p} for id, c, p in zip(titles, sentences, prefix_list)]
    with open(metadata_file_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
    print(f"Metadata saved to {metadata_file_path}")

if __name__ == '__main__':
    # example usage
    json_file_path = './majles-output/sections-vec-285k.json'
    faiss_index_path = './qavanin-faiss/faiss_index_qavanin_285k.index'
    metadata_file_path = './qavanin-faiss/faiss_index_qavanin_285k_metadata.json'

    create_faiss_index_from_json(json_file_path, faiss_index_path, metadata_file_path)
|
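A quick sanity-check sketch, not part of the original script: reload the artifacts written above and confirm the index and metadata line up. The paths match the __main__ block; everything else here is illustrative.

import json
import faiss

index = faiss.read_index('./qavanin-faiss/faiss_index_qavanin_285k.index')
with open('./qavanin-faiss/faiss_index_qavanin_285k_metadata.json', encoding='utf-8') as f:
    metadata = json.load(f)

# Every stored vector should have exactly one metadata row, in the same order.
assert index.ntotal == len(metadata)
print('vectors:', index.ntotal, '| dimension:', index.d)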
@ -1,10 +0,0 @@
FROM docker.tavasi.ir/tavasi/qachat_base:1.0.0

WORKDIR /src/app

COPY . /src/app

EXPOSE 80

CMD [ "uvicorn", "chatbot:chatbot", "--reload", "--port", "80", "--host=0.0.0.0" ]
@ -1,5 +0,0 @@
FROM docker.tavasi.ir/tavasi/qachat_base:1.0.0
RUN pip install uvicorn[standard]
RUN pip install FlagEmbedding
RUN pip install aiofiles
RUN pip install openai
@ -1,677 +0,0 @@
import zipfile
import sys
import os
import json
from time import sleep
from elasticsearch7 import Elasticsearch, helpers


class ElasticHelper():

    counter = 0
    total = 0
    id = ""
    path_mappings = os.getcwd() + '/repo/_other/'

    def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings=""):

        if path_mappings:
            self.path_mappings = path_mappings

        if es_pass == '':
            self.es = Elasticsearch(es_url)
        else:
            self.es = Elasticsearch(
                es_url,
                http_auth=(es_user, es_pass),
            )

        self.success_connect = False
        for a in range(0, 10):
            try:
                if not self.es.ping():
                    print('elastic not ping, sleep 5 s : ', a)
                    sleep(5)
                    continue
                else:
                    self.success_connect = True
                    break
            except Exception:
                break
        if not self.success_connect:
            print('******', 'no access to the elastic service')
            return

        self.counter = 0
        self.total = 0
        self.id = ""
    def get_doctument(self, index_name, id):
        res = self.es.get(index=index_name, id=id)
        return res

    def exist_doctument(self, index_name, id):
        res = self.es.exists(index=index_name, id=id)
        return res

    def update_index_doc(self, is_update_state, index_name_o, eid, data):
        if is_update_state:
            resp = self.es.update(index=index_name_o, id=eid, doc=data)
            # resp = self.es.update(index=index_name_o, id=eid, body={'doc': data})
        else:
            resp = self.es.index(index=index_name_o, id=eid, document=data)
        return resp

    def exportToJsonForAI(self, path_back, index_name, out_name='', body={}, fields=[]):
        print('*' * 50, ' start backup -->', index_name)
        self.counter = 0
        sid = None

        out = out_name
        if out_name == '':
            out = index_name

        fout = open(path_back + "/" + out + '.json', 'a+', encoding='utf-8')

        s_res = self.es.search(
            index=index_name,
            scroll='5m',
            size=1000,
            body=body
        )
        self.total = s_res["hits"]["total"]['value']

        print('start index = %s' % index_name)
        print('total = %d' % self.total)

        sid = s_res['_scroll_id']
        scroll_size = len(s_res['hits']['hits'])
        out_json = []
        while scroll_size > 0:
            # Scrolling...
            self.counter += scroll_size
            print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            #############################
            for item in s_res['hits']['hits']:

                if fields:
                    item2 = {}
                    item2['id'] = item['_id']
                    for kf in fields:
                        if kf in item['_source']:
                            item2[kf] = item['_source'][kf]
                else:
                    item2 = item

                out_json.append(item2)

            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
            sid = s_res['_scroll_id']
            scroll_size = len(s_res['hits']['hits'])

        sid = None
        text = json.dumps(out_json, ensure_ascii=False)
        fout.write(text)
        fout.close()

    def backupIndexToZipfile(self, path_back, index_name, out_name='', body={}, byzip=True, fields=[], noFields=[]):
        print('*' * 50, ' start backup -->', index_name)
        self.counter = 0
        sid = None

        out = out_name
        if out_name == '':
            out = index_name

        if body == {}:
            s_res = self.es.search(
                index=index_name,
                scroll='5m',
                size=1000
            )
        else:
            s_res = self.es.search(
                index=index_name,
                scroll='5m',
                size=1000,
                body=body
            )

        self.total = s_res["hits"]["total"]['value']
        if self.total == 0:
            print('total index_name by query = %d' % self.total)
            return False

        if byzip:
            fout = zipfile.ZipFile(path_back + "/" + out + '.zip', 'w')
        else:
            fout = open(path_back + "/" + out + '.json', 'a+', encoding='utf-8')

        print('start index = %s' % index_name)
        print('total = %d' % self.total)

        sid = s_res['_scroll_id']
        scroll_size = len(s_res['hits']['hits'])
        file_count = 1
        while scroll_size > 0:
            # Scrolling...
            self.counter += scroll_size
            print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            #############################
            out_json = []
            for item in s_res['hits']['hits']:
                if fields:
                    item2 = {}
                    item2['id'] = item['_id']
                    item2['_source'] = {}
                    for kf in fields:
                        if kf in item['_source']:
                            item2['_source'][kf] = item['_source'][kf]
                else:
                    item2 = item

                if noFields:
                    for kf in noFields:
                        if kf in item2['_source']:
                            del item2['_source'][kf]

                out_json.append(item2)

            text = json.dumps(out_json, ensure_ascii=False)
            out_json = []
            if byzip:
                filename = out + str(file_count) + '.json'
                file_count += 1
                fout.writestr(filename, text.encode('utf-8'), zipfile.ZIP_DEFLATED)
            else:
                fout.write(text)

            ##############################
            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
            sid = s_res['_scroll_id']
            scroll_size = len(s_res['hits']['hits'])
        sid = None
        fout.close()
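    # Usage sketch (not part of the original class): both export methods stream
    # an index with the scroll API; backupIndexToZipfile additionally chunks the
    # pages into numbered JSON members of one zip archive, e.g.:
    #
    #   eh = ElasticHelper(es_url="http://127.0.0.1:9200", es_user="elastic", es_pass="...")
    #   eh.backupIndexToZipfile('./backups', 'mj_qa_section', byzip=True)
    #
    # The URL and credentials above are illustrative placeholders.
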
    def restorFileToElastic(self, path_back, index_name, app_key='', queryDelete=True, map_name=''):
        if not os.path.exists(path_back):
            print(' **** error *** path not exist: ', path_back)
            return False

        file_path = path_back + '/' + index_name + '.zip'
        if not os.path.exists(file_path):
            return False

        if queryDelete:
            # If the index already exists, ask the user before deleting it
            if self.deleteIndex(index_name):
                self.createIndex(index_name, app_key, map_name)
                self.zipFileToElastic(file_path, index_name)
        else:  # If the index already exists, skip it and do nothing
            self.createIndex(index_name, app_key, map_name)
            self.zipFileToElastic(file_path, index_name)

    def restorFileToElastic2(self, path_file, index_name, app_key='', queryDelete=True, map_name=''):
        if not os.path.exists(path_file):
            print(' **** error *** path not exist: ', path_file)
            return False

        file_path = path_file
        if not os.path.exists(file_path):
            return False

        if queryDelete:
            # If the index already exists, ask the user before deleting it
            if self.deleteIndex(index_name):
                self.createIndex(index_name, app_key, map_name)
                self.zipFileToElastic(file_path, index_name)
        else:  # If the index already exists, skip it and do nothing
            self.createIndex(index_name, app_key, map_name)
            self.zipFileToElastic(file_path, index_name)

    def renameElasticIndex(self, index_name_i, index_name_o, app_key='', map_name=''):

        if self.createIndex(index_name_o, app_key, map_name):
            res = self.es.reindex(
                body={
                    "source": {"index": index_name_i},
                    "dest": {"index": index_name_o}
                },
                wait_for_completion=False)

            print(type(res))
            print(res)

            taskid = res["task"] if res["task"] else ""
            # tasks = client.TasksClient(self.es)
            tasks = self.es.tasks
            while True:
                res = tasks.get(task_id=taskid)
                if res["completed"]:
                    break

                print('----', index_name_o, ' imported : ', res["task"]["status"]["total"], ' / ', res["task"]["status"]["created"])
                sleep(1)
            print('----', index_name_o, ' completed')

    def deleteIndex(self, index_name):
        if not self.es.indices.exists(index=index_name):
            print(' ' * 10, " index to delete does NOT exist:", index_name)
            return True

        question = 'DELETE elastic index (' + index_name + ') ? '
        if self.query_yes_no(question):
            self.es.indices.delete(index=index_name)
            print('%' * 10, " Finished DELETE index:", index_name)
            return True
        else:
            return False

    def query_yes_no(self, question, default="no"):
        valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
        if default is None:
            prompt = " [y/n] "
        elif default == "yes":
            prompt = " [Y/n] "
        elif default == "no":
            prompt = " [y/N] "
        else:
            raise ValueError("invalid default answer: '%s'" % default)

        while True:
            print('%' * 10, ' question ', '%' * 10, '\n')
            sys.stdout.write(question + prompt)
            choice = input().lower()
            if default is not None and choice == "":
                return valid[default]
            elif choice in valid:
                return valid[choice]
            else:
                sys.stdout.write("Please answer with one of: 'yes' or 'no' (or 'y' or 'n').\n")

    def createIndexIfNotExist(self, index_name_o, mapping_o=""):
        try:
            if not self.es.indices.exists(index=index_name_o):
                response = self.es.indices.create(index=index_name_o, body=mapping_o)
                # print out the response:
                print("create index response:", response)
        except:
            print("....... index exists! ... not created")

    def createIndex(self, index_name, app_key='', map_name=''):

        path_base = self.path_mappings
        path_mapping1 = path_base + 'general/'
        if app_key == '':
            app_key = 'tavasi'
        path_mapping2 = path_base + app_key + '/'

        if map_name == '':
            map_name = index_name

        if self.es.indices.exists(index=index_name):
            print("============== index exists:", index_name)
            return True

        if map_name == 'mj_rg_section' or map_name == 'semantic_search':
            map_name = 'mj_qa_section'
        elif map_name[-3:] == '_ai':
            map_name = map_name[:-3]
        print(map_name)

        mapping_file_path = path_mapping1 + map_name + '.json'
        print("mapping_file_path : ", mapping_file_path)
        if not os.path.isfile(mapping_file_path):
            mapping_file_path = path_mapping2 + map_name + '.json'

        print("mapping_file_path : ", mapping_file_path)

        # Create Index With Mapping
        if os.path.isfile(mapping_file_path):
            mapping_file = open(mapping_file_path, 'r', encoding='utf-8')
            mapping_file_read = mapping_file.read()
            mapping_data = json.loads(mapping_file_read)
            mapping_file.close()
            if self.es.indices.exists(index=index_name):
                print("============== index exists:", index_name)
            else:
                self.es.indices.create(index=index_name, body=mapping_data)
            return True
        else:
            print('*** error: elastic mapping file not found: *******', mapping_file_path)
            return False

    def updateBulkList(self, listData, index_name):
        chunk_size = 1000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        actions = []
        for item in listData:
            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": item['_id'],
                "doc": item['_source']
            })
        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def importBulkList(self, listData, index_name):
        chunk_size = 100000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        for item in listData:
            actions = [{
                "_op_type": "index",
                "_index": index_name,
                "_id": item['_id'],
                "_source": item['_source']
            }]
            helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def importJsonDataToElastic(self, jsonData, index_name, fields=[]):
        chunk_size = 1000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        actions = []

        for item in jsonData:
            id = item['_id'] if item['_id'] else item['id']
            source = item['_source']
            if fields:
                source = {}
                for col in fields:
                    if col in item['_source']:
                        source[col] = item['_source'][col]

            actions.append({
                "_op_type": "index",
                "_index": index_name,
                "_id": id,
                "_source": source
            })
        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def fileToElastic(self, file_path, index_name, limit_pack=-1, fields=[]):
        if not os.path.exists(file_path):
            print("file:", file_path, " not exist")
            return
        print("index:", index_name, '=>', file_path)
        self.counter = 0
        with open(file_path) as file:
            data = json.loads(file.read())
            self.importJsonDataToElastic(data, index_name, fields)

        self.es.indices.refresh(index=index_name)
        print(self.es.cat.count(index=index_name, format="json"))

    def zipFileToElastic(self, file_path, index_name, limit_pack=-1, fields=[]):
        if not os.path.exists(file_path):
            print("zip file:", file_path, " not exist for import to elastic:", index_name)
            return

        fileNo = 0
        with zipfile.ZipFile(file_path, 'r') as zObject:
            fileNo += 1
            print("=" * 10, " zip fileNo: ", fileNo, " - ( ", index_name, " ) | File Numbers:", len(zObject.namelist()), "=" * 10)

            packNo = 0
            self.counter = 0
            for filename in zObject.namelist():
                packNo += 1
                if limit_pack != -1:
                    if packNo > limit_pack:
                        print('limit_data ', index_name, ' ', limit_pack)
                        break

                print("index:", index_name, '=>', filename)
                with zObject.open(filename) as file:
                    data = json.loads(file.read())
                    self.importJsonDataToElastic(data, index_name, fields)

        self.es.indices.refresh(index=index_name)
        print(self.es.cat.count(index=index_name, format="json"))
        print(" END Of Import to elastic ", index_name, "\n")

    def iterateJsonFile(self, file_path, isZip=True, limit_pack=-1):
        if not os.path.exists(file_path):
            print("file:", file_path, " not exist in iterateJsonFile")
            return

        if isZip:
            fileNo = 0
            with zipfile.ZipFile(file_path, 'r') as zObject:
                fileNo += 1
                print("=" * 10, " zip fileNo: ", fileNo, " iterateJsonFile - | File Numbers:", len(zObject.namelist()), "=" * 10)

                packNo = 0
                self.counter = 0
                for filename in zObject.namelist():
                    packNo += 1
                    if limit_pack != -1:
                        if packNo > limit_pack:
                            print('limit_data iterateJsonFile ', limit_pack)
                            break

                    print("index iterateJsonFile :", '=>', filename)
                    with zObject.open(filename) as file:
                        data = json.loads(file.read())
                        # Yield each entry
                        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
        else:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.loads(file.read())
                # Yield each entry
                yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)

    def es_iterate_all_documents(self, index, body="", pagesize=250, scroll_timeout="25m", **kwargs):
        """
        Helper to iterate over ALL documents of a single index.
        Yields each document.
        """
        is_first = True
        while True:
            # Scroll next
            if is_first:  # Initialize scroll
                if body:
                    result = self.es.search(
                        index=index,
                        scroll=scroll_timeout,
                        **kwargs,
                        size=pagesize,
                        body=body
                    )
                else:
                    result = self.es.search(
                        index=index,
                        scroll=scroll_timeout,
                        **kwargs,
                        size=pagesize
                    )

                self.total = result["hits"]["total"]["value"]
                if self.total > 0:
                    print("total = %d" % self.total)
                is_first = False
            else:
                result = self.es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)

            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            self.counter += len(hits)
            if self.total > 0:
                print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
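    # Iteration sketch (illustrative, not part of the original file):
    # es_iterate_all_documents hides the scroll bookkeeping, so callers can
    # treat an index as a generator of {"source": ..., "id": ...} dicts:
    #
    #   for doc in eh.es_iterate_all_documents('mj_qa_section', pagesize=500):
    #       print(doc['id'], len(doc['source'].get('content', '')))
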
    def moveCustomFileds(self, index_name_i, index_name_o, fields=[], renameFileds={}):
        try:
            docs = []
            try:
                docs = self.es_iterate_all_documents(index_name_i)
            except Exception as e:
                print(e)

            count = 0
            for mentry in docs:
                count += 1

                entry = mentry["source"]
                id = mentry["id"]
                eid = id

                if (count % 100) == 0:
                    print("%s -> %.2f " % (id, (count / self.total) if self.total > 0 else 0))

                data_filled = False
                data = {}
                for col in fields:

                    if '.' in col:
                        cols = col.split('.')
                        subsource = entry
                        for sub in cols:
                            dCol = subsource.get(sub, None)
                            if dCol:
                                subsource = dCol
                            else:
                                break
                    else:
                        dCol = entry.get(col, None)

                    if dCol is None:
                        continue

                    if col in renameFileds:
                        data[renameFileds[col]] = dCol
                    else:
                        data[col] = dCol

                    data_filled = True

                if not data_filled:
                    continue

                try:
                    resp = self.update_index_doc(True, index_name_o, eid, data)
                except Exception as e:
                    print(e)
                    # save_error(id, e)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def mappingIndex(self, index_name_i):
        # A mapping can only be changed through Kibana, not from Python.
        # To change a mapping, create a new index with the desired mapping
        # and reindex into it.
        pass

    def updateByQueryIndex(self, index_name_i, body):
        ## sample
        # body = {
        #     "script": {
        #         "inline": "ctx._source.Device='Test'",
        #         "lang": "painless"
        #     },
        #     "query": {
        #         "match": {
        #             "Device": "Boiler"
        #         }
        #     }
        # }
        try:
            self.es.update_by_query(body=body, index=index_name_i)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def deleteByQueryIndex(self, index_name_i, body):
        ## sample
        # body = {
        #     "query": {
        #         "match": {
        #             "Device": "Boiler"
        #         }
        #     }
        # }
        try:
            self.es.delete_by_query(index=index_name_i, body=body)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def delete_by_ids(self, index_name_i, ids):
        try:
            # ids = ['test1', 'test2', 'test3']

            query = {"query": {"terms": {"_id": ids}}}
            res = self.es.delete_by_query(index=index_name_i, body=query)
            print(res)

        except Exception as e:
            print(e)
            # save_error(id, e)
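A minimal end-to-end sketch of ElasticHelper as the rest of this branch uses it: connect, back an index up to a zip, then stream the backup back as documents without touching Elasticsearch again. The URL, credentials, and paths are placeholders, not values from the repo.

from elastic_helper import ElasticHelper

eh = ElasticHelper(es_url="http://127.0.0.1:9200", es_user="elastic", es_pass="changeme")
if eh.success_connect:
    # Writes one zip with numbered JSON chunks: mj_qa_section1.json, mj_qa_section2.json, ...
    eh.backupIndexToZipfile('./backups', 'mj_qa_section')

# Later (or on another machine): iterate the backup offline.
for doc in eh.iterateJsonFile('./backups/mj_qa_section.zip', isZip=True, limit_pack=1):
    print(doc['id'])
    break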
@ -1,681 +0,0 @@
# !pip install hazm
# !pip install transformers==4.26.0
# !pip install --upgrade numpy
# !pip install --upgrade sentence-transformers
"""
Persian Sentence Processing and Vector Analysis
===============================================

This script processes Persian sentences from a JSON file and performs:
1. Word extraction and preprocessing
2. Vector representation using a multilingual transformer
3. Similarity analysis for key words
4. Dimensionality reduction to 3D
5. 3D visualization with Persian labels

Author: NLP Expert Assistant
"""
import json
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Set
from collections import Counter
import logging
from pathlib import Path

# NLP and ML libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from normalizer import cleaning  # cleaning() is used in the __main__ block below
try:
    from elastic_helper import ElasticHelper
except Exception as error:
    eee = error
    pass
# Visualization libraries
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots

# Persian text processing
# import hazm
# from hazm import Normalizer, word_tokenize, POSTagger

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class PersianVectorAnalyzer:
    """
    A comprehensive class for Persian text processing and vector analysis.
    """

    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initialize the analyzer with the specified model.

        Args:
            model_name: The sentence transformer model to use
        """
        self.model_name = model_name
        self.model = None
        # self.normalizer = Normalizer()
        self.stop_words = self._load_persian_stop_words()
        self.key_words = [
            "خدا", "بنده", "جهاد", "ولی", "زکات",
            "نماز", "صبر", "عبادت", "ولایت", "خلافت", "پیامبر"
        ]

        logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")

    def _load_persian_stop_words(self) -> Set[str]:
        """
        Load Persian stop words.

        Returns:
            Set of Persian stop words
        """
        # Common Persian stop words
        stop_words = {
            'و', 'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'برای', 'تا',
            'را', 'هم', 'یا', 'اما', 'اگر', 'چون', 'چرا', 'چگونه', 'کجا',
            'چه', 'کی', 'چند', 'چقدر', 'همه', 'هیچ', 'بعضی', 'هر', 'همه',
            'خود', 'خویش', 'ما', 'شما', 'آنها', 'ایشان', 'اینها', 'آنها',
            'من', 'تو', 'او', 'ما', 'شما', 'آنها', 'ایشان', 'اینها',
            'است', 'هست', 'بود', 'شد', 'می', 'باید', 'خواهد', 'دارد',
            'کرد', 'شد', 'بود', 'هست', 'است', 'میشود', 'میکند',
            'یک', 'دو', 'سه', 'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده',
            'اول', 'دوم', 'سوم', 'چهارم', 'پنجم', 'ششم', 'هفتم', 'هشتم', 'نهم', 'دهم',
            'سال', 'ماه', 'روز', 'هفته', 'ساعت', 'دقیقه', 'ثانیه', 'پس',
            'بله', 'نه', 'آری', 'خیر', 'بلی', 'نخیر',
            'حالا', 'الان', 'امروز', 'دیروز', 'فردا', 'هفته', 'ماه', 'سال',
            'بالا', 'پایین', 'چپ', 'راست', 'جلو', 'عقب', 'داخل', 'خارج',
            'بزرگ', 'کوچک', 'بلند', 'کوتاه', 'پهن', 'باریک', 'ضخیم', 'نازک',
        }
        return stop_words

    def load_model(self):
        """
        Load the sentence transformer model.
        """
        try:
            logger.info("Loading sentence transformer model...")
            self.model = SentenceTransformer(self.model_name)
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def split_sentence(self, sentence: str):
        sentences = []
        sentence_len = len(self.tokenize_sentence(sentence))
        if sentence_len < 512:
            sentences.append(sentence)
        else:
            temp_sentences = str(sentence).split('.')
            for sent in temp_sentences:
                sent_len = len(self.tokenize_sentence(sent))
                if sent_len > 512:
                    temp_sentences_2 = str(sent).split('،')
                    for snt in temp_sentences_2:
                        sentences.append(snt)
                else:
                    sentences.append(sent)

        return sentences

    def load_json_data(self, file_path: str) -> List[str]:
        """
        Load Persian sentences from JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            List of Persian sentences
        """
        try:
            logger.info(f"Loading data from {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # convert dict{dict} to list[dict]
            if type(data) == dict:
                temp_data = []
                for item in data.items():
                    temp_data.append(item[1])
                data = temp_data

            sentences = []
            if isinstance(data, list):
                for index, item in enumerate(data):
                    print(f'split sentence {index}')
                    if isinstance(item, dict):
                        if item['content'] == '':
                            continue
                        sentences.append([item['id'], item['content'].strip()])
                    elif isinstance(item, str):
                        # splited_sentences = self.split_sentence(item)
                        sentences.append(item)
            elif isinstance(data, dict):
                # If it's a single object, extract all string values
                for value in data.values():
                    if isinstance(value, str):
                        sentences.append(value)

            sentences = [senten for senten in sentences if senten]
            logger.info(f"Loaded {len(sentences)} sentences")
            return sentences

        except Exception as e:
            logger.error(f"Error loading JSON data: {e}")
            raise

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess Persian text.

        Args:
            text: Raw Persian text

        Returns:
            Preprocessed text
        """

        # Normalize text
        # text = self.normalizer.normalize(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep Persian characters
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)

        return text.strip()

    def tokenize_sentence(self, sentence: str):

        try:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            tokens = tokenizer.tokenize(sentence)
            return tokens
        except:
            error = "An exception occurred in tokenizer : " + self.model_name
            # file.write(error + '\n')
            return []

    def extract_words(self, sentences: List[str]) -> List[str]:
        """
        Extract all words from sentences.

        Args:
            sentences: List of Persian sentences

        Returns:
            List of all words
        """
        all_words = []

        for sentence in sentences:
            # Preprocess sentence
            processed_sentence = self.preprocess_text(sentence)

            # Tokenize (requires the commented hazm import above:
            # from hazm import word_tokenize)
            words = word_tokenize(processed_sentence)
            # words = processed_sentence.split()
            # Filter out empty strings and very short words
            words = [word for word in words if len(word) > 1]

            all_words.extend(words)

        logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
        return all_words

    def remove_stop_words(self, words: List[str]) -> List[str]:
        """
        Remove stop words from the word list.

        Args:
            words: List of words

        Returns:
            List of words without stop words
        """
        filtered_words = [word for word in words if word not in self.stop_words]
        logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
        return filtered_words

    def get_unique_words(self, words: List[str]) -> List[str]:
        """
        Get unique words from the list.

        Args:
            words: List of words

        Returns:
            List of unique words
        """
        unique_words = list(set(words))
        logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
        return unique_words

    def compute_word_vectors(self, sentences: List[str]) -> Dict[str, List[float]]:
        """
        Compute vector representations for sections.

        Args:
            sentences: List of section dicts (id, fullpath, qanon-title,
                section-prefix, content)

        Returns:
            Dictionary mapping sections to their vector representations
        """
        if self.model is None:
            self.load_model()

        logger.info(f"Computing vectors for {len(sentences)} sections ...")
        # create a list of just the section texts
        just_sentences = [sent['content'] for sent in sentences]
        # Compute embeddings
        embeddings = self.model.encode(just_sentences, show_progress_bar=True)

        # Create dictionary
        sentences_vectors = {}
        for i, sent in enumerate(sentences):
            sentences_vectors[f'sentence-{sentences[i]["id"]}'] = {
                'id': sentences[i]['id'],
                'fullpath': sentences[i]['fullpath'],
                'qanon-title': sentences[i]['qanon-title'],
                'section-prefix': sentences[i]['section-prefix'],
                'content': sentences[i]['content'],
                'embeddings': embeddings[i].tolist()
            }
            print(f'section {i} embedded!')

        logger.info("section vectors computed successfully!")
        return sentences_vectors

    def find_closest_words(self, word_vectors: Dict[str, List[float]],
                           key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
        """
        Find the closest words to each key word.

        Args:
            word_vectors: Dictionary of word vectors
            key_words: List of key words to find neighbors for
            top_k: Number of closest words to find

        Returns:
            Dictionary mapping key words to their closest neighbors
        """
        logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")

        # Convert to numpy arrays for faster computation
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        closest_words = {}

        for key_word in key_words:
            if key_word in word_vectors:
                # Get the key word vector
                key_vector = np.array(word_vectors[key_word]).reshape(1, -1)

                # Compute cosine similarities
                similarities = cosine_similarity(key_vector, vectors)[0]

                # Get indices of top k similar words (excluding the key word itself)
                word_indices = np.argsort(similarities)[::-1]

                # Filter out the key word itself and get top k
                closest_indices = []
                for idx in word_indices:
                    if words[idx] != key_word and len(closest_indices) < top_k:
                        closest_indices.append(idx)

                # Get the closest words
                closest_words[key_word] = [words[idx] for idx in closest_indices]
                logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")
            else:
                logger.warning(f"Key word '{key_word}' not found in word vectors")
                closest_words[key_word] = []

        return closest_words

    def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
                     method: str = 'tsne') -> Dict[str, List[float]]:
        """
        Reduce word vectors to 3D coordinates.

        Args:
            word_vectors: Dictionary of word vectors
            method: Dimensionality reduction method ('pca' or 'tsne')

        Returns:
            Dictionary mapping words to their 3D coordinates
        """
        logger.info(f"Reducing dimensions to 3D using {method.upper()}...")

        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        if method.lower() == 'pca':
            reducer = PCA(n_components=3, random_state=42)
        elif method.lower() == 'tsne':
            reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors) - 1))
        else:
            raise ValueError("Method must be 'pca' or 'tsne'")

        # Reduce dimensions
        reduced_vectors = reducer.fit_transform(vectors)

        # Create dictionary
        word_vectors_3d = {}
        for i, word in enumerate(words):
            word_vectors_3d[word] = reduced_vectors[i].tolist()

        logger.info("Dimensionality reduction completed!")
        return word_vectors_3d

    def save_json(self, data: dict, file_path: str):
        """
        Save data to JSON file.

        Args:
            data: Data to save
            file_path: Output file path
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Data saved to {file_path}")
        except Exception as e:
            logger.error(f"Error saving to {file_path}: {e}")
            raise

    # def create_3d_visualization(self, word_vectors_3d: Dict[str, List[float]],
    #                             selected_words: Dict[str, List[str]],
    #                             output_path: str = "persian_words_3d.html"):
    #     """
    #     Create 3D visualization of words.
    #
    #     Args:
    #         word_vectors_3d: Dictionary of 3D word coordinates
    #         selected_words: Dictionary of selected words for each key word
    #         output_path: Output file path for the visualization
    #     """
    #     logger.info("Creating 3D visualization...")
    #
    #     # Prepare data for plotting
    #     words = list(word_vectors_3d.keys())
    #     coords = np.array(list(word_vectors_3d.values()))
    #
    #     # Create color mapping for key words and their neighbors
    #     colors = []
    #     sizes = []
    #     hover_texts = []
    #
    #     for word in words:
    #         # Check if word is a key word
    #         is_key_word = word in self.key_words
    #
    #         # Check if word is in selected words
    #         in_selected = False
    #         key_word_group = None
    #         for key_word, selected_list in selected_words.items():
    #             if word in selected_list:
    #                 in_selected = True
    #                 key_word_group = key_word
    #                 break
    #
    #         if is_key_word:
    #             colors.append('red')
    #             sizes.append(15)
    #             hover_texts.append(f"کلیدواژه: {word}")
    #         elif in_selected:
    #             colors.append('blue')
    #             sizes.append(10)
    #             hover_texts.append(f"کلمه مرتبط با '{key_word_group}': {word}")
    #         else:
    #             colors.append('lightgray')
    #             sizes.append(5)
    #             hover_texts.append(f"کلمه: {word}")
    #
    #     # Create 3D scatter plot
    #     fig = go.Figure()
    #
    #     # Add scatter plot
    #     fig.add_trace(go.Scatter3d(
    #         x=coords[:, 0],
    #         y=coords[:, 1],
    #         z=coords[:, 2],
    #         mode='markers+text',
    #         marker=dict(size=sizes, color=colors, opacity=0.8),
    #         text=words,
    #         textposition="middle center",
    #         hovertext=hover_texts,
    #         hoverinfo='text'
    #     ))
    #
    #     # Update layout
    #     fig.update_layout(
    #         title={'text': 'نمایش سه‌بعدی کلمات فارسی', 'x': 0.5, 'xanchor': 'center', 'font': {'size': 20}},
    #         scene=dict(
    #             xaxis_title='محور X',
    #             yaxis_title='محور Y',
    #             zaxis_title='محور Z',
    #             camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))
    #         ),
    #         width=1000,
    #         height=800,
    #         showlegend=False
    #     )
    #
    #     # Save the plot
    #     fig.write_html(output_path)
    #     logger.info(f"3D visualization saved to {output_path}")
    #
    #     return fig

    def process_pipeline(self, input_file: str, output_dir: str = "output"):
        """
        Run the complete processing pipeline.

        Args:
            input_file(str): Path to input JSON file
            output_dir(str): Output directory for results
        """
        # Create output directory
        Path(output_dir).mkdir(exist_ok=True)

        logger.info("Starting Persian Vector Analysis Pipeline...")

        # Step 1: Load data
        # sentences = self.load_json_data(input_file)
        sentences = ALL_SECTIONS  # set in the __main__ block below

        # Step 2: Extract words
        # all_words = self.extract_words(sentences)

        # Step 3: Remove stop words
        # filtered_words = self.remove_stop_words(all_words)

        # Step 4: Get unique words
        # unique_words = self.get_unique_words(filtered_words)

        # Step 5: Compute section vectors
        sentences_vectors = self.compute_word_vectors(sentences)

        # Step 6: Save section vectors
        self.save_json(sentences_vectors, f"{output_dir}/sections-vec-285k.json")

        # Step 7: Find closest words to key words
        # selected_words = self.find_closest_words(word_vectors, self.key_words)

        # Step 8: Save selected words
        # self.save_json(selected_words, f"{output_dir}/selected_words.json")

        # Step 9: Reduce to 3D
        # word_vectors_3d = self.reduce_to_3d(word_vectors, method='tsne')

        # Step 10: Save 3D vectors
        # self.save_json(word_vectors_3d, f"{output_dir}/words_vector_3d.json")

        # Step 11: Create visualization
        # self.create_3d_visualization(word_vectors_3d, selected_words,
        #                              f"{output_dir}/persian_words_3d.html")

        logger.info("Pipeline completed successfully!")

        # Print summary
        print("\n" + "=" * 50)
        print("PIPELINE SUMMARY")
        print("=" * 50)
        print(f"Input sentences: {len(sentences)}")
        print(f"Output files saved to: {output_dir}/")
        print("=" * 50)


def full_path_text_maker(full_path):
    """
    Takes the tree path of a section and rebuilds a text from its parts,
    ordered from the most specific element to the most general, then returns it.

    Args:
        full_path(list): list of the elements describing this section's tree path
    Returns:
        full_path_text(str): the text rebuilt from the section path
    """
    full_path_text = ""
    for i, path_item in enumerate(reversed(full_path)):
        if i == len(full_path) - 1:
            full_path_text += ''.join(f'{path_item}')
            break
        full_path_text += ''.join(f'{path_item} از ')
    full_path_text = full_path_text.strip()
    return full_path_text
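# A worked example of full_path_text_maker (hypothetical input, not from the repo):
# full_path_text_maker(['قانون مدنی', 'کتاب اول', 'ماده ۱۰'])
# -> 'ماده ۱۰ از کتاب اول از قانون مدنی'
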
def main():
    """
    Main function to run the Persian Vector Analysis.
    """
    # Initialize analyzer
    analyzer = PersianVectorAnalyzer()

    # Define input and output paths
    # input_file = "./output-speechs/nahj_speechs_sentences.json"
    # output_dir = "output-speechs"
    # input_file = "./majles/data/sections.json"
    input_file = ""
    output_dir = "majles-output"

    # Run the complete pipeline
    analyzer.process_pipeline(input_file, output_dir)


if __name__ == "__main__":
    eh_obj = ElasticHelper()
    path = "/home/gpu/data_11/14040611/mj_qa_section.zip"
    sections_elastic = eh_obj.iterateJsonFile(path, True)
    all_count = 0
    dont_cares = []
    ALL_SECTIONS = []
    for index, item in enumerate(sections_elastic):
        all_count += 1
        source = item['source']
        section_path = source['other_info']['full_path']
        id = item['id']

        filtered_keys = ['فصل', 'موخره', 'امضاء', 'عنوان']
        flag = False
        if '>' in section_path:
            path_parts = section_path.split('>')
            for key in filtered_keys:
                if key in path_parts[-1]:
                    dont_cares.append(id)
                    flag = True
                    break
            if flag:
                continue
        else:
            for key in filtered_keys:
                if key in section_path:
                    dont_cares.append(id)
                    flag = True
                    break
            if flag:
                continue

        qanon_title = source['qanon_title']
        full_path_text = full_path_text_maker(section_path.split('>'))
        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "

        try:
            content = cleaning(item['source']['content'])
            # Skip very short sections that carry effectively no content
            if len(content.split()) <= 10:
                continue
        except Exception as error:
            print(error)
            continue
        data = {
            'id': id,
            'fullpath': section_path,
            'qanon-title': qanon_title,
            'section-prefix': section_prefix,
            'content': content
        }
        ALL_SECTIONS.append(data)
    print(f'all_count: {all_count}')
    print(f'dont_cares: {len(dont_cares)}')
    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')

    main()

"""
:: *** Important note *** ::
NOTE: after this process, run convert_qavanin_json_to_faiss.py to create the
FAISS index that is used in the RAG process.
"""
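A minimal sketch of the hand-off implied by the note above: embed a query with the same multilingual model used for the sections, then search the index produced by convert_qavanin_json_to_faiss.py. The query text and k are illustrative; the paths match the other scripts in this branch.

import json
import faiss
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
index = faiss.read_index('./qavanin-faiss/faiss_index_qavanin_285k.index')
with open('./qavanin-faiss/faiss_index_qavanin_285k_metadata.json', encoding='utf-8') as f:
    metadata = json.load(f)

# The query must be encoded by the same model that produced the stored vectors.
query_vec = model.encode(['شرایط فسخ قرارداد اجاره چیست؟']).astype('float32')
distances, idxs = index.search(query_vec, 5)
for d, i in zip(distances[0], idxs[0]):
    print(round(float(d), 3), metadata[i]['prefix'])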
@ -1,76 +0,0 @@
# import hazm
from cleantext import clean
import re


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


# normalizer = hazm.Normalizer()
wierd_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)


def cleaning(text):
    text = text.strip()

    # regular cleaning (full clean-text configuration kept for reference)
    # text = clean(text,
    #     fix_unicode=True,
    #     to_ascii=False,
    #     lower=True,
    #     no_line_breaks=True,
    #     no_urls=True,
    #     no_emails=True,
    #     no_phone_numbers=True,
    #     no_numbers=False,
    #     no_digits=False,
    #     no_currency_symbols=True,
    #     no_punct=False,
    #     replace_with_url="",
    #     replace_with_email="",
    #     replace_with_phone_number="",
    #     replace_with_number="",
    #     replace_with_digit="0",
    #     replace_with_currency_symbol="",
    # )
    text = clean(text,
                 extra_spaces=True,
                 lowercase=True
                 )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    # text = normalizer.normalize(text)

    # removing weird patterns
    text = wierd_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)

    return text
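A quick check of cleaning() on a small HTML-laden snippet. The module name in the import is an assumption (the diff does not show this file's name), and the output comment is what the pinned cleantext==1.1.4 API is expected to produce, roughly.

from normalizer import cleaning  # assumed module name

sample = "<p>  متن   نمونه  با ایموجی 😀 و #هشتگ </p>"
print(cleaning(sample))  # expected, roughly: "متن نمونه با ایموجی و هشتگ"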
64 _old/oss.py
@ -1,64 +0,0 @@
from openai import AsyncOpenAI

LLM_URL = "http://172.16.29.102:8001/v1/"

# item structure:
# item = {
#     'id': '',
#     'system_prompt': '',
#     'user_prompt': '',
#     'assistant_prompt': '',
# }


async def process_item(messages, reasoning_effort='medium', temperature=0.4, top_p=0.9, max_tokens=2048):
    """
    Generates an answer with the gpt-oss-120b model.

    **Args:
        reasoning_effort = 'medium'  # -> low / high / medium
        temperature = 0.4            # 0-1 -> creativity
        top_p = 0.9                  # 0-1 -> logic
        max_tokens = 2048            # -> ... 128K
    **Returns(tuple):
        returns True, generated answer / False, failure message
    """
    try:
        async with AsyncOpenAI(base_url=LLM_URL, api_key="EMPTY") as client:

            model_name = 'gpt-oss-120b'

            # messages = [
            #     {"role": "system", "content": prompt_params.get("system_prompt", "")},
            #     {"role": "user", "content": prompt_params.get("user_prompt", "")},
            # ]
            # if prompt_params.get("assistant_prompt"):
            #     messages.append(
            #         {"role": "assistant", "content": prompt_params["assistant_prompt"]}
            #     )

            response = await client.chat.completions.parse(
                model=model_name,
                messages=messages,
                temperature=temperature,            # 0-1
                top_p=top_p,                        # 0-1
                reasoning_effort=reasoning_effort,  # low , high , medium
                # max_tokens=max_tokens,            # ... 128K
                stop=None,
            )

            if response and response.choices:  # and response.choices[0].message.parsed:
                response_message = response.choices[0].message.content
                return True, response_message

            return False, 'empty llm response!'

    except Exception as e:
        response_message = 'error in llm response generation!'
        print('!!!!!!!!!!!!!!!!!!!!!!!!!')
        print(e)
        print('!!!!!!!!!!!!!!!!!!!!!!!!!')
        return False, response_message
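A minimal driver sketch for process_item, assuming an OpenAI-compatible server is reachable at LLM_URL and that the module is importable as oss (the diff stores it under _old/); the message content here is illustrative.

import asyncio
from oss import process_item  # assumed import path

messages = [
    {"role": "system", "content": "You are a legal QA assistant."},
    {"role": "user", "content": "سلام"},
]

# process_item is async, so drive it with asyncio.run.
ok, answer = asyncio.run(process_item(messages, reasoning_effort='low'))
print(ok, answer)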
@ -1,15 +0,0 @@
cleantext==1.1.4
elasticsearch7==7.17.12
faiss_cpu==1.9.0
fastapi==0.117.1
hazm==0.10.0
langchain_openai==0.3.33
numpy==1.21.5
openai==1.108.1
pandas==2.3.2
pydantic==2.11.9
scikit_learn==1.7.2
sentence_transformers==2.5.1
torch==2.4.0
transformers==4.55.1
@ -1,3 +0,0 @@
docker stop qachat
docker rm qachat
docker run --name qachat -p 2425:80 --net qachat_net --gpus=all -v ./:/src/app/ -v ./qavanin-faiss/:/src/app/qavanin-faiss/ -v ./llm-answer/:/src/app/llm-answer/ -v ./../MODELS:/src/MODELS -v ./../cache:/root/.cache/huggingface/hub -it --restart unless-stopped docker.tavasi.ir/tavasi/qachat:1.0.0
@ -1,443 +0,0 @@
[
  {"update_id": 1, "message": {"message_id": 1, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761831331, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 2, "message": {"message_id": 2, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761833932, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 3, "message": {"message_id": 3, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761836593, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 4, "message": {"message_id": 4, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761836694, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 5, "message": {"message_id": 5, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761836899, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 6, "message": {"message_id": 6, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761836915, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 7, "message": {"message_id": 7, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1761837001, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 8, "message": {"message_id": 8, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762099430, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 9, "message": {"message_id": 9, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762099450, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 10, "message": {"message_id": 10, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762100301, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "سلام"}},
  {"update_id": 11, "message": {"message_id": 11, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762100357, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 12, "message": {"message_id": 12, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762100360, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "سلام"}},
  {"update_id": 13, "message": {"message_id": 13, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762100364, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "رلیردت"}},
  {"update_id": 14, "message": {"message_id": 14, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762179038, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 26, "message": {"message_id": 95, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762181681, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "تست"}},
  {"update_id": 27, "message": {"message_id": 97, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762182073, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "/start", "entities": [{"type": "bot_command", "offset": 0, "length": 6}]}},
  {"update_id": 28, "message": {"message_id": 99, "from": {"id": 899452608, "is_bot": false, "first_name": "جوکار", "last_name": "", "username": "nasle_sevvom"}, "date": 1762182086, "chat": {"id": 899452608, "type": "private", "username": "nasle_sevvom", "first_name": "جوکار"}, "text": "سلام"}}
]
0  config.env  Normal file → Executable file
67  main.py  Normal file → Executable file
@ -1,20 +1,38 @@
-import datetime
-from fastapi import FastAPI ,Header
-from openai import AsyncOpenAI
-from routes.rag_base import router as rag_base
+from fastapi import FastAPI
+from routers.rag_base import router as rag_base
+from contextlib import asynccontextmanager
 from fastapi.middleware.cors import CORSMiddleware
 
-async def get_oss_client():
-    LLM_URL = "http://172.16.29.102:8001/v1/"
-    client = await AsyncOpenAI(base_url=LLM_URL, api_key="EMPTY")
-    return client
+# --- Lifespan manager ---
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # 🚀 Startup
+    print("🚀 Starting up RAG system...")
+    # Create the OSS client and store it in app.state
+
+    # --- Important note: if you also want to build elastic_client on startup, add it here ---
+    # elastic_client = get_elastic_client()
+    # app.state.elastic_client = elastic_client
+
+    yield  # the application runs while suspended here
+
+    # 🛑 Shutdown
+    print("🛑 Shutting down RAG system...")
+    # Close open connections
+    client = getattr(app.state, "elastic_client", None)
+    if client is not None:
+        await client.close()
+
+
+# --- Build the application ---
 def create_app() -> FastAPI:
-    app = FastAPI(title="qachat2 Backend", version="0.1.0")
+    app = FastAPI(
+        title="qachat2 Backend",
+        version="0.1.0",
+        lifespan=lifespan,  # ✅ attach the lifespan here
+    )
 
     origins = ["*"]
     app.add_middleware(
         CORSMiddleware,
@ -24,36 +42,17 @@ def create_app() -> FastAPI:
         allow_headers=["*"],
     )
 
-    # app.state.settings = get_settings()
-
-    @app.on_event("startup")
-    async def on_startup() -> None:
-        print("startup app")
-        client = getattr(app.state, "oss_client", None)
-        if not client:
-            client = get_oss_client()
-        app.state.oss_client = client
-
-    @app.on_event("shutdown")
-    async def on_shutdown() -> None:
-        client = getattr(app.state, "elastic_client", None)
-        if client is not None:
-            await client.close()
-
     @app.get("/")
     async def simple():
-        return "ai rag caht qanon OK"
+        return "ai rag chat qanon OK"
 
     @app.get("/ping")
     async def ping():
-        return "ai rag caht qanon OK"
+        return "ai rag chat qanon OK"
 
     app.include_router(rag_base, prefix="")
     return app
 
+# ✅ Final instantiation
 app = create_app()
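The startup comment in the new lifespan leaves the OSS-client creation unimplemented. A minimal sketch of what that step could look like, assuming the same AsyncOpenAI client and LLM_URL that the deleted get_oss_client() used; this is not part of the commit, and note that AsyncOpenAI is a plain constructor and must not be awaited (the removed code awaited it):

from contextlib import asynccontextmanager
from fastapi import FastAPI
from openai import AsyncOpenAI

LLM_URL = "http://172.16.29.102:8001/v1/"  # value taken from the deleted get_oss_client()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Hypothetical sketch: build the client once and share it via app.state
    app.state.oss_client = AsyncOpenAI(base_url=LLM_URL, api_key="EMPTY")
    yield
    await app.state.oss_client.close()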
16  new_requirements.txt  Executable file

@ -0,0 +1,16 @@
cleantext
elasticsearch7
faiss_cpu
fastapi
hazm
langchain_openai
numpy
openai
pandas
pydantic
scikit_learn
sentence_transformers
torch
transformers
orjson
FlagEmbedding==1.3.5
0  routers/__init__.py  Executable file
425  routers/ai_data_parser.py  Executable file

@ -0,0 +1,425 @@
from typing import List
from pathlib import Path
import os, orjson, time, json, re, asyncio, traceback
from openai import AsyncOpenAI


# ------------------------------ API processing ------------------------------
class AsyncCore:
    def __init__(
        self,
        model_name,
        task_name,
        output_schema,
        api_url,
        data_path=None,
        reasoning_effort="low",
        top_p=1,
        temperature=0.0,
        max_token=128000,
        output_path=None,
        ai_code_version=None,
        request_timeout=30,  # seconds
        api_key="EMPTY",
        save_number=2,
        semaphore_number=5,
    ):

        self.save_number = save_number
        # JSON file of data
        self.data_path = data_path
        self.semaphore_number = semaphore_number

        self.task_name = task_name
        if output_path is None:
            output_path = f"./{task_name}"

        self.output_path = Path(output_path)
        self._temp_path = self.output_path / "batch_data"
        self._temp_processed_id_path = self._temp_path / "processed_id.json"

        # Create output directory and subdirectories if they don't exist
        self.output_path.mkdir(parents=True, exist_ok=True)
        self._temp_path.mkdir(parents=True, exist_ok=True)
        # self._temp_processed_id_path.mkdir(parents=True, exist_ok=True)

        self.request_timeout = request_timeout
        self.model_name = model_name
        self.api_key = api_key
        self.output_schema = output_schema
        self.api_url = api_url
        self.reasoning_effort = reasoning_effort
        self.top_p = top_p
        self.temperature = temperature
        self.max_token = max_token

        if ai_code_version is None:
            ai_code_version = f"{model_name}_{reasoning_effort}"
        self.ai_code_version = ai_code_version

        self.PRIMARY_KEY = {"system_prompt", "user_prompt", "id"}
        if data_path is not None:
            try:
                self.data = self.__data_process()
                print(f"📦 Loaded {len(self.data)} items")
            except Exception as e:
                raise ValueError(
                    f"Data loading/validation failed: {e}\n{traceback.format_exc()}"
                )

    def __validate_item(self, item, idx):
        # Mandatory fields
        for key in self.PRIMARY_KEY:
            if key not in item:
                raise ValueError(f"Missing mandatory key '{key}' in item #{idx}")
            if not isinstance(item[key], str):
                raise TypeError(
                    f"Item #{idx}: '{key}' must be a string, got {type(item[key]).__name__}"
                )

        # Optional field: assistant_prompt
        if "assistant_prompt" not in item or item["assistant_prompt"] is None:
            item["assistant_prompt"] = None
        else:
            if not isinstance(item["assistant_prompt"], str):
                raise TypeError(
                    f"Item #{idx}: 'assistant_prompt' must be a string or absent, got {type(item['assistant_prompt']).__name__}"
                )

        return item  # now normalized

    def __data_process(self):
        raw_data = self.__load_orjson(self.data_path)
        if not isinstance(raw_data, list):
            raise ValueError("Data must be a list of dictionaries.")

        processed_data = []
        for idx, item in enumerate(raw_data):
            if not isinstance(item, dict):
                raise ValueError(f"Item #{idx} is not a dictionary.")
            validated_item = self.__validate_item(item, idx)
            processed_data.append(validated_item)

        return processed_data

    def __get_max_number_file(self, directory):
        # Pattern to match filenames like output_1.json, output_25.json, etc.
        pattern = re.compile(r"output_(\d+)\.json$")
        max_num = 0

        for filename in os.listdir(directory):
            match = pattern.match(filename)
            if match:
                num = int(match.group(1))
                if num > max_num:
                    max_num = num
        return max_num + 1

    def __load_orjson(self, path: str | Path):
        path = Path(path)
        with path.open("rb") as f:  # must be opened in binary mode for orjson
            return orjson.loads(f.read())

    def __save_orjson(self, path, data):
        with open(path, "wb") as f:
            f.write(
                orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
            )

    def merge_json_dir(self, input_path, output_path):
        directory = Path(input_path)
        if not directory.is_dir():
            raise ValueError(f"Not a valid PATH: {input_path}")

        seen_ids = set()  # tracks the ids seen so far (fast!)
        unique_data = []  # only the unique records
        failed_files = []

        json_files = list(directory.glob("*.json"))
        if not json_files:
            print("⚠️ No JSON file found in this PATH")
            return

        for json_file in json_files:
            try:
                data = self.__load_orjson(json_file)
                if not data:  # empty or None
                    failed_files.append(json_file.name)
                    continue

                if isinstance(data, list) and isinstance(data[0], dict):
                    for item in data:
                        item_id = item.get("id")
                        if item_id is None:
                            # If there is no id you could keep or drop the item;
                            # here we assume only items with a valid id matter.
                            continue
                        if item_id not in seen_ids:
                            seen_ids.add(item_id)
                            unique_data.append(item)
                else:
                    raise ValueError(f"no list available in this json -> {json_file}")
            except (
                json.JSONDecodeError,
                ValueError,
                OSError,
                KeyError,
                TypeError,
            ) as e:
                # print(f"❌ Failed to process '{json_file.name}': {e}")
                failed_files.append(json_file.name)

        # Error report
        if failed_files:
            print("\n❌ We lost these files:")
            for name in failed_files:
                print(f"  - {name}")
        else:
            print("\n✅ All JSON files added")

        # Save the output
        try:
            self.__save_orjson(data=unique_data, path=output_path)
            print(
                f"\n💾 Final file saved: {output_path} (Total unique items: {len(unique_data)})"
            )
        except Exception as e:
            print(f"❌ Error in saving final file: {e}")

    def make_new_proccessed_ids_from_file(self, json_in, out_path):
        data = self.__load_orjson(json_in)

        finall_data = []
        for d in data:
            if d["id"]:
                finall_data.append(d["id"])
        finall_data = set(finall_data)
        finall_data = list(finall_data)
        print(f"-- len ids {len(finall_data)}")

        self.__save_orjson(data=finall_data, path=out_path)

    # ------------------------------ Main ------------------------------
    async def __process_item(self, client, item):
        try:
            messages = [
                {"role": "user", "content": item["user_prompt"]},
            ]
            if item.get("system_prompt"):
                messages.append(
                    {"role": "system", "content": item["system_prompt"]}
                )
            if item.get("assistant_prompt"):
                messages.append(
                    {"role": "assistant", "content": item["assistant_prompt"]}
                )

            response = await client.chat.completions.parse(
                model=self.model_name,
                messages=messages,
                temperature=self.temperature,
                top_p=self.top_p,
                reasoning_effort=self.reasoning_effort,
                max_tokens=self.max_token,
                stop=None,
                response_format=self.output_schema,
            )

            parsed = (
                response.choices[0].message.parsed
                if response and response.choices and response.choices[0].message.parsed
                else {"raw_text": str(response)}
            )

            parsed = self.output_schema.model_validate(parsed)
            parsed = parsed.model_dump()
            parsed = dict(parsed)
            parsed["ai_code_version"] = self.ai_code_version
            parsed["id"] = item["id"]
            # parsed["item"] = item
            return parsed, 200

        except asyncio.TimeoutError:
            print(f"⏳ Timeout on item {item['id']}")
            return None, 408

        except Exception:
            print(f"⚠️ Error __process_item {item['id']}: {traceback.format_exc()}")
            return None, 400

    def async_eval(self, processed_id: List = []):
        try:
            asyncio.run(self.__async_eval(processed_id))
        except KeyboardInterrupt:
            print("\n🛑 Interrupted by user.")
            traceback.print_exc()

    async def __async_eval(self, processed_id: List):
        """
        Main single-process async run that produces the final output.
        """
        print("🔹 Starting async data processing...")

        # ------------------ Step 1: recover previously processed ids ------------------
        if not processed_id:
            try:
                processed_id = self.__load_orjson(self._temp_processed_id_path)
                print(
                    f"📂 Loaded existing processed_id from {self._temp_processed_id_path}"
                )
            except Exception:
                print("⚠️ No valid processed_id found. Starting fresh.")
                processed_id = []

        # ------------------ Step 2: prepare the data ------------------
        all_processed_id = set(processed_id)
        all_results = []
        total_time = []

        data = [item for item in self.data if item.get("id") not in all_processed_id]
        print(
            f"➕ Total items: {len(self.data)} - {len(all_processed_id)} = {len(data)}"
        )

        # Nothing left to process
        if not data:
            print("✅ Nothing new to process. All items are already done.")
            return

        # ------------------ Step 3: start processing ------------------
        print(f"🤖 Model: {self.model_name} | Reasoning: {self.reasoning_effort}")
        async with AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) as client:
            semaphore = asyncio.Semaphore(self.semaphore_number)

            async def limited_process(item):
                async with semaphore:
                    return await self.__process_item(client, item)

            tasks = [asyncio.create_task(limited_process(item)) for item in data]

            total_i = 0
            # ✅ handle results in completion order (not list order)
            for i, task in enumerate(asyncio.as_completed(tasks), start=1):
                start = time.time()
                try:
                    parsed, status_code = await asyncio.wait_for(
                        task, timeout=self.request_timeout
                    )  # ⏱ capped at self.request_timeout
                except asyncio.TimeoutError:
                    print(f"⏳ Task {i} timed out completely")
                    parsed, status_code = None, 408
                total_time.append(time.time() - start)

                if status_code == 200:
                    all_results.append(parsed)
                    all_processed_id.add(parsed.get("id"))
                else:
                    print(f"⚠️ Skipped item (status={status_code})")

                total_i += 1
                # ✅ checkpoint every n items
                if total_i >= self.save_number:
                    print(f"total_i {total_i}")
                    print(f"self.save_number {self.save_number}")
                    total_i = 0
                    self.__save_orjson(
                        data=list(all_processed_id),
                        path=self._temp_processed_id_path,
                    )
                    print(f"💾 Auto-saved processed ids: {len(all_processed_id)}")
                    number = self.__get_max_number_file(self._temp_path)
                    print(f"number {number}")
                    temp_output_path = self._temp_path / f"output_{number}.json"
                    self.__save_orjson(data=list(all_results), path=temp_output_path)
                    print(f"💾 Auto-saved partial data: {len(all_results)}")
                    all_results.clear()

            # ✅ after all tasks finish, do a final save for whatever remains
            if total_i > 0 or len(all_results) > 0:
                print("💾 Final save of remaining data...")
                self.__save_orjson(
                    data=list(all_processed_id),
                    path=self._temp_processed_id_path,
                )
                print(f"💾 Auto-saved processed ids: {len(all_processed_id)}")
                number = self.__get_max_number_file(self._temp_path)
                print(f"number {number}")

                temp_output_path = self._temp_path / f"output_{number}.json"
                self.__save_orjson(data=list(all_results), path=temp_output_path)
                print(f"💾 Auto-saved partial data: {len(all_results)}")
                all_results.clear()

        # ------------------ Step 4: save the output ------------------
        final_data_path = self.output_path / f"final_data_{self.task_name}.json"
        processed_id_path = self.output_path / "processed_id.json"

        self.merge_json_dir(input_path=self._temp_path, output_path=final_data_path)
        all_results = self.__load_orjson(final_data_path)
        # make_new_proccessed_ids_from_file()
        self.__save_orjson(data=list(all_processed_id), path=processed_id_path)
        self.__save_orjson(data=all_results, path=final_data_path)

        avg_time = (sum(total_time) / len(total_time)) if total_time else 0
        print(
            f"\n✅ Processing completed!\n"
            f"📊 Total-Data: {len(data)} | "
            f"⭕ Ignored-Data: {len(processed_id)} | "
            f"📦 Processed-Data: {len(all_results)} | "
            f"❌ Loss-Data: {len(data)-len(all_results)} | "
            f"🕒 Avg Time: {avg_time:.2f}'s per item | "
            f"🕒 Total Time: {sum(total_time):.4f}'s | "
            f"💾 Results saved to: {final_data_path}"
        )

    async def single_simple_async_proccess_item(self, item, functions, function_name):
        async with AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) as client:
            semaphore = asyncio.Semaphore(5)
            async with semaphore:
                try:
                    messages = [
                        {"role": "user", "content": item["user_prompt"]},
                    ]
                    if item.get("system_prompt"):
                        messages.append(
                            {"role": "system", "content": item["system_prompt"]}
                        )
                    if item.get("assistant_prompt"):
                        messages.append(
                            {"role": "assistant", "content": item["assistant_prompt"]}
                        )

                    response = await client.chat.completions.parse(
                        model=self.model_name,
                        messages=messages,
                        temperature=self.temperature,
                        top_p=self.top_p,
                        reasoning_effort=self.reasoning_effort,
                        max_tokens=self.max_token,
                        stop=None,
                        response_format=self.output_schema,
                        functions=functions,
                        function_call={"name": function_name}
                    )

                    parsed = (
                        response.choices[0].message.parsed
                        if response and response.choices and response.choices[0].message.parsed
                        else {"raw_text": str(response)}
                    )

                    parsed = self.output_schema.model_validate(parsed)
                    parsed = parsed.model_dump()
                    parsed = dict(parsed)
                    parsed["ai_code_version"] = self.ai_code_version
                    return parsed, 200

                except asyncio.TimeoutError:
                    print(f"⏳ Timeout on item {item}")
                    return None, 408

                except Exception:
                    print(f"⚠️ Error __process_item {item}: {traceback.format_exc()}")
                    return None, 400
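For orientation, a minimal usage sketch of AsyncCore under stated assumptions: the schema, file paths, and endpoint below are hypothetical placeholders, and the input JSON must be a list of records carrying the mandatory "id", "system_prompt", and "user_prompt" string fields that __validate_item enforces:

from pydantic import BaseModel

class Answer(BaseModel):  # hypothetical output schema
    text: str

runner = AsyncCore(
    model_name="gpt-oss-20b",
    task_name="demo",
    output_schema=Answer,
    api_url="http://localhost:8004/v1/",  # hypothetical endpoint
    data_path="./demo_items.json",        # [{"id": "1", "system_prompt": "...", "user_prompt": "..."}]
    request_timeout=60,
)
runner.async_eval()  # resumes from batch_data/processed_id.json when it exists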
28  routers/base_model.py  Executable file
@ -0,0 +1,28 @@
from pydantic import BaseModel
from typing import List


class Title(BaseModel):
    title: str

class Query(BaseModel):
    query: str

class ChatObject(BaseModel):
    title: str
    user_query: str
    model_key: str
    retrived_passage: str
    retrived_ref_ids: str
    model_answer: str
    status: str = 'success'
    prompt_type: str = "question-answer"


class LLMOutput(BaseModel):
    text: str
    source: List[str]

class LLMInput(BaseModel):
    query: str
    knowledge: List[dict]
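These models are the request/response contract of the router; a small illustrative instantiation (the values are hypothetical):

payload = LLMInput(
    query="شرایط فسخ قرارداد چیست؟",
    knowledge=[{"id": "qs01", "content": "..."}],
)
answer = LLMOutput(text="... (qs01)", source=["qs01"])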
337  routers/chatbot_handler.py  Executable file
@ -0,0 +1,337 @@
import numpy as np
import torch, orjson, faiss, re
from typing import List
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from FlagEmbedding import FlagReranker
from pathlib import Path

# nlist = 2048
# quantizer = faiss.IndexFlatIP(dim)
# index = faiss.IndexIVFFlat(quantizer, dim, nlist)
# index.train(embeddings)
# index.add(embeddings)

class InitHybridRetrieverReranker:
    def __init__(
        self,
        embeder_path,
        reranker_path,
        dict_content: List[dict],
        faiss_index,
        dense_alpha: float = 0.6,
        device: str = None,
        cache_dir="/src/MODELS",
        batch_size=512,
    ):

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.device = device
        self.dense_alpha = dense_alpha

        # ===============================
        # Convert the input only once
        # ===============================
        self.content_list = [x["content"] for x in dict_content]
        self.ids_list = [x["id"] for x in dict_content]
        self.N = len(self.content_list)
        self.faiss_index = faiss_index

        # Dense embedder
        self.embedder = SentenceTransformer(
            local_files_only=True,
            model_name_or_path=embeder_path,
            cache_folder=cache_dir,
            device=self.device,
            similarity_fn_name="cosine",
        )

        # TF-IDF
        self.vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            token_pattern=r"(?u)\b[\w\u0600-\u06FF]{2,}\b",
        )
        self.tfidf_matrix = self.vectorizer.fit_transform(self.content_list)

        # Reranker
        self.reranker = FlagReranker(
            model_name_or_path=reranker_path,
            local_files_only=True,
            use_fp16=True,
            devices=device,
            cache_dir=cache_dir,
            batch_size=batch_size,
            normalize=True,
            # max_length=1024,
            # trust_remote_code=False,
            # query_max_length=
        )

        print("RAG Ready — Retriever + Reranker Loaded")

    # ================================
    # Dense Search (FAISS)
    # ================================
    async def dense_retrieve(self, query: str, top_k: int):
        if top_k <= 0:
            return [], np.array([])

        emb = self.embedder.encode(query, convert_to_numpy=True).astype(np.float32)

        D, I = self.faiss_index.search(emb.reshape(1, -1), top_k)
        return I[0], D[0]

    # ================================
    # Sparse Search (TF-IDF)
    # ================================
    async def sparse_retrieve(self, query: str, top_k: int):
        if top_k <= 0:
            return [], np.array([])

        q_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(q_vec, self.tfidf_matrix)[0]

        k = min(top_k, len(sims))
        idx = np.argpartition(-sims, k - 1)[:k]
        idx = idx[np.argsort(-sims[idx], kind="mergesort")]

        return idx, sims[idx]

    # ================================
    # Reciprocal Rank Fusion
    # ================================
    async def fuse(self, d_idx, d_scores, s_idx, s_scores, top_k=50, k_rrf=60):
        combined = {}

        for rank, idx in enumerate(d_idx):
            combined[idx] = combined.get(idx, 0) + 1.0 / (k_rrf + rank)

        for rank, idx in enumerate(s_idx):
            combined[idx] = combined.get(idx, 0) + 1.0 / (k_rrf + rank)

        sorted_items = sorted(combined.items(), key=lambda x: x[1], reverse=True)

        return [i[0] for i in sorted_items[:top_k]]

    # ================================
    # Rerank
    # ================================
    async def rerank(self, query: str, cand_idx: List[int], final_k: int = 10):
        if not cand_idx:
            return []

        passages = [self.content_list[i] for i in cand_idx]
        pairs = [[query, p] for p in passages]

        scores = self.reranker.compute_score(pairs, normalize=True, max_length=512)

        if isinstance(scores, float):
            scores = [scores]

        idx_score = list(zip(cand_idx, scores))
        idx_score.sort(key=lambda x: x[1], reverse=True)

        return idx_score[:final_k]

    # ================================
    # Main Search Function
    # ================================
    async def search_base(
        self,
        query: str,
        topk_dense=50,
        topk_sparse=50,
        pre_rerank_k=50,
        final_k=10,
    ):

        d_idx, d_scores = await self.dense_retrieve(query, topk_dense)
        s_idx, s_scores = await self.sparse_retrieve(query, topk_sparse)

        cand_idx = await self.fuse(d_idx, d_scores, s_idx, s_scores, pre_rerank_k)
        final_rank = await self.rerank(query, cand_idx, final_k)

        # ===============================
        # Fast, clean output
        # ===============================
        return [
            {
                "id": self.ids_list[idx],
                "content": self.content_list[idx],
                "score": score,
            }
            for idx, score in final_rank
        ]


def load_orjson(path: str | Path):
    path = Path(path)
    with path.open("rb") as f:  # must be opened in binary mode for orjson
        return orjson.loads(f.read())


def save_orjson(path, data):
    with open(path, "wb") as f:
        f.write(
            orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
        )


WEB_LINK = "https://majles.tavasi.ir/entity/detail/view/qsection/"
# ref = f"[«{i}»](https://majles.tavasi.ir/entity/detail/view/qsection/{idx})"

def get_in_form(title: str, sections: list, max_len: int = 4000):
    chunks = []
    current = f"برای پرسش: {title}\n\n"
    ref_text = "«منبع»"

    for i, data in enumerate(sections, start=1):
        sec_text = data.get("content", "")
        idx = data.get("id")

        # Build the complete ref link
        ref = f"[{ref_text}]({WEB_LINK}{idx})"
        # Full text for this item
        block = f"{i}: {sec_text}\n{ref}\n\n"

        # If adding this item would exceed the limit → start a new chunk
        if len(current) + len(block) > max_len:
            chunks.append(current.rstrip())
            current = ""

        current += block

    # Append the last chunk as well
    if current.strip():
        chunks.append(current.rstrip())

    return chunks


def format_answer_bale(answer_text: str, sources: list, max_len: int = 4000):
    """
    answer_text: the model output, which contains markers such as (منبع: qs2117427)
    sources: e.g. ['qs2117427']
    """
    ref_text = "«منبع»"

    def make_link(src):
        return f"[{ref_text}]({WEB_LINK}{src})"


    # Pattern to detect any parenthesized group containing one or more codes,
    # e.g.: (qs123) or (qs123, qs456, qs789)
    pattern = r"\((?:منبع[:: ]+)?([a-zA-Z0-9_, ]+)\)"

    def replace_source(m):
        content = m.group(1)
        codes = [c.strip() for c in content.split(",")]  # split multiple codes
        links = [make_link(code) for code in codes]
        full_match = m.group(0)
        # if "منبع" in full_match:
        #     print(f'Found explicit source(s): {links}')
        # else:
        #     print(f'Found implicit source(s): {links}')
        return ", ".join(links)  # replace every code with its link

    # Apply the replacement to the text
    answer_text = re.sub(pattern, replace_source, answer_text)

    # If the length is under max_len → done
    if len(answer_text) <= max_len:
        return [answer_text]

    # Split the text when it is too long
    chunks = []
    current = ""

    sentences = answer_text.split(". ")
    for sentence in sentences:
        st = sentence.strip()
        if not st.endswith("."):
            st += "."

        if len(current) + len(st) > max_len:
            chunks.append(current.strip())
            current = ""

        current += st + " "

    if current.strip():
        chunks.append(current.strip())

    return chunks


def get_user_prompt(query: str):
    """
    Take a query and prepare a prompt that generates a title for it.
    """
    title_prompt = f"برای متن {query} یک عنوان با معنا که بین 3 تا 6 کلمه داشته باشد، در قالب یک رشته متن ایجاد کن. سبک و لحن عنوان، حقوقی و کاملا رسمی باشد. عنوان تولید شده کاملا ساده و بدون هیچ مارک داون یا علائم افزوده ای باشد. غیر از عنوان، به هیچ وجه توضیح اضافه ای در قبل یا بعد آن اضافه نکن."
    return title_prompt


def format_knowledge_block(knowledge):
    lines = []
    for item in knowledge:
        _id = item.get("id", "unknown")
        _content = item.get("content", "")
        lines.append(f"- ({_id}) { _content }")
    return "\n".join(lines)


def get_user_prompt2(obj):
    query = obj.query
    knowledge = obj.knowledge

    prompt = f"""
شما باید تنها بر اساس اطلاعات ارائه شده پاسخ بدهید و هیچ دانشی خارج از آنها استفاده نکنید.

### پرسش:
{query}

### اسناد قابل استناد:
{format_knowledge_block(knowledge)}

### دستور تولید خروجی:
- پاسخی کاملاً دقیق، تحلیلی و مفهومی ایجاد کن
- لحن رسمی و حقوقی باشد
- اگر پاسخ نیاز به ترکیب چند سند دارد، آنها را ادغام کن
- اگر دادهها کافی نبود، این موضوع را شفاف اعلام کن اما اطلاعات مرتبط را همچنان ارائه بده
"""
    return prompt


def get_user_prompt3(query, knowledge_json):
    sys = f"""Answer the following based ONLY on the knowledge:

Query:
{query}

Knowledge:
{knowledge_json}"""
    return sys

def load_faiss_index(index_path: str, metadata_path: str):
    """Load the FAISS index and its metadata (list of passages + titles)."""
    index = faiss.read_index(index_path)

    metadata = load_orjson(metadata_path)

    metadata = [
        {
            "id": item["id"],
            "content": item["content"],
            "prefix": item["prefix"],
        }
        for item in metadata
    ]
    return metadata, index
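As a quick illustration of the citation rewriting in format_answer_bale (the input string below is hypothetical), both the explicit (منبع: qsID) form and bare (qs123, qs456) groups are turned into «منبع» links under WEB_LINK:

text = "ماده مربوط به فسخ (منبع: qs2117427) و همچنین (qs123, qs456)."
chunks = format_answer_bale(text, sources=["qs2117427"])
# chunks[0] now carries [«منبع»](https://majles.tavasi.ir/entity/detail/view/qsection/qs2117427)
# in place of each parenthesized qsID group, split into ≤4000-character pieces.
print(chunks[0])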
169  routers/rag_base.py  Executable file
@ -0,0 +1,169 @@
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse
import time, os, traceback
from .base_model import Query, LLMOutput, LLMInput, Title
from .ai_data_parser import AsyncCore
from .chatbot_handler import (
    InitHybridRetrieverReranker,
    format_answer_bale,
    get_user_prompt2,
    get_user_prompt3,
    load_faiss_index,
    get_in_form,
)
from .static import (
    EMBED_MODEL_PATH,
    FAISS_INDEX_PATH,
    FAISS_METADATA_PATH,
    LLM_URL,
    SYSTEM_PROMPT_FINALL,
    RERANKER_MODEL_PATH,
    LLM_ERROR,
    MODEL_KEY,
    MODEL_NAME,
    OUTPUT_PATH_LLM,
    REASONING_EFFORT,
    TASK_NAME,
    LLM_TIME_OUT, MAX_TOKEN, SYSTEM_PROPMT2
)


# ################################################## Global-params
router = APIRouter(tags=["ragchat"])
# settings = get_settings()


METADATA_DICT, FAISS_INDEX = load_faiss_index(
    index_path=FAISS_INDEX_PATH, metadata_path=FAISS_METADATA_PATH
)

RAG = InitHybridRetrieverReranker(
    embeder_path=EMBED_MODEL_PATH,
    reranker_path=RERANKER_MODEL_PATH,
    dict_content=METADATA_DICT,
    faiss_index=FAISS_INDEX,
    dense_alpha=0.6,
    device="cuda",
)

RUNNER_PROMPT = AsyncCore(
    model_name=MODEL_NAME,
    api_url=LLM_URL,
    output_path=OUTPUT_PATH_LLM,
    task_name=TASK_NAME,
    output_schema=LLMOutput,
    reasoning_effort=REASONING_EFFORT,
    ai_code_version=MODEL_KEY,
    request_timeout=LLM_TIME_OUT,
    max_token=MAX_TOKEN,
    save_number=1,
)

functions = [
    {
        "name": "legal_answer",
        "description": "خروجی ساختیافته از تحلیل حقوقی با ارجاع کامل به اسناد",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "متن کامل پاسخ شامل ارجاع (qsID)"
                },
                "source": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "فهرست شناسه اسناد استفاده شده"
                }
            },
            "required": ["text", "source"]
        }
    }
]


async def chat_bot_run(query):
    try:
        s = time.time()
        sections_dict = await RAG.search_base(
            query,
            final_k=10,
            topk_dense=100,
            topk_sparse=100,
            pre_rerank_k=100,
        )
        e = time.time()
        input_data = LLMInput(query=query, knowledge=sections_dict)
        # prompt = get_user_prompt2(input_data)
        prompt = get_user_prompt3(query=query, knowledge_json=sections_dict)

        llm_answer, _ = await RUNNER_PROMPT.single_simple_async_proccess_item(
            item={"user_prompt": prompt, "system_prompt": SYSTEM_PROPMT2},
            functions=functions,
            function_name="legal_answer",
        )
        ee = time.time()
        finall = format_answer_bale(
            answer_text=llm_answer["text"], sources=llm_answer["source"]
        )
        eee = time.time()
        print(
            f'Rag = {e-s}',
            f'llm_answer = {ee-e}',
            f'Form = {eee-ee}',
            sep='\n'
        )
        return finall
    except Exception:
        traceback.print_exc()


async def rag_run(query):
    try:
        s = time.time()
        sections_dict = await RAG.search_base(
            query,
            final_k=10,
            topk_dense=100,
            topk_sparse=100,
            pre_rerank_k=100,
        )
        e = time.time()
        finall = get_in_form(title=query, sections=sections_dict)
        ee = time.time()
        print(
            f'Rag = {e-s}',
            f'Form = {ee-e}',
            sep='\n'
        )
        return finall
    except Exception:
        traceback.print_exc()

@router.post("/run_chat")
async def run_chat(payload: Query, request: Request):
    s = time.time()
    try:
        answer = await chat_bot_run(payload.query)
    except Exception:
        print("chat_bot_run FAIL!")
        answer = LLM_ERROR
    e = time.time()
    print(f"Total Time {e-s:.2f}'s")
    return JSONResponse({"result": answer}, status_code=201)


@router.post("/run_rag")
async def run_rag(payload: Query, request: Request):  # renamed from run_chat so it no longer shadows the handler above
    s = time.time()
    try:
        answer = await rag_run(payload.query)
    except Exception:
        print("rag_run FAIL!")
        answer = LLM_ERROR
    e = time.time()
    print(f"Total Time {e-s:.2f}'s")
    return JSONResponse({"result": answer}, status_code=201)
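A minimal client sketch against these routes; the host and port are assumptions taken from run_env.bash further down (--port=8009), so adjust for your deployment:

import requests  # assumes the requests package is installed

resp = requests.post(
    "http://localhost:8009/run_chat",
    json={"query": "شرایط فسخ قرارداد اجاره چیست؟"},  # hypothetical query
    timeout=120,
)
print(resp.status_code)       # 201 on success
print(resp.json()["result"])  # list of message chunks ready for Bale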
8  routers/readme.md  Executable file
@ -0,0 +1,8 @@
# "gpt-4o", "gpt-4o-mini", "deepseek-chat", "gemini-2.0-flash", gemini-2.5-flash-lite
# gpt-4o : 500
# gpt-4o-mini : 34
# deepseek-chat : 150
# gemini-2.0-flash : error
# cf.gemma-3-12b-it : 1
# gemini-2.5-flash-lite : 35 (very good)
160  routers/static.py  Executable file
@ -0,0 +1,160 @@
from dotenv import load_dotenv
import os

LLM_URL = "http://localhost:8004/v1/"  # "http://172.16.29.102:8001/v1/"
EMBED_MODEL_PATH = "/home2/MODELS/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2/snapshots/86741b4e3f5cb7765a600d3a3d55a0f6a6cb443d"  # "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
RERANKER_MODEL_PATH = "/home2/MODELS/bge_reranker_m3_v2/bge-reranker-v2-m3"  # "BAAI/bge-reranker-v2-m3"


FAISS_INDEX_PATH = "/home2/rag_qavanin_api/data/qavanin-faiss/faiss_index_qavanin_285k.index"  # "/src/app/data/qavanin-faiss/faiss_index_qavanin_285k.index"
FAISS_METADATA_PATH = "/home2/rag_qavanin_api/data/qavanin-faiss/faiss_index_qavanin_285k_metadata.json"  # "/src/app/data/qavanin-faiss/faiss_index_qavanin_285k_metadata.json"


PATH_LOG = "./data/llm-answer/"
load_dotenv()
RERANK_BATCH = int(os.environ.get("rerank_batch_size"))
API_KEY = os.getenv("api_key")
LLM_ERROR = "با عرض پوزش؛ ❌in ragمتاسفانه خطایی رخ داده است. لطفا لحظاتی دیگر دوباره تلاش نمائید"
MODEL_KEY = "oss-120-hamava"
MODEL_NAME = "gpt-oss-20b"  # "gpt-oss-120b"
OUTPUT_PATH_LLM = "/home2/rag_qa_chat2/data/_temp"
TASK_NAME = "bale-chat"
REASONING_EFFORT = "low"
LLM_TIME_OUT = 30
MAX_TOKEN = 8192


SYSTEM_PROMPT_FINALL = """شما یک دستیار تحلیلگر حقوقی متخصص در استنتاج دقیق از اسناد قانونی هستید.

ورودی شما شامل:
- یک پرسش کاربر (query)
- مجموعهای از چند متن قانونی (knowledge)، که هر کدام شامل:
- id (شناسه سند)
- content (متن بند قانونی)

وظیفه شما:
1. پرسش را دقیق بخوانید و فقط بر اساس اطلاعات موجود در اسناد ارائه شده پاسخ دهید.
2. از خودتان هیچ اطلاعات جدید، تخمین، تفسیر شخصی، یا دانش خارج از اسناد وارد نکنید.
3. اگر یک پاسخ نیاز به ترکیب چند سند دارد، آنها را استخراج و در هم ادغام کنید و نتیجه را کاملاً روان و قابل فهم بنویسید.
4. پاسخ باید:
- تحلیلمحور
- شفاف
- فارسی استاندارد و حقوقی
- ساختاریافته و قابل ارائه باشد
5. هر جمله یا بند از پاسخ **حتماً باید به یک یا چند id سند مشخص وصل شود**.
- اگر برای جملهای منبعی پیدا نشد، صریحاً در متن ذکر کنید: "(هیچ منبع مرتبط موجود نیست)"
- از اضافه کردن idهای فرضی یا خارج از knowledge خودداری شود.
6. از تکرار مستقیم یا کپی کردن جملات خام اسناد اجتناب کنید. آنها را با بازنویسی تحلیلی به کار ببرید.
7. در پایان پاسخ:
- حتماً لیست تمام شناسههای سندهای استفادهشده را برگردانید.
- فقط id های اسنادی که واقعاً در پاسخ استفاده شدهاند ذکر شوند به صورت دقیقا: (qs2127)
- ترتیب اهمیت و ارتباط در لیست رعایت شود.
8. پاسخ نهایی باید دقیقاً در فرمت JSON زیر برگردد و هیچ متن دیگری خارج از آن اضافه نشود:

{
"text" : "متن کامل پاسخ تحلیلی و دقیق به پرسش، هر جمله یا بند با (id) سند مرتبط یا (هیچ منبع مرتبط موجود نیست) مشخص شود.",
"source": ["qs123", "qs545", ...]
}

ورودی نمونه:
{
query: "متن سوال",
knowledge: [
{"id": "qs01", "content": "..."},
{"id": "qs02", "content": "..."},
...
]
}
"""

SYSTEM_PROPMT2 = '''You are a legal reasoning model that MUST base the answer ONLY on the documents provided in `knowledge`.

STRICT RULES:
1. You have no knowledge outside the provided documents.
2. Before generating the answer you MUST:
A. Extract the list of all valid document IDs from `knowledge`.
B. Think through the answer sentence-by-sentence.
C. Every sentence MUST be directly supported by one or more document IDs.

3. Any sentence that is not directly supported by at least one `id` MUST be removed.

4. Document IDs must appear in the text as:
(qs123)
(qs1002)
etc.

5. The final answer MUST be returned strictly as:
{
"text": "...",
"source": ["qs001", "qs999"]
}

Where:
- `text` contains the final written response with citations inline.
- `source` contains ONLY the list of IDs actually used in the answer, no duplicates, order by relevance.

6. JSON MUST be valid. No comments, no trailing commas.

7. If the documents have even the slightest relevance to the question, generate an answer from them while indicating that no close answer to the user's question was found.

8. Finally, if no document supports the question, return:
{
"text": "هیچ سند مرتبطی یافت نشد.",
"source": []
}

9. Length must NOT be shortened. Provide full analysis, fully detailed.
Before generating your answer:

Extract the list of VALID IDs from `knowledge`.
You MUST NOT invent IDs.
Any ID not in that list is forbidden.

'''
#############
"""
شما یک دستیار تحلیلگر حقوقی متخصص در استنتاج دقیق از اسناد قانونی هستید.

ورودی شما شامل:
- یک پرسش کاربر (query)
- مجموعهای از چند متن قانونی (knowledge)، که هر کدام شامل:
- id (شناسه سند)
- content (متن بند قانونی)

وظیفه شما:
1. پرسش را دقیق بخوانید و فقط بر اساس اطلاعات موجود در اسناد ارائه شده پاسخ دهید.
2. از خودتان هیچ اطلاعات جدید، تخمین، تفسیر شخصی، یا دانش خارج از اسناد وارد نکنید.
3. اگر یک پاسخ نیاز به ترکیب چند سند دارد، آنها را استخراج و در هم ادغام کنید و نتیجه را کاملاً روان و قابل فهم بنویسید.
4. پاسخ باید:
- تحلیلمحور
- شفاف
- فارسی استاندارد و حقوقی
- ساختاریافته و قابل ارائه باشد
5. از تکرار مستقیم یا کپی کردن جملات خام اسناد اجتناب کنید. آنها را با بازنویسی تحلیلی به کار ببرید.
6. اگر اطلاعات موجود برای پاسخ کامل کافی نبود:
- این موضوع را صریح اعلام کنید
- اما موارد مرتبط موجود را همچنان خلاصه و ارائه کنید
7. در پایان پاسخ:
- لیست شناسههای سندهای استفادهشده را برگردانید
- فقط id های اسنادی که واقعاً در پاسخ استفاده شدهاند ذکر شوند به صورت دقیقا : (qs2127)
- ترتیب اهمیت در لیست رعایت شود
8. پاسخ نهایی باید دقیقاً در فرمت زیر برگردد:

خروجی نمونه:
{
"text" : "متن کامل پاسخ تحلیلی و دقیق به پرسش",
"source": ["qs123", "qs545", ...]
}

بدون هیچ توضیح یا متن اضافه خارج از این قالب.

ورودی نمونه:
{
query: "متن سوال",
knowledge: [
{"id": "qs01", "content": "..."},
{"id": "qs02", "content": "..."},
...
]
}"""
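One fragile spot above: RERANK_BATCH = int(os.environ.get("rerank_batch_size")) raises a TypeError whenever the variable is absent from .env. A defensive variant could supply a fallback; the default of "256" mirrors the sample .env and is an assumption, not part of this commit:

# Sketch with a fallback default; "256" mirrors the sample .env value.
RERANK_BATCH = int(os.environ.get("rerank_batch_size", "256"))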
File diff suppressed because it is too large
@ -1,47 +0,0 @@
from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel

import routes.chatbot_handler as chatbot_handler
import datetime
import random


router = APIRouter(tags=["ragchat"])
# settings = get_settings()


# Data model for API requests
class RagQueryModal(BaseModel):
    query: str


async def create_chat_id():
    date = str((datetime.datetime.now())).replace(' ', '-').replace(':', '').replace('.', '-')

    chat_id = f'{date}-{random.randint(100000, 999999)}'

    return chat_id


@router.post("/emergency_call")
async def emergency_call(payload: RagQueryModal):
    print('emergency generate answer ...')
    chat_id = await create_chat_id()
    answer = await chatbot_handler.ask_chatbot_avalai(payload.query, chat_id)
    # print('emergency answer ...', answer)
    await chatbot_handler.credit_refresh()
    print('*** ... ready for next ... ***')
    return {"answer": answer}


@router.post("/run_chat")
async def run_chat(payload: RagQueryModal, request: Request):
    # request.state.app
    print('run_chat start ...')
    chat_id = await create_chat_id()
    answer = await chatbot_handler.ask_chatbot(payload.query, chat_id)
    print('*** ... ready for next ... ***')

    return {"answer": answer}
@ -1,3 +1,3 @@
-docker stop qachat
-docker rm qachat
-docker run --name qachat -p 2425:80 --net qachat_net --gpus=all -v ./:/src/app/ -v ./qavanin-faiss/:/src/app/qavanin-faiss/ -v ./llm-answer/:/src/app/llm-answer/ -v ./../MODELS:/src/MODELS -v ./../cache:/root/.cache/huggingface/hub -it --restart unless-stopped docker.tavasi.ir/tavasi/qachat2:1.0.0
+docker stop qachat2
+docker rm qachat2
+docker run --name qachat2 -p 8009:80 --net qachat_net --gpus=all -v ./:/src/app/ -v ./data/:/src/app/data/ -v ./../MODELS:/src/MODELS -v ./../cache:/root/.cache/huggingface/hub -it --restart unless-stopped docker.tavasi.ir/tavasi/qachat2:1.0.0
3  run_env.bash  Executable file

@ -0,0 +1,3 @@
source /home2/.venv/bin/activate

uvicorn main:app --port=8009 --host=0.0.0.0