change project strategy

parent 713776f6b0
commit 2218df5d09
@@ -1,2 +1,3 @@
 ./qavanin-faiss
 ./llm-answer
+./data
3	.gitignore	vendored	Normal file → Executable file
@@ -1,5 +1,4 @@
 __pycache__/
-qavanin-faiss/faiss_index_qavanin_285k_metadata.json
-qavanin-faiss/faiss_index_qavanin_285k.index
+data/
 .vscode
 .gitignore
2	_old/README.md	Executable file

@@ -0,0 +1,2 @@
+# Qavanin Chatbot
+
0	bale_qabot.py → _old/bale_qabot.py	Normal file → Executable file
@@ -1,5 +1,5 @@
 import json
-import chatbot_handler as chat
+import chatbot_handler as chatbot_handler
 # import bale_qabot
 import os
 import numpy as np
@@ -56,18 +56,20 @@ async def ping():
 @chatbot.post("/emergency_call")
 async def emergency_call(query: Query):
     print('emergency generate answer ...')
-    chat_id = await chat.create_chat_id()
-    answer = await chat.ask_chatbot_avalai(query.query, chat_id)
-    await chat.credit_refresh()
+    chat_id = await chatbot_handler.create_chat_id()
+    print('emergency chat_id ...', chat_id)
+    answer = await chatbot_handler.ask_chatbot_avalai(query.query, chat_id)
+    print('emergency answer ...', answer)
+    await chatbot_handler.credit_refresh()
+    print('credit updated')
     return {"answer": answer}

 @chatbot.post("/run_chat")
 async def run_chat(query: Query):
     print('regular generate answer ...')
-    chat_id = await chat.create_chat_id()
-    answer = await chat.ask_chatbot(query.query, chat_id)
-    await chat.credit_refresh()
+    chat_id = await chatbot_handler.create_chat_id()
+    answer = await chatbot_handler.ask_chatbot(query.query, chat_id)
+    await chatbot_handler.credit_refresh()

     return {"answer": answer}
@@ -83,20 +85,20 @@ if __name__ == "__main__":
             continue
         start = (datetime.datetime.now())
         # result = test_dataset()
-        result = chat.single_query(query)
+        result = chatbot_handler.single_query(query)
         end_retrive = datetime.datetime.now()
         print('-'*40)
         print(f'retrive duration: {(end_retrive - start).total_seconds()}')

         prompt = f'برای پرسش "{query}" از میان مواد قانونی "{result}" .پاسخ مناسب و دقیق را استخراج کن. درصورتی که مطلبی مرتبط با پرسش در متن پیدا نشد، فقط پاسخ بده: "متاسفانه در منابع، پاسخی پیدا نشد!"'
-        llm_answer = chat.llm_request(prompt)
+        llm_answer = chatbot_handler.llm_request(prompt)

         print('-'*40)
         print(f'llm duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')

         refrences = ''
-        recognized_refrences = chat.find_refrences(llm_answer)
-        llm_answer = chat.replace_refrences(llm_answer, recognized_refrences)
+        recognized_refrences = chatbot_handler.find_refrences(llm_answer)
+        llm_answer = chatbot_handler.replace_refrences(llm_answer, recognized_refrences)

         with open('./llm-answer/result.txt', mode='a+', encoding='utf-8') as file:
             result_message = f'متن پرامپت: {query.strip()}\n\nپاسخ: {llm_answer} \n----------------------------------------------------------\n'
@@ -202,7 +202,7 @@ async def oss_request(query):
     response_dict['output'] = str(response)
     async with aiofiles. open('./llm-answer/messages.json', mode='w', encoding='utf-8') as output:
         await output.write(json.dumps(response_dict, ensure_ascii=False, indent=2))
-    print('response created')
+    print('oss response created')
     async with aiofiles.open('./llm-answer/chat-objs.txt', mode='a+', encoding='utf-8') as file:
         response_value = '0'
         await file.write(response_value) # estimated_cost
@@ -239,13 +239,13 @@ async def llm_request(query, model):
     # gemini-2.5-flash-lite : 35 خیلی خوب

     answer = response.choices[0].message.content
-    print('$'*50)
-    print(f'answer: {answer}')
-    print('$'*50)
+    # print('$'*50)
+    # print(f'answer: {answer}')
+    # print('$'*50)
     cost_prompt = response.estimated_cost['irt']
-    print('$'*50)
-    print(f'answer: {cost_prompt}')
-    print('$'*50)
+    # print('$'*50)
+    # print(f'answer: {cost_prompt}')
+    # print('$'*50)
     # پاسخ را هم به سابقه اضافه میکنیم
     # messages.append({"role": "assistant", "content": answer})
     # print(f'type(response): {type(response)}')
@@ -254,7 +254,7 @@ async def llm_request(query, model):
     response_dict['output'] = str(response)
     async with aiofiles. open('./llm-answer/messages.json', mode='w', encoding='utf-8') as output:
         await output.write(json.dumps(response_dict, ensure_ascii=False, indent=2))
-    print('response created')
+    print('llm response created')
     async with aiofiles.open('./llm-answer/chat-objs.txt', mode='a+', encoding='utf-8') as file:
         response_value = f"{response.estimated_cost['irt']}\n-------------------------------\n\n"
         await file.write(response_value) # estimated_cost
@@ -496,7 +496,7 @@ async def single_query(query: str):
     # query = cleaning(query)
     retrived_sections_ids = []

-    retrived_sections = pipe.search(query, content_list, topk_dense=100, topk_sparse=100, pre_rerank_k=100, final_k=15)
+    retrived_sections = pipe.search(query, content_list, topk_dense=100, topk_sparse=100, pre_rerank_k=100, final_k=10)
     final_similars = ''
     for i, row in enumerate(retrived_sections, 1):
         id_value = '{' + str(ids[row['idx']]) + '}'
@@ -545,7 +545,7 @@ async def replace_refrences(llm_answer: str, refrences_list:List[str]) -> List[str]:
     # refrences = ''
     for index, ref in enumerate(refrences_list,1):
         new_ref = '{' + str(ref) + '}'
-        llm_answer = llm_answer.replace(new_ref, f'[«{str(index)}»](https://majles.tavasi.ir/entity/detail/view/qsection/{ref}) ')
+        llm_answer = llm_answer.replace(new_ref, f' [«{str(index)}»](https://majles.tavasi.ir/entity/detail/view/qsection/{ref}) ')
         # id = ref.lstrip('{')
         # id = id.rstrip('}')
         # refrences += ''.join(f'[{index}] https://majles.tavasi.ir/entity/detail/view/qsection/{id}\n')
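For orientation, the hunk above only adds a leading space before each generated link; the surrounding function rewrites the {qsXXXX} placeholders emitted by the LLM into numbered markdown links. A minimal synchronous sketch of that behavior (the real function is async, and the sample call below is made up for illustration):

def replace_refrences(llm_answer, refrences_list):
    # Turn each '{<section id>}' placeholder into a numbered markdown link to that law section.
    for index, ref in enumerate(refrences_list, 1):
        token = '{' + str(ref) + '}'
        link = f' [«{index}»](https://majles.tavasi.ir/entity/detail/view/qsection/{ref}) '
        llm_answer = llm_answer.replace(token, link)
    return llm_answer

# Hypothetical usage: replace_refrences('... {qs911698} ...', ['qs911698'])
# returns '...  [«1»](https://majles.tavasi.ir/entity/detail/view/qsection/qs911698)  ...'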
@@ -584,6 +584,7 @@ async def get_title_system_prompt():


 async def ask_chatbot_avalai(query:str, chat_id:str):
+    print('ask avalai func')
     prompt_status = True
     llm_model = ''
     llm_answer = ''
@@ -623,19 +624,17 @@ async def ask_chatbot_avalai(query:str, chat_id:str):
     '''


-    try:
     for model in models:
         before_prompt_credit = await credit_refresh()
         llm_model = model
         print(f'using model: {model}')
         try:
             llm_answer, cost_prompt = await llm_request(prompt, model)
             # llm_answer, cost_prompt = await oss_request(prompt)
-        except:
-            print(f'error in ask-chatbot-avalai model:{model}')
-            continue
             break
         except Exception as error:
+            print(f'error in ask-chatbot-avalai model:{model}')
             after_prompt_credit = await credit_refresh()
             prompt_cost = int(before_prompt_credit) - int(after_prompt_credit)
             error = f'model: {model} \n{error}\n\n'
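The restructuring above removes the outer try around the model loop so that each model gets its own try/except: the loop asks a model, breaks on success, and on failure logs the error, measures the credit spent, and moves on to the next model. A minimal sketch of that fallback pattern, assuming a models list and this module's async llm_request and credit_refresh helpers:

async def ask_with_fallback(prompt, models):
    # Try each configured model in order and return the first successful answer.
    for model in models:
        before_prompt_credit = await credit_refresh()
        try:
            llm_answer, cost_prompt = await llm_request(prompt, model)
            return model, llm_answer, cost_prompt
        except Exception as error:
            after_prompt_credit = await credit_refresh()
            prompt_cost = int(before_prompt_credit) - int(after_prompt_credit)
            print(f'error in ask-chatbot-avalai model:{model}: {error} (cost {prompt_cost})')
            continue
    return None, '', 0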
@@ -646,6 +645,8 @@ async def ask_chatbot_avalai(query:str, chat_id:str):
                 await file.write(error)
             prompt_status = False
             status_text = 'با عرض پوزش، سرویس موقتا در دسترس نیست. لطفا دقایقی دیگر دوباره تلاش نمائید!'
+            continue


     # حالتی که وضعیت پرامپت، نامعتبر باشد، یک شی با مقادیر زیر برگردانده می شود
     else:
@@ -710,17 +711,22 @@ async def ask_chatbot_avalai(query:str, chat_id:str):
             'status' : True, # or False # bool
         }
         prev_chat_data = []
+        number = 1
         try:
-            async with aiofiles.open('./llm-answer/chat-messages1.json', mode='r', encoding='utf-8') as file:
+            async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='r', encoding='utf-8') as file:
                 content = await file.read()
             prev_chat_data = json.loads(content)
             prev_chat_data.append(chat_obj)
         except:
-            pass
+            number += 1

         prev_chat_data.append(chat_obj)
-        async with aiofiles. open('./llm-answer/chat-messages1.json', mode='w', encoding='utf-8') as output:
+        async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='w', encoding='utf-8') as output:
             await output.write(json.dumps(prev_chat_data, ensure_ascii=False, indent=2))

+        async with aiofiles.open(f'./llm-answer/chat-messages-answer{number}.txt', mode='a+', encoding='utf-8') as output:
+            await output.write(f'{chat_obj}\n+++++++++++++++++++++++++++\n')

         # save_result(chat_obj)

         # ایجاد آبجکت بازگشتی به فرانت
@@ -732,6 +738,7 @@ async def ask_chatbot_avalai(query:str, chat_id:str):
     return chat_obj

 async def ask_chatbot(query:str, chat_id:str):
+    print('ask oss func')
     prompt_status = True
     llm_model = 'gpt.oss.120b'
     llm_answer = ''
@@ -741,20 +748,21 @@ async def ask_chatbot(query:str, chat_id:str):
     if query == '':
         prompt_status = False



     # در صورتی که وضعیت پرامپت معتبر باشد، وارد فرایند شو
     if prompt_status:

         before_title_time = datetime.datetime.now()
         title_system_prompt = await get_title_system_prompt()
         title_user_prompt = await get_title_user_prompt(query)
+        title = ''
         # title, cost_title = await llm_base_request(title_system_prompt, title_user_prompt)
-        title, cost_title = await oss_base_request(title_system_prompt, title_user_prompt)
+        # title, cost_title = await oss_base_request(title_system_prompt, title_user_prompt)
         if not title:
             title = query

         title_prompt_duration = (datetime.datetime.now() - before_title_time).total_seconds()
+        print('-'*40)
+        print(f'title_prompt_duration: {title_prompt_duration}')

         if title == '':
             title = query.split()[0:10]
@@ -762,8 +770,6 @@
         start_time = (datetime.datetime.now())
         result_passages_text, result_passages_ids = await single_query(query)
         end_retrive = datetime.datetime.now()
-        print('-'*40)
-        print(f'title_prompt_duration: {title_prompt_duration}')
         retrive_duration = (end_retrive - start_time).total_seconds()
         print(f'retrive duration: {str(retrive_duration)}')

@@ -856,15 +862,29 @@ async def ask_chatbot(query:str, chat_id:str):
             'status_text' : status_text, # str
             'status' : True, # or False # bool
         }
-        prev_chat_data = []
-        async with aiofiles.open('./llm-answer/chat-messages1.json', mode='r', encoding='utf-8') as file:
-            content = await file.read()
-            prev_chat_data = json.loads(content)
-            prev_chat_data.append(chat_obj)

-        async with aiofiles. open('./llm-answer/chat-messages1.json', mode='w', encoding='utf-8') as output:
+        prev_chat_data = []
+        number = 1
+        try:
+            async with aiofiles.open(f'./llm-answer/chat-messages{number}.json', mode='r', encoding='utf-8') as file:
+                content = await file.read()
+            prev_chat_data = json.loads(content)
+            prev_chat_data.append(chat_obj)
+        except:
+            number += 1
+
+        prev_chat_data.append(chat_obj)
+        async with aiofiles. open(f'./llm-answer/chat-messages{number}.json', mode='w', encoding='utf-8') as output:
             await output.write(json.dumps(prev_chat_data, ensure_ascii=False, indent=2))

+        # async with aiofiles. open(f'./llm-answer/chat-messages-answer{number}.txt', mode='a+', encoding='utf-8') as output:
+        #     await output.write(f'{chat_obj}\n+++++++++++++++++++++++++++\n')
+
+
+        full_prompt_duration = (datetime.datetime.now() - start_time).total_seconds()
+        print(f'aiofiles duration: {full_prompt_duration}')
+        print('~'*40)

         # save_result(chat_obj)

         # ایجاد آبجکت بازگشتی به فرانت
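The persistence block introduced above loads chat-messages{number}.json, appends the new chat object, and writes the list back; if the read fails (for example, no history file yet or unreadable JSON), number is bumped and a fresh list is written to the next file. A minimal synchronous sketch of that load-append-save pattern, using plain json instead of aiofiles (the function name is illustrative only):

import json

def save_chat_obj(chat_obj, number=1):
    # Load the existing history if possible, otherwise fall back to the next file number.
    try:
        with open(f'./llm-answer/chat-messages{number}.json', mode='r', encoding='utf-8') as file:
            prev_chat_data = json.loads(file.read())
    except (OSError, json.JSONDecodeError):
        number += 1
        prev_chat_data = []

    prev_chat_data.append(chat_obj)
    with open(f'./llm-answer/chat-messages{number}.json', mode='w', encoding='utf-8') as output:
        output.write(json.dumps(prev_chat_data, ensure_ascii=False, indent=2))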
@@ -886,7 +906,7 @@ async def credit_refresh():
     }
     remained_credit = requests.get(url, headers=headers)
     remained_credit_value = str(remained_credit.json()['remaining_irt'])
+    print('writing credit')
     async with aiofiles.open('./llm-answer/credit.txt', mode='a+', encoding='utf-8') as file:
         await file.write(f'{remained_credit_value}\n')
@@ -933,6 +953,9 @@ if __name__ == "__main__":
         recognized_refrences = find_refrences(llm_answer)
         llm_answer = replace_refrences(llm_answer, recognized_refrences)

+        print('-'*40)
+        print(f'replace_refrences duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')
+
         with open('./llm-answer/result.txt', mode='a+', encoding='utf-8') as file:
             result_message = f'متن پرامپت: {query.strip()}\n\nپاسخ: {llm_answer} \n----------------------------------------------------------\n'
             file.write(result_message)
@@ -941,6 +964,8 @@ if __name__ == "__main__":
             result_message = f'متن پرامپت: {query.strip()}\n\مواد مشابه: {result} \n----------------------------------------------------------\n'
             file.write(result_message)

+        print('-'*40)
+        print(f'file write duration: {(datetime.datetime.now() - end_retrive).total_seconds()}')

         print('----------------------------------------------------------')
10	_old/dockerfile	Executable file

@@ -0,0 +1,10 @@
+FROM docker.tavasi.ir/tavasi/qachat_base:1.0.0
+
+WORKDIR /src/app
+
+COPY . /src/app
+
+EXPOSE 80
+
+CMD [ "uvicorn","chatbot:chatbot","--reload","--port","80","--host=0.0.0.0"]
+
5	_old/dockerfile_base	Executable file

@@ -0,0 +1,5 @@
+FROM docker.tavasi.ir/tavasi/qachat_base:1.0.0
+RUN pip install uvicorn[standard]
+RUN pip install FlagEmbedding
+RUN pip install aiofiles
+RUN pip install openai
1	oss.py → _old/oss.py	Normal file → Executable file

@@ -10,6 +10,7 @@ LLM_URL = "http://172.16.29.102:8001/v1/"
 # 'assistant_prompt' : '',
 # }

+
 async def process_item(messages, reasoning_effort= 'medium', temperature= 0.4, top_p= 0.9, max_tokens= 2048):
     """
     generates answer with gpt-oss-120b model
15	_old/requirements.txt	Executable file

@@ -0,0 +1,15 @@
+cleantext==1.1.4
+elasticsearch7==7.17.12
+faiss_cpu==1.9.0
+fastapi==0.117.1
+hazm==0.10.0
+langchain_openai==0.3.33
+numpy==1.21.5
+openai==1.108.1
+pandas==2.3.2
+pydantic==2.11.9
+scikit_learn==1.7.2
+sentence_transformers==2.5.1
+torch==2.4.0
+torch==2.1.2
+transformers==4.55.1
3	_old/run_docker.bash	Executable file

@@ -0,0 +1,3 @@
+docker stop qachat
+docker rm qachat
+docker run --name qachat -p 2425:80 --net qachat_net --gpus=all -v ./:/src/app/ -v ./qavanin-faiss/:/src/app/qavanin-faiss/ -v ./llm-answer/:/src/app/llm-answer/ -v ./../MODELS:/src/MODELS -v ./../cache:/root/.cache/huggingface/hub -it --restart unless-stopped docker.tavasi.ir/tavasi/qachat:1.0.0
0	baleqabot/bot.log	Normal file → Executable file
0	baleqabot/requests.json	Normal file → Executable file
1	config.env	Normal file

@@ -0,0 +1 @@
+LLM_URL="http://172.16.29.102:8001/v1/"
@@ -6,5 +6,5 @@ COPY . /src/app

 EXPOSE 80

-CMD [ "uvicorn","chatbot:chatbot","--reload","--port","80","--host=0.0.0.0"]
+CMD [ "uvicorn","main:app","--reload","--port","80","--host=0.0.0.0"]

File diff suppressed because one or more lines are too long
@@ -1,743 +0,0 @@
(743 removed lines not reproduced here: the deleted file was a log of raw ChatCompletion response objects from the gpt-4o-mini and gemini-2.5-flash-lite runs, together with their estimated per-request costs in IRT, separated by dashed dividers.)
@ -1,725 +0,0 @@
[deleted file contents: 725 lines of numeric values, starting around 108315.8 and generally decreasing to about 42219.82, then from about 233219.82 down to about 198077.46; the full listing is omitted here]
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1,10 +0,0 @@
|
||||||
[0.43728234058085713, 'الف - حقوق بنیادین کار (آزادی انجمنها و حمایت از حق تشکلهای مدنی روابط کار، حق سازماندهی و مذاکره دسته\u200cجمعی، تساوی مزدها برای زن و مرد در مقابل کار هم ارزش، منع تبعیض در اشتغال و حرفه، رعایت حداقل سن کار، ممنوعیت کار کودک، رعایت حداقل مزد متناسب با حداقل معیشت).', 186639]
|
|
||||||
[0.17097510097612545, 'تبصره ۱۱ - بمنظور ایجاد مرجع صلاحیتدار و بیطرفی برای حل اختلافات بین کارگر و کارفرما و ایجاد حسن تفاهم در بین آنان و تمرکز امور مربوط بکار و مراقبت در تهیه و اجرای مقررات قانون کار و قانون بیمه کارگران و همچنین حمایت و تأمین بهداشت و رفاه و بالا بردن سطح زندگی کارگران و وضع و اجرای مقررات بیمه\u200cهای اجتماعی و برقرار نمودن روابط با تشکیلات بین\u200cالمللی کار وزارتخانه\u200cای بنام وزارت کار تأسیس می\u200cشود.', 128416]
|
|
||||||
[0.15169625817516322, 'ث) حمایت از کارگران و نمایندگان آنها در برابر اقدامات انضباطی ناشی از اعمالی که آنها مطابق سیاست موضوع ماده (۴) فوق به طور معقول انجام داده\u200cاند.', 75037]
|
|
||||||
[0.11213845051838162, 'ماده ۷ - دولتهای طرف این میثاق حق هر کس را به تمتع از شرایط عادلانه و مساعد کار که بویژه متضمن مراتب زیر باشد برسمیت بشناسند: الف - اجرتی که لااقل امور ذیل را برای کلیه کارگران تأمین نماید: ۱ - مزد منصفانه و اجرت مساوی برای کار با ارزش مساوی بدون هیچ نوع تمایز بویژه اینکه زنان تضمین داشته باشند که شرایط کار آنان پائین\u200cتر از\u200cشرایط مورد استفاده مردان نباشد و برای کار مساوی مزد مساوی با مردان دریافت دارند. ۲ - مزایای کافی برای آنان و خانواده\u200cشان طبق مقررات این میثاق: ب - ایمنی و بهداشت کار. ج - تساوی فرصت برای هر کس که بتواند در خدمت خود بمدارج مناسب عالیتری ارتقاء یابد بدون در نظر گرفتن هیچگونه ملاحظات دیگری جز\u200cطول مدت خدمت و لیاقت. د - استراحت - فراغت و محدودیت معقول ساعات کار و مرخصی اداری با استفاده از حقوق همچنین مزد ایام تعطیل رسمی.', 194273]
|
|
||||||
[0.1079329839273747, '۲۰ - بمنظور تأمین شرائط مناسب\u200cتر کار و زندگی کارگران مهاجر نسبت بشرائطی که قانون یا رویه عملی برای سایر کارگران که در خدمت متشابه\u200cاشتغال دارند مقرر داشته و همچنین برای اینکه کارگران مهاجر هم مثل کارگران دیگر مشمول مقیاسهای حمایت بنحوی که در بندهای آتی این سفارش نامه خواهد آمد گردند کلیۀ مساعی باید صورت گیرد.', 8843]
|
|
||||||
[0.0531841966906351, '۱۳ ایجاد نظام جامع تأمین اجتماعی برای حمایت از حقوق محرومان و مستضعفان و مبارزه با فقر و حمایت از نهادهای عمومی و موسسات و خیریه\u200cهای مردمی با رعایت ملاحظات دینی و انقلابی.', 213766]
|
|
||||||
[0.05166811304646011, '۲ - ایجاد نظام جامع تأمین اجتماعی برای حمایت از حقوق محرومان و مستضعفان و مبارزه با فقر و حمایت از نهاد\u200cهای عمومی و موسسات و خیریه\u200cهای مردمی با رعایت ملاحظات دینی و انقلابی.', 53933]
|
|
||||||
[0.051528153447387044, 'ج تقویت همسویی منافع کارگران و کارفرمایان و تکالیف دولت با رویکرد حمایت از تولید و سه\u200cجانبه گرایی', 185751]
|
|
||||||
[0.024949120491999023, 'ماده ۲ - ۱ - هر یک از کشورهای عضو باید بوسائلی که منطبق با روشهای معمول جهت تعیین میزان اجرت باشد اجرای اصل تساوی اجرت کارگر\u200cزن و مرد را در قبال کار هم ارزش تشویق و تا حدودی که با روشهای فوق\u200cالذکر تطبیق نماید اجرای آنرا درباره عموم کارگران تأمین کند. ۲ - اجرای این اصل ممکن است بطرق زیر صورت گیرد: الف - وضع قوانین داخلی. ب - هر روشی که جهت تعیین میزان اجرت\u200cها ضمن قوانین پیش\u200cبینی\u200cشده باشد. ج - انعقاد پیمان\u200cهای دسته\u200cجمعی بین کارفرمایان و کارگران. د - ترکیبی از این روشهای مختلف.', 204904]
|
|
||||||
[0.024270693471581787, 'ز تلاش در جهت گسترش امکانات رفاهی و حفظ حقوق قانونی کارکنان واحد.', 35580]
|
|
||||||
File diff suppressed because one or more lines are too long
59
main.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi import FastAPI ,Header
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
from routes.rag_base import router as rag_base
|
||||||
|
|
||||||
|
|
||||||
|
async def get_oss_client():
|
||||||
|
LLM_URL = "http://172.16.29.102:8001/v1/"
|
||||||
|
client = AsyncOpenAI(base_url= LLM_URL, api_key="EMPTY")  # the AsyncOpenAI constructor is synchronous and must not be awaited
|
||||||
|
return client
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def create_app() -> FastAPI:
|
||||||
|
app = FastAPI(title="qachat2 Backend", version="0.1.0")
|
||||||
|
origins = ["*"]
|
||||||
|
app.add_middleware(
|
||||||
|
CORSMiddleware,
|
||||||
|
allow_origins=origins,
|
||||||
|
allow_credentials=True,
|
||||||
|
allow_methods=["*"],
|
||||||
|
allow_headers=["*"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# app.state.settings = get_settings()
|
||||||
|
|
||||||
|
@app.on_event("startup")
|
||||||
|
async def on_startup() -> None:
|
||||||
|
print("startup app")
|
||||||
|
client = getattr(app.state, "oss_client", None)
|
||||||
|
if not client :
|
||||||
|
client = await get_oss_client()  # get_oss_client is async, so it must be awaited here
|
||||||
|
app.state.oss_client = client
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("shutdown")
|
||||||
|
async def on_shutdown() -> None:
|
||||||
|
client = getattr(app.state, "elastic_client", None)
|
||||||
|
if client is not None:
|
||||||
|
await client.close()
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
|
||||||
|
async def simple():
|
||||||
|
return "ai rag caht qanon OK"
|
||||||
|
|
||||||
|
@app.get("/ping")
|
||||||
|
async def ping():
|
||||||
|
return "ai rag caht qanon OK"
|
||||||
|
|
||||||
|
app.include_router(rag_base, prefix="")
|
||||||
|
return app
|
||||||
|
|
||||||
|
|
||||||
|
app = create_app()
|
||||||
|
|
||||||
|
|
||||||
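For reference, a minimal client-side sketch (not part of this commit) of calling the new endpoints; the host and port are assumed from the docker run command further below (host port 2425 mapped to the container's port 80), and the httpx dependency is likewise an assumption:

import asyncio
import httpx

async def ask(query: str) -> str:
    # POST the question to the /run_chat endpoint defined in routes/rag_base.py
    async with httpx.AsyncClient(base_url="http://localhost:2425", timeout=300) as client:
        resp = await client.post("/run_chat", json={"query": query})
        resp.raise_for_status()
        return resp.json()["answer"]

if __name__ == "__main__":
    print(asyncio.run(ask("نمونه پرسش")))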
1008
routes/chatbot_handler.py
Executable file
File diff suppressed because it is too large
47
routes/rag_base.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
import routes.chatbot_handler as chatbot_handler
|
||||||
|
import datetime
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
router = APIRouter(tags=["ragchat"])
|
||||||
|
# settings= get_settings()
|
||||||
|
|
||||||
|
|
||||||
|
# data model for API request payloads
|
||||||
|
class RagQueryModal(BaseModel):
|
||||||
|
query: str
|
||||||
|
|
||||||
|
|
||||||
|
async def create_chat_id():
|
||||||
|
date = str((datetime.datetime.now())).replace(' ','-').replace(':','').replace('.','-')
|
||||||
|
|
||||||
|
chat_id = f'{date}-{random.randint(100000, 999999)}'
|
||||||
|
|
||||||
|
return chat_id
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/emergency_call")
|
||||||
|
async def emergency_call(payload: RagQueryModal):
|
||||||
|
print('emergency generate answer ...')
|
||||||
|
chat_id = await create_chat_id()
|
||||||
|
answer = await chatbot_handler.ask_chatbot_avalai(payload.query, chat_id)
|
||||||
|
# print('emergency answer ...', answer)
|
||||||
|
await chatbot_handler.credit_refresh()
|
||||||
|
print('*** ... ready for next ... ***')
|
||||||
|
return {"answer": answer}
|
||||||
|
|
||||||
|
@router.post("/run_chat")
|
||||||
|
async def run_chat(payload: RagQueryModal, request: Request):
|
||||||
|
# request.state.app
|
||||||
|
print('run_chat start ...')
|
||||||
|
chat_id = await create_chat_id()
|
||||||
|
answer = await chatbot_handler.ask_chatbot(payload.query, chat_id)
|
||||||
|
print('*** ... ready for next ... ***')
|
||||||
|
|
||||||
|
return {"answer": answer}
|
||||||
|
|
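As a quick illustration (a sketch, not part of the commit) of the chat-id scheme above: create_chat_id combines the current timestamp with a six-digit random suffix, so a generated value looks like the following.

import datetime
import random

date = str(datetime.datetime.now()).replace(' ', '-').replace(':', '').replace('.', '-')
chat_id = f'{date}-{random.randint(100000, 999999)}'
print(chat_id)  # e.g. 2025-01-01-101530-123456-654321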
@ -1,3 +1,3 @@
|
||||||
docker stop qachat
|
docker stop qachat
|
||||||
docker rm qachat
|
docker rm qachat
|
||||||
docker run --name qachat -p 2425:80 --net qachat_net --gpus=all -v ./:/src/app/ -v ./qavanin-faiss/:/src/app/qavanin-faiss/ -v ./llm-answer/:/src/app/llm-answer/ -v ./../MODELS:/src/MODELS -v ./../cache:/root/.cache/huggingface/hub -it --restart unless-stopped docker.tavasi.ir/tavasi/qachat:1.0.0
|
docker run --name qachat -p 2425:80 --net qachat_net --gpus=all -v ./:/src/app/ -v ./qavanin-faiss/:/src/app/qavanin-faiss/ -v ./llm-answer/:/src/app/llm-answer/ -v ./../MODELS:/src/MODELS -v ./../cache:/root/.cache/huggingface/hub -it --restart unless-stopped docker.tavasi.ir/tavasi/qachat2:1.0.0
|
||||||
|
|
|
||||||
72
util/convert_qavanin_json_to_faiss.py
Executable file
|
|
@ -0,0 +1,72 @@
|
||||||
|
import json
|
||||||
|
import numpy as np
|
||||||
|
import faiss
|
||||||
|
import os
|
||||||
|
|
||||||
|
def create_faiss_index_from_json(json_file_path, faiss_index_path, metadata_file_path):
|
||||||
|
print(f'try to read {json_file_path} ...')
|
||||||
|
# --- 1. Load the data from JSON ---
|
||||||
|
with open(json_file_path, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
print(f'file reading finished')
|
||||||
|
|
||||||
|
# Each element is assumed to contain the following fields:
|
||||||
|
# {
|
||||||
|
# "speech_title": "title",
|
||||||
|
# "sentence": "متن جمله",
|
||||||
|
# "embeddings": [0.12, 0.34, ...]
|
||||||
|
# }
|
||||||
|
|
||||||
|
sentences = []
|
||||||
|
titles = []
|
||||||
|
embeddings_list = []
|
||||||
|
prefix_list = []
|
||||||
|
for k, item in data.items():
|
||||||
|
sentences.append(item['content'])
|
||||||
|
titles.append(item['id'])
|
||||||
|
embeddings_list.append(item['embeddings'])
|
||||||
|
prefix_list.append(item['section-prefix'])
|
||||||
|
|
||||||
|
embeddings = np.array(embeddings_list).astype('float32') # shape: (n, d)
|
||||||
|
dimension = embeddings.shape[1]
|
||||||
|
|
||||||
|
print(f"Loaded {len(embeddings)} embeddings with dimension {dimension}")
|
||||||
|
|
||||||
|
# --- 2. Build the FAISS index (GPU) ---
|
||||||
|
# If only a CPU is available, use faiss.IndexFlatL2.
|
||||||
|
# If a GPU is available, build the index on the CPU first and then move it to the GPU.
|
||||||
|
cpu_index = faiss.IndexFlatL2(dimension) # L2 (Euclidean) distance metric
|
||||||
|
|
||||||
|
# Move the index to the GPU
|
||||||
|
if faiss.get_num_gpus() > 0:
|
||||||
|
print("Using GPU for FAISS index...")
|
||||||
|
res = faiss.StandardGpuResources()
|
||||||
|
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
|
||||||
|
else:
|
||||||
|
print("GPU not available, using CPU.")
|
||||||
|
gpu_index = cpu_index
|
||||||
|
|
||||||
|
# --- 3. Add the vectors to the index ---
|
||||||
|
gpu_index.add(embeddings)
|
||||||
|
print(f"Total vectors indexed: {gpu_index.ntotal}")
|
||||||
|
|
||||||
|
# --- 4. Save the index to a file ---
|
||||||
|
# The index must be moved back to the CPU before it can be saved
|
||||||
|
final_index = faiss.index_gpu_to_cpu(gpu_index) if isinstance(gpu_index, faiss.Index) and faiss.get_num_gpus() > 0 else gpu_index
|
||||||
|
os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
|
||||||
|
faiss.write_index(final_index, faiss_index_path)
|
||||||
|
print(f"FAISS index saved to {faiss_index_path}")
|
||||||
|
|
||||||
|
# --- 5. Save the metadata (to map search results back to the source text) ---
|
||||||
|
metadata = [{"id": id, "content": c, 'prefix': p} for id, c, p in zip(titles, sentences,prefix_list)]
|
||||||
|
with open(metadata_file_path, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(metadata, f, ensure_ascii=False, indent=2)
|
||||||
|
print(f"Metadata saved to {metadata_file_path}")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
# example usage of the method
|
||||||
|
json_file_path = '../majles-output/sections-vec-285k.json'
|
||||||
|
faiss_index_path = '../data/qavanin-faiss/faiss_index_qavanin_285k.index'
|
||||||
|
metadata_file_path = '../data/qavanin-faiss/faiss_index_qavanin_285k_metadata.json'
|
||||||
|
|
||||||
|
create_faiss_index_from_json(json_file_path, faiss_index_path, metadata_file_path)
|
||||||
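A minimal sketch of querying the saved index afterwards; the paths mirror the ones above, the query vector is assumed to come from the same embedder used at indexing time, and the search helper itself is illustrative rather than part of the commit:

import json
import numpy as np
import faiss

index = faiss.read_index('../data/qavanin-faiss/faiss_index_qavanin_285k.index')
with open('../data/qavanin-faiss/faiss_index_qavanin_285k_metadata.json', encoding='utf-8') as f:
    metadata = json.load(f)

def search(query_embedding, top_k=5):
    # query_embedding: a float32 vector with the same dimension as the indexed embeddings
    vec = np.asarray(query_embedding, dtype='float32').reshape(1, -1)
    distances, ids = index.search(vec, top_k)  # L2 distances: smaller means closer
    return [(float(d), metadata[i]['id'], metadata[i]['content']) for d, i in zip(distances[0], ids[0])]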
2
util/docker_build.bash
Executable file
|
|
@ -0,0 +1,2 @@
|
||||||
|
sudo docker build -t docker.tavasi.ir/tavasi/qachat_base:1.0.0 -f dockerfile_base .
|
||||||
|
sudo docker build -t docker.tavasi.ir/tavasi/qachat2:1.0.0 .
|
||||||
677
util/elastic_helper.py
Executable file
|
|
@ -0,0 +1,677 @@
|
||||||
|
import zipfile
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from time import sleep
|
||||||
|
from elasticsearch7 import Elasticsearch,helpers
|
||||||
|
|
||||||
|
class ElasticHelper():
|
||||||
|
|
||||||
|
counter = 0
|
||||||
|
total = 0
|
||||||
|
id = ""
|
||||||
|
path_mappings = os.getcwd() + '/repo/_other/'
|
||||||
|
|
||||||
|
def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings = ""):
|
||||||
|
|
||||||
|
if path_mappings :
|
||||||
|
self.path_mappings = path_mappings
|
||||||
|
|
||||||
|
if es_pass == '' :
|
||||||
|
self.es = Elasticsearch(es_url)
|
||||||
|
else:
|
||||||
|
self.es = Elasticsearch(
|
||||||
|
es_url,
|
||||||
|
http_auth=(es_user, es_pass),
|
||||||
|
)
|
||||||
|
|
||||||
|
# print(es_url)
|
||||||
|
# print(self.es)
|
||||||
|
|
||||||
|
self.success_connect = False
|
||||||
|
for a in range(0,10):
|
||||||
|
try :
|
||||||
|
if not self.es.ping():
|
||||||
|
print('elastic not ping, sleep 5 s : ', a)
|
||||||
|
sleep(5)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
self.success_connect = True
|
||||||
|
break
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
break
|
||||||
|
if not self.success_connect :
|
||||||
|
print('******','not access to elastic service')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
self.counter = 0
|
||||||
|
self.total = 0
|
||||||
|
self.id = ""
|
||||||
|
|
||||||
|
|
||||||
|
def get_doctument(self, index_name, id):
|
||||||
|
res = self.es.get(index=index_name, id=id)
|
||||||
|
return res
|
||||||
|
|
||||||
|
def exist_doctument(self, index_name, id):
|
||||||
|
res = self.es.exists(index=index_name, id=id)
|
||||||
|
return res
|
||||||
|
|
||||||
|
def update_index_doc(self, is_update_state, index_name_o, eid, data):
|
||||||
|
if is_update_state:
|
||||||
|
resp = self.es.update(index=index_name_o, id=eid, doc=data)
|
||||||
|
# resp = self.es.update(index=index_name_o, id=eid, body={'doc':data})
|
||||||
|
else:
|
||||||
|
resp = self.es.index(index=index_name_o, id=eid, document=data)
|
||||||
|
return resp
|
||||||
|
|
||||||
|
|
||||||
|
def exportToJsonForAI(self, path_back, index_name, out_name= '', body={}, fields=[]) :
|
||||||
|
print('*' * 50, ' start backup -->', index_name)
|
||||||
|
self.counter = 0
|
||||||
|
sid = None
|
||||||
|
|
||||||
|
out = out_name
|
||||||
|
if out_name == '' :
|
||||||
|
out = index_name
|
||||||
|
|
||||||
|
fout = open( path_back + "/"+ out + '.json', 'a+' , encoding='utf-8')
|
||||||
|
|
||||||
|
s_res = self.es.search(
|
||||||
|
index=index_name,
|
||||||
|
scroll='5m',
|
||||||
|
size=1000,
|
||||||
|
body=body
|
||||||
|
)
|
||||||
|
self.total = s_res["hits"]["total"]['value']
|
||||||
|
|
||||||
|
print('start index = %s' % index_name)
|
||||||
|
print('total = %d' % self.total)
|
||||||
|
|
||||||
|
sid = s_res['_scroll_id']
|
||||||
|
scroll_size = len(s_res['hits']['hits'])
|
||||||
|
file_count = 1
|
||||||
|
out_json = []
|
||||||
|
while scroll_size > 0:
|
||||||
|
"Scrolling..."
|
||||||
|
self.counter += scroll_size
|
||||||
|
print("progress -> %.2f %%" % ((self.counter / self.total)*100))
|
||||||
|
#############################
|
||||||
|
for item in s_res['hits']['hits']:
|
||||||
|
|
||||||
|
if fields :
|
||||||
|
item2={}
|
||||||
|
item2['id']=item['_id']
|
||||||
|
for kf in fields :
|
||||||
|
#print(kf)
|
||||||
|
if kf in item['_source'] :
|
||||||
|
# print(item['_source'][kf])
|
||||||
|
item2[kf] = item['_source'][kf]
|
||||||
|
#exit()
|
||||||
|
else :
|
||||||
|
item2=item
|
||||||
|
|
||||||
|
out_json.append(item2)
|
||||||
|
|
||||||
|
|
||||||
|
s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
|
||||||
|
sid = s_res['_scroll_id']
|
||||||
|
scroll_size = len(s_res['hits']['hits'])
|
||||||
|
|
||||||
|
sid = None
|
||||||
|
text = json.dumps(out_json, ensure_ascii=False)
|
||||||
|
fout.write(text)
|
||||||
|
|
||||||
|
##############################
|
||||||
|
|
||||||
|
def backupIndexToZipfile(self, path_back, index_name, out_name= '', body={}, byzip = True, fields=[], noFields=[]) :
|
||||||
|
print('*' * 50, ' start backup -->', index_name)
|
||||||
|
self.counter = 0
|
||||||
|
sid = None
|
||||||
|
|
||||||
|
out = out_name
|
||||||
|
if out_name == '' :
|
||||||
|
out = index_name
|
||||||
|
|
||||||
|
|
||||||
|
if body == {} :
|
||||||
|
s_res = self.es.search(
|
||||||
|
index=index_name,
|
||||||
|
scroll='5m',
|
||||||
|
size=1000
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
s_res = self.es.search(
|
||||||
|
index=index_name,
|
||||||
|
scroll='5m',
|
||||||
|
size=1000,
|
||||||
|
body=body
|
||||||
|
)
|
||||||
|
|
||||||
|
self.total = s_res["hits"]["total"]['value']
|
||||||
|
if self.total == 0 :
|
||||||
|
print('total index_name by query = %d' % self.total)
|
||||||
|
return False
|
||||||
|
|
||||||
|
if byzip:
|
||||||
|
fout = zipfile.ZipFile(path_back + "/"+ out + '.zip', 'w')
|
||||||
|
else:
|
||||||
|
fout = open( path_back + "/"+ out + '.json', 'a+' , encoding='utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
print('start index = %s' % index_name)
|
||||||
|
print('total = %d' % self.total)
|
||||||
|
|
||||||
|
sid = s_res['_scroll_id']
|
||||||
|
scroll_size = len(s_res['hits']['hits'])
|
||||||
|
file_count = 1
|
||||||
|
while scroll_size > 0:
|
||||||
|
"Scrolling..."
|
||||||
|
self.counter += scroll_size
|
||||||
|
print("progress -> %.2f %%" % ((self.counter / self.total)*100))
|
||||||
|
#############################
|
||||||
|
out_json = []
|
||||||
|
for item in s_res['hits']['hits']:
|
||||||
|
if fields :
|
||||||
|
item2={}
|
||||||
|
item2['id']=item['_id']
|
||||||
|
item2['_source']={}
|
||||||
|
for kf in fields :
|
||||||
|
if kf in item['_source'] :
|
||||||
|
item2['_source'][kf] = item['_source'][kf]
|
||||||
|
else :
|
||||||
|
item2=item
|
||||||
|
|
||||||
|
if noFields :
|
||||||
|
for kf in noFields :
|
||||||
|
if kf in item2['_source']:
|
||||||
|
del item2['_source'][kf]
|
||||||
|
|
||||||
|
|
||||||
|
out_json.append(item2)
|
||||||
|
|
||||||
|
|
||||||
|
text = json.dumps(out_json, ensure_ascii=False)
|
||||||
|
out_json = []
|
||||||
|
if byzip:
|
||||||
|
filename = out + str(file_count) + '.json'
|
||||||
|
file_count +=1
|
||||||
|
fout.writestr(filename, text.encode('utf-8'), zipfile.ZIP_DEFLATED )
|
||||||
|
else:
|
||||||
|
fout.write(text)
|
||||||
|
|
||||||
|
##############################
|
||||||
|
s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
|
||||||
|
sid = s_res['_scroll_id']
|
||||||
|
scroll_size = len(s_res['hits']['hits'])
|
||||||
|
sid = None
|
||||||
|
fout.close()
|
||||||
|
|
||||||
|
|
||||||
|
def restorFileToElastic(self, path_back, index_name, app_key = '', queryDelete = True, map_name='') :
|
||||||
|
if not os.path.exists(path_back) :
|
||||||
|
print(' **** error *** path not exist: ', path_back)
|
||||||
|
return False
|
||||||
|
|
||||||
|
file_path = path_back + '/' + index_name + '.zip'
|
||||||
|
if not os.path.exists(file_path ) :
|
||||||
|
return False
|
||||||
|
|
||||||
|
if queryDelete :
|
||||||
|
# If the index already exists, ask the user whether to delete it
|
||||||
|
if self.deleteIndex(index_name) :
|
||||||
|
self.createIndex(index_name, app_key, map_name)
|
||||||
|
self.zipFileToElastic(file_path, index_name)
|
||||||
|
else : # if the index already exists, skip it and do nothing
|
||||||
|
self.createIndex(index_name, app_key, map_name)
|
||||||
|
self.zipFileToElastic(file_path, index_name)
|
||||||
|
|
||||||
|
def restorFileToElastic2(self, path_file, index_name, app_key = '', queryDelete = True, map_name='') :
|
||||||
|
if not os.path.exists(path_file) :
|
||||||
|
print(' **** error *** path not exist: ', path_file)
|
||||||
|
return False
|
||||||
|
|
||||||
|
file_path = path_file
|
||||||
|
if not os.path.exists(file_path ) :
|
||||||
|
return False
|
||||||
|
|
||||||
|
if queryDelete :
|
||||||
|
# If the index already exists, ask the user whether to delete it
|
||||||
|
if self.deleteIndex(index_name) :
|
||||||
|
self.createIndex(index_name, app_key, map_name)
|
||||||
|
self.zipFileToElastic(file_path, index_name)
|
||||||
|
else : # if the index already exists, skip it and do nothing
|
||||||
|
self.createIndex(index_name, app_key, map_name)
|
||||||
|
self.zipFileToElastic(file_path, index_name)
|
||||||
|
|
||||||
|
|
||||||
|
def renameElasticIndex(self, index_name_i, index_name_o, app_key = '', map_name='') :
|
||||||
|
|
||||||
|
if self.createIndex(index_name_o, app_key, map_name) :
|
||||||
|
res = self.es.reindex(
|
||||||
|
body={
|
||||||
|
"source": {"index": index_name_i},
|
||||||
|
"dest": {"index": index_name_o}
|
||||||
|
},
|
||||||
|
wait_for_completion=False)
|
||||||
|
|
||||||
|
print(type(res))
|
||||||
|
print(res)
|
||||||
|
|
||||||
|
taskid = res["task"] if res["task"] else ""
|
||||||
|
#tasks = client.TasksClient(self.es)
|
||||||
|
tasks = self.es.tasks
|
||||||
|
while True :
|
||||||
|
res = tasks.get(task_id = taskid)
|
||||||
|
if res["completed"] :
|
||||||
|
break
|
||||||
|
|
||||||
|
# print( res["task"])
|
||||||
|
print( '----', index_name_o, ' imported : ', res["task"]["status"]["total"] , ' / ', res["task"]["status"]["created"])
|
||||||
|
sleep(1)
|
||||||
|
print( '----', index_name_o, ' completed')
|
||||||
|
|
||||||
|
|
||||||
|
def deleteIndex(self, index_name) :
|
||||||
|
if not self.es.indices.exists(index=index_name) :
|
||||||
|
print(' ' * 10, " for delete NOT exist index :", index_name )
|
||||||
|
return True
|
||||||
|
|
||||||
|
question = 'Is DELETE elastic index (' + index_name +') ? '
|
||||||
|
if self.query_yes_no(question) :
|
||||||
|
self.es.indices.delete(index = index_name)
|
||||||
|
print('%' * 10 , " Finish DELETE index :", index_name )
|
||||||
|
return True
|
||||||
|
else :
|
||||||
|
return False
|
||||||
|
|
||||||
|
def query_yes_no(self, question, default="no"):
|
||||||
|
valid = { "yes": True, "y": True, "ye": True, "no": False, "n": False }
|
||||||
|
if default is None:
|
||||||
|
prompt = " [y/n] "
|
||||||
|
elif default == "yes":
|
||||||
|
prompt = " [Y/n] "
|
||||||
|
elif default == "no":
|
||||||
|
prompt = " [y/N] "
|
||||||
|
else:
|
||||||
|
raise ValueError("invalid default answer: '%s'" % default)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
print('%'*10, ' quistion ', '%'*10 , '\n')
|
||||||
|
sys.stdout.write(question + prompt)
|
||||||
|
choice = input().lower()
|
||||||
|
if default is not None and choice == "":
|
||||||
|
return valid[default]
|
||||||
|
elif choice in valid:
|
||||||
|
return valid[choice]
|
||||||
|
else:
|
||||||
|
sys.stdout.write("لطفا یکی از موارد روبرو را وارد کنید : 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||||
|
|
||||||
|
def createIndexIfNotExist(self, index_name_o, mapping_o=""):
|
||||||
|
try:
|
||||||
|
if not self.es.indices.exists(index=index_name_o):
|
||||||
|
response = self.es.indices.create(index=index_name_o, body=mapping_o)
|
||||||
|
# print out the response:
|
||||||
|
print("create index response:", response)
|
||||||
|
except:
|
||||||
|
print("....... index exist ! ... not created")
|
||||||
|
|
||||||
|
|
||||||
|
def createIndex(self, index_name, app_key='', map_name=''):
|
||||||
|
|
||||||
|
path_base = self.path_mappings
|
||||||
|
path_mapping1 = path_base + 'general/'
|
||||||
|
if app_key == '' :
|
||||||
|
app_key = 'tavasi'
|
||||||
|
path_mapping2 = path_base + app_key + '/'
|
||||||
|
|
||||||
|
|
||||||
|
if map_name == '':
|
||||||
|
map_name = index_name
|
||||||
|
|
||||||
|
if self.es.indices.exists(index=index_name) :
|
||||||
|
print("============== exist index :", index_name )
|
||||||
|
return True
|
||||||
|
|
||||||
|
if map_name == 'mj_rg_section' or map_name == 'semantic_search' :
|
||||||
|
map_name = 'mj_qa_section'
|
||||||
|
elif map_name[-3:] == '_ai':
|
||||||
|
map_name = map_name[:-3]  # strip the '_ai' suffix
|
||||||
|
print(map_name)
|
||||||
|
|
||||||
|
mapping_file_path = path_mapping1 + map_name + '.json'
|
||||||
|
print("mapping_file_path : " , mapping_file_path)
|
||||||
|
if not os.path.isfile(mapping_file_path):
|
||||||
|
if not os.path.isfile(mapping_file_path):
|
||||||
|
mapping_file_path = path_mapping2 + map_name + '.json'
|
||||||
|
|
||||||
|
print("mapping_file_path : " , mapping_file_path)
|
||||||
|
|
||||||
|
# Create Index With Mapping
|
||||||
|
if os.path.isfile(mapping_file_path):
|
||||||
|
mapping_file = open( mapping_file_path,'r', encoding='utf-8' )
|
||||||
|
mapping_file_read = mapping_file.read()
|
||||||
|
mapping_data = json.loads(mapping_file_read)
|
||||||
|
mapping_file.close()
|
||||||
|
if self.es.indices.exists(index=index_name) :
|
||||||
|
print("============== exist index :", index_name )
|
||||||
|
else :
|
||||||
|
self.es.indices.create(index = index_name , body = mapping_data)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print('*** error not find maping file elastic : *******', mapping_file_path)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def updateBulkList(self, listData, index_name):
|
||||||
|
chunk_size=1000
|
||||||
|
raise_on_error=False
|
||||||
|
raise_on_exception=False
|
||||||
|
stats_only=True
|
||||||
|
yield_ok = False
|
||||||
|
|
||||||
|
actions=[]
|
||||||
|
for item in listData:
|
||||||
|
actions.append({
|
||||||
|
"_op_type": "update",
|
||||||
|
"_index": index_name,
|
||||||
|
"_id" : item['_id'],
|
||||||
|
"doc": item['_source']
|
||||||
|
}
|
||||||
|
)
|
||||||
|
helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok )
|
||||||
|
|
||||||
|
def importBulkList(self, listData, index_name):
|
||||||
|
chunk_size=100000
|
||||||
|
raise_on_error=False
|
||||||
|
raise_on_exception=False
|
||||||
|
stats_only=True
|
||||||
|
yield_ok = False
|
||||||
|
|
||||||
|
for item in listData:
|
||||||
|
actions = [{
|
||||||
|
"_op_type": "index",
|
||||||
|
"_index": index_name,
|
||||||
|
"_id" : item['_id'],
|
||||||
|
"_source": item['_source']
|
||||||
|
}
|
||||||
|
]
|
||||||
|
helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok )
|
||||||
|
|
||||||
|
|
||||||
|
def importJsonDataToElastic(self, jsonData, index_name, fields=[]):
|
||||||
|
chunk_size=1000
|
||||||
|
raise_on_error=False
|
||||||
|
raise_on_exception=False
|
||||||
|
stats_only=True
|
||||||
|
yield_ok = False
|
||||||
|
|
||||||
|
actions=[]
|
||||||
|
|
||||||
|
for item in jsonData:
|
||||||
|
id = item.get('_id') or item.get('id')
|
||||||
|
source = item['_source']
|
||||||
|
if fields :
|
||||||
|
source = {}
|
||||||
|
for col in fields :
|
||||||
|
if col in item['_source'] :
|
||||||
|
source[col] = item['_source'][col]
|
||||||
|
|
||||||
|
|
||||||
|
actions.append({
|
||||||
|
"_op_type": "index",
|
||||||
|
"_index": index_name,
|
||||||
|
"_id" : id,
|
||||||
|
"_source": source
|
||||||
|
})
|
||||||
|
helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok )
|
||||||
|
|
||||||
|
|
||||||
|
def fileToElastic(self, file_path, index_name, limit_pack = -1, fields=[]):
|
||||||
|
if not os.path.exists(file_path):
|
||||||
|
print("file zip:" , file_path , " not exist")
|
||||||
|
return
|
||||||
|
print("index:" , index_name , '=>' , file_path )
|
||||||
|
self.counter = 0
|
||||||
|
with open(file_path) as file:
|
||||||
|
data = json.loads(file.read())
|
||||||
|
self.importJsonDataToElastic(data, index_name, fields)
|
||||||
|
|
||||||
|
self.es.indices.refresh(index=index_name)
|
||||||
|
print(self.es.cat.count(index=index_name, format="json"))
|
||||||
|
|
||||||
|
def zipFileToElastic(self, file_path, index_name, limit_pack = -1, fields=[]):
|
||||||
|
if not os.path.exists(file_path):
|
||||||
|
print("file zip:" , file_path , " not exist for imort to elastic : ", index_name )
|
||||||
|
return
|
||||||
|
|
||||||
|
fileNo = 0
|
||||||
|
with zipfile.ZipFile(file_path, 'r') as zObject:
|
||||||
|
fileNo +=1
|
||||||
|
print("="*10, " zip fileNo: " , fileNo ," - ( ", index_name," ) | File Numbers:" ,len(zObject.namelist()) , "=" * 10)
|
||||||
|
|
||||||
|
packNo = 0
|
||||||
|
self.counter = 0
|
||||||
|
for filename in zObject.namelist():
|
||||||
|
packNo += 1
|
||||||
|
if limit_pack != -1 :
|
||||||
|
if packNo > limit_pack :
|
||||||
|
print('limit_data ', index_name, ' ', limit_pack)
|
||||||
|
break
|
||||||
|
|
||||||
|
print("index:" , index_name , '=>' , filename )
|
||||||
|
with zObject.open(filename) as file:
|
||||||
|
data = json.loads(file.read())
|
||||||
|
self.importJsonDataToElastic(data, index_name, fields)
|
||||||
|
|
||||||
|
self.es.indices.refresh(index=index_name)
|
||||||
|
print(self.es.cat.count(index=index_name, format="json"))
|
||||||
|
print(" END Of Import to elastic ", index_name ,"\n")
|
||||||
|
|
||||||
|
|
||||||
|
def iterateJsonFile(self, file_path, isZip=True, limit_pack = -1):
|
||||||
|
if not os.path.exists(file_path):
|
||||||
|
print("file zip:" , file_path , " not exist iterateJsonFile " )
|
||||||
|
return
|
||||||
|
|
||||||
|
if isZip :
|
||||||
|
fileNo = 0
|
||||||
|
with zipfile.ZipFile(file_path, 'r') as zObject:
|
||||||
|
fileNo +=1
|
||||||
|
print("="*10, " zip fileNo: " , fileNo ," iterateJsonFile - | File Numbers:" ,len(zObject.namelist()) , "=" * 10)
|
||||||
|
|
||||||
|
packNo = 0
|
||||||
|
self.counter = 0
|
||||||
|
for filename in zObject.namelist():
|
||||||
|
packNo += 1
|
||||||
|
if limit_pack != -1 :
|
||||||
|
if packNo > limit_pack :
|
||||||
|
print('limit_data iterateJsonFile ', limit_pack)
|
||||||
|
break
|
||||||
|
|
||||||
|
print("index iterateJsonFile :", '=>' , filename )
|
||||||
|
with zObject.open(filename) as file:
|
||||||
|
data = json.loads(file.read())
|
||||||
|
# Yield each entry
|
||||||
|
# yield data
|
||||||
|
yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
|
||||||
|
else :
|
||||||
|
with open(file_path, 'r', encoding='utf-8') as file:
|
||||||
|
data = json.loads(file.read())
|
||||||
|
# Yield each entry
|
||||||
|
# yield from (hit for hit in data)
|
||||||
|
#return data
|
||||||
|
yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
|
||||||
|
|
||||||
|
|
||||||
|
def es_iterate_all_documents(self, index, body="", pagesize=250, scroll_timeout="25m", **kwargs):
|
||||||
|
"""
|
||||||
|
Helper to iterate ALL values from a single index
|
||||||
|
Yields all the documents.
|
||||||
|
"""
|
||||||
|
is_first = True
|
||||||
|
while True:
|
||||||
|
# Scroll next
|
||||||
|
if is_first: # Initialize scroll
|
||||||
|
# result = self.es.search(index=index, scroll="2m", **kwargs, body={
|
||||||
|
# "size": pagesize
|
||||||
|
# })
|
||||||
|
if body :
|
||||||
|
result = self.es.search(
|
||||||
|
index=index,
|
||||||
|
scroll=scroll_timeout,
|
||||||
|
**kwargs,
|
||||||
|
size=pagesize,
|
||||||
|
body=body
|
||||||
|
)
|
||||||
|
else :
|
||||||
|
result = self.es.search(
|
||||||
|
index=index,
|
||||||
|
scroll=scroll_timeout,
|
||||||
|
**kwargs,
|
||||||
|
size=pagesize
|
||||||
|
)
|
||||||
|
|
||||||
|
self.total = result["hits"]["total"]["value"]
|
||||||
|
if self.total > 0:
|
||||||
|
print("total = %d" % self.total)
|
||||||
|
is_first = False
|
||||||
|
else:
|
||||||
|
# result = es.scroll(body={
|
||||||
|
# "scroll_id": scroll_id,
|
||||||
|
# "scroll": scroll_timeout
|
||||||
|
# })
|
||||||
|
result = self.es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
|
||||||
|
|
||||||
|
scroll_id = result["_scroll_id"]
|
||||||
|
hits = result["hits"]["hits"]
|
||||||
|
self.counter += len(hits)
|
||||||
|
if self.total > 0 :
|
||||||
|
print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
|
||||||
|
# Stop after no more docs
|
||||||
|
if not hits:
|
||||||
|
break
|
||||||
|
# Yield each entry
|
||||||
|
yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
|
||||||
|
|
||||||
|
|
||||||
|
def moveCustomFileds(self, index_name_i, index_name_o, fields=[], renameFileds={}):
|
||||||
|
try:
|
||||||
|
body = {}
|
||||||
|
list = []
|
||||||
|
try:
|
||||||
|
list = self.es_iterate_all_documents(index_name_i)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for mentry in list:
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
entry = mentry["source"]
|
||||||
|
id = mentry["id"]
|
||||||
|
# print(id)
|
||||||
|
eid = id
|
||||||
|
|
||||||
|
if (count % 100) == 0 :
|
||||||
|
print("%s -> %.2f " % (id , (count / self.total) if self.total > 0 else 0))
|
||||||
|
|
||||||
|
data_filled = False
|
||||||
|
data = {}
|
||||||
|
for col in fields:
|
||||||
|
|
||||||
|
if '.' in col :
|
||||||
|
cols = col.split('.')
|
||||||
|
subsource = entry
|
||||||
|
for sub in cols :
|
||||||
|
dCol = subsource.get(sub, None)
|
||||||
|
if dCol :
|
||||||
|
subsource = dCol
|
||||||
|
else :
|
||||||
|
break
|
||||||
|
else :
|
||||||
|
dCol = entry.get(col, None)
|
||||||
|
|
||||||
|
if dCol is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if col in renameFileds :
|
||||||
|
data[renameFileds[col]] = dCol
|
||||||
|
else:
|
||||||
|
data[col] = dCol
|
||||||
|
|
||||||
|
data_filled = True
|
||||||
|
|
||||||
|
if not data_filled :
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = self.update_index_doc(True, index_name_o, eid, data)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
# save_error(id, e)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# print("1111")
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
# save_error(id, e)
|
||||||
|
|
||||||
|
def mappingIndex(self, index_name_i):
|
||||||
|
# The mapping can only be changed through Kibana
|
||||||
|
|
||||||
|
# it cannot be done from Python
|
||||||
|
# a new index with the desired mapping must be created and the data reindexed into it
|
||||||
|
pass
|
||||||
|
|
||||||
|
def updateByQueryIndex(self, index_name_i, body):
|
||||||
|
## sample
|
||||||
|
# body = {
|
||||||
|
# "script": {
|
||||||
|
# "inline": "ctx._source.Device='Test'",
|
||||||
|
# "lang": "painless"
|
||||||
|
# },
|
||||||
|
# "query": {
|
||||||
|
# "match": {
|
||||||
|
# "Device": "Boiler"
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
try:
|
||||||
|
self.es.update_by_query(body=body, index=index_name_i)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
# save_error(id, e)
|
||||||
|
|
||||||
|
|
||||||
|
def deleteByQueryIndex(self, index_name_i, body):
|
||||||
|
## sample
|
||||||
|
# body = {
|
||||||
|
# "query": {
|
||||||
|
# "match": {
|
||||||
|
# "Device": "Boiler"
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
try:
|
||||||
|
self.es.delete_by_query(index=index_name_i, body=body )
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
# save_error(id, e)
|
||||||
|
|
||||||
|
def delete_by_ids(self, index_name_i, ids):
|
||||||
|
try:
|
||||||
|
# ids = ['test1', 'test2', 'test3']
|
||||||
|
|
||||||
|
query = {"query": {"terms": {"_id": ids}}}
|
||||||
|
res = self.es.delete_by_query(index=index_name_i, body=query)
|
||||||
|
print(res)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
# save_error(id, e)
|
||||||
|
|
||||||
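A short usage sketch for the helper above; the URL and credentials are the constructor defaults shown in the code, the index name and backup path are placeholders, and this snippet is not part of the commit:

from util.elastic_helper import ElasticHelper

es_helper = ElasticHelper(es_url="http://127.0.0.1:6900", es_user="elastic", es_pass="")
if es_helper.success_connect:
    # dump an index to ./backup/<index_name>.zip, then restore it into Elasticsearch again
    es_helper.backupIndexToZipfile("./backup", "mj_qa_section")
    es_helper.restorFileToElastic("./backup", "mj_qa_section", app_key="tavasi")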
681
util/embedder_sbert_qavanin_285k.py
Executable file
|
|
@ -0,0 +1,681 @@
|
||||||
|
# !pip install hazm
|
||||||
|
# !pip install transformers==4.26.0
|
||||||
|
# !pip install --upgrade numpy
|
||||||
|
# !pip install --upgrade sentence-transformers
|
||||||
|
"""
|
||||||
|
Persian Sentence Processing and Vector Analysis
|
||||||
|
==============================================
|
||||||
|
|
||||||
|
This script processes Persian sentences from a JSON file and performs:
|
||||||
|
1. Word extraction and preprocessing
|
||||||
|
2. Vector representation using multilingual transformer
|
||||||
|
3. Similarity analysis for key words
|
||||||
|
4. Dimensionality reduction to 3D
|
||||||
|
5. 3D visualization with Persian labels
|
||||||
|
|
||||||
|
Author: NLP Expert Assistant
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from typing import List, Dict, Tuple, Set
|
||||||
|
from collections import Counter
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# NLP and ML libraries
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn.manifold import TSNE
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
#from normalizer import cleaning
|
||||||
|
try:
|
||||||
|
from util.elastic_helper import ElasticHelper
|
||||||
|
except Exception as error:
|
||||||
|
eee = error
|
||||||
|
pass
|
||||||
|
# Visualization libraries
|
||||||
|
# import matplotlib.pyplot as plt
|
||||||
|
# import plotly.graph_objects as go
|
||||||
|
# import plotly.express as px
|
||||||
|
# from plotly.subplots import make_subplots
|
||||||
|
|
||||||
|
# Persian text processing
|
||||||
|
# import hazm
|
||||||
|
# from hazm import Normalizer, word_tokenize, POSTagger
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class PersianVectorAnalyzer:
|
||||||
|
"""
|
||||||
|
A comprehensive class for Persian text processing and vector analysis.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
|
||||||
|
"""
|
||||||
|
Initialize the analyzer with the specified model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model_name: The sentence transformer model to use
|
||||||
|
"""
|
||||||
|
self.model_name = model_name
|
||||||
|
self.model = None
|
||||||
|
#self.normalizer = Normalizer()
|
||||||
|
self.stop_words = self._load_persian_stop_words()
|
||||||
|
self.key_words = [
|
||||||
|
"خدا", "بنده", "جهاد", "ولی", "زکات",
|
||||||
|
"نماز", "صبر", "عبادت", "ولایت", "خلافت","پیامبر"
|
||||||
|
]
|
||||||
|
|
||||||
|
logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")
|
||||||
|
|
||||||
|
def _load_persian_stop_words(self) -> Set[str]:
|
||||||
|
"""
|
||||||
|
Load Persian stop words.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Set of Persian stop words
|
||||||
|
"""
|
||||||
|
# Common Persian stop words
|
||||||
|
stop_words = {
|
||||||
|
'و', 'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'برای', 'تا',
|
||||||
|
'را', 'هم', 'یا', 'اما', 'اگر', 'چون', 'چرا', 'چگونه', 'کجا',
|
||||||
|
'چه', 'کی', 'چند', 'چقدر', 'همه', 'هیچ', 'بعضی', 'هر', 'همه',
|
||||||
|
'خود', 'خویش', 'ما', 'شما', 'آنها', 'ایشان', 'اینها', 'آنها',
|
||||||
|
'من', 'تو', 'او', 'ما', 'شما', 'آنها', 'ایشان', 'اینها',
|
||||||
|
'است', 'هست', 'بود', 'شد', 'می', 'باید', 'خواهد', 'دارد',
|
||||||
|
'کرد', 'شد', 'بود', 'هست', 'است', 'میشود', 'میکند',
|
||||||
|
'یک', 'دو', 'سه', 'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده',
|
||||||
|
'اول', 'دوم', 'سوم', 'چهارم', 'پنجم', 'ششم', 'هفتم', 'هشتم', 'نهم', 'دهم',
|
||||||
|
'سال', 'ماه', 'روز', 'هفته', 'ساعت', 'دقیقه', 'ثانیه', 'پس',
|
||||||
|
'بله', 'نه', 'آری', 'خیر', 'بلی', 'نخیر',
|
||||||
|
'حالا', 'الان', 'امروز', 'دیروز', 'فردا', 'هفته', 'ماه', 'سال',
|
||||||
|
'بالا', 'پایین', 'چپ', 'راست', 'جلو', 'عقب', 'داخل', 'خارج',
|
||||||
|
'بزرگ', 'کوچک', 'بلند', 'کوتاه', 'پهن', 'باریک', 'ضخیم', 'نازک',
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
return stop_words
|
||||||
|
|
||||||
|
def load_model(self):
|
||||||
|
"""
|
||||||
|
Load the sentence transformer model.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.info("Loading sentence transformer model...")
|
||||||
|
self.model = SentenceTransformer(self.model_name)
|
||||||
|
logger.info("Model loaded successfully!")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error loading model: {e}")
|
||||||
|
raise
|
||||||
|
def split_sentence(self, sentence:str):
|
||||||
|
sentences = []
|
||||||
|
sentence_len = len(self.tokenize_sentence(sentence))
|
||||||
|
if sentence_len < 512:
|
||||||
|
sentences.append(sentence)
|
||||||
|
else:
|
||||||
|
temp_sentences = str(sentence).split('.')
|
||||||
|
for sent in temp_sentences:
|
||||||
|
sent_len = len(self.tokenize_sentence(sent))
|
||||||
|
if sent_len > 512:
|
||||||
|
temp_sentences_2 = str(sent).split('،')
|
||||||
|
for snt in temp_sentences_2:
|
||||||
|
sentences.append(snt)
|
||||||
|
else:
|
||||||
|
sentences.append(sent)
|
||||||
|
|
||||||
|
return sentences
|
||||||
|
|
||||||
|
    def load_json_data(self, file_path: str) -> List[str]:
        """
        Load Persian sentences from JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            List of Persian sentences
        """
        try:
            logger.info(f"Loading data from {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # convert dict{dict} to list[dict]
            if isinstance(data, dict):
                temp_data = []
                for item in data.items():
                    temp_data.append(item[1])
                data = temp_data

            sentences = []
            if isinstance(data, list):
                for index, item in enumerate(data):
                    print(f'split sentence {index}')
                    if isinstance(item, dict):
                        if item['content'] == '':
                            continue
                        sentences.append([item['id'], item['content'].strip()])
                        # for key in ['content']:
                        #     if key in item and item[key]:
                        #         # splited_sentences = self.split_sentence(item[key])
                        #         # splited_sentences = item[key]
                        #         sentences.append(item[key])
                        #         # for sent in splited_sentences:
                        #         #     sentences.append(sent)
                        #     else:
                        #         print('fault ' + item['sentence-number'])
                    elif isinstance(item, str):
                        # splited_sentences = self.split_sentence(item[key])
                        sentences.append(item)
                        # for sent in splited_sentences:
                        #     sentences.append(sent)
            elif isinstance(data, dict):
                # If it's a single object, extract all string values
                for value in data.values():
                    if isinstance(value, str):
                        sentences.append(value)
                        # splited_sentences = str(value).split('.')
                        # for sent in splited_sentences:
                        #     sentences.append(sent)

            sentences = [senten for senten in sentences if senten]
            logger.info(f"Loaded {len(sentences)} sentences")
            return sentences

        except Exception as e:
            logger.error(f"Error loading JSON data: {e}")
            raise

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess Persian text.

        Args:
            text: Raw Persian text

        Returns:
            Preprocessed text
        """
        # Normalize text
        # text = self.normalizer.normalize(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep Persian characters
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)

        return text.strip()

    def tokenize_sentence(self, sentence: str):
        try:
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            # print(self.model_name)
            tokens = tokenizer.tokenize(sentence)
            return tokens
        except Exception:
            error = "An exception occurred in tokenizer : " + self.model_name
            logger.error(error)
            # file.write(error + '\n')
            return []

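Note that tokenize_sentence builds a fresh AutoTokenizer on every call, and split_sentence calls it once per piece. A cached variant along the following lines would avoid the repeated loads; this is a sketch under the assumption that the class gains a `_tokenizer` attribute, not part of this commit:

    def tokenize_sentence_cached(self, sentence: str):
        # Hypothetical variant: keep one tokenizer per analyzer instance.
        if getattr(self, '_tokenizer', None) is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        try:
            return self._tokenizer.tokenize(sentence)
        except Exception as e:
            logger.error(f"Tokenizer failed for {self.model_name}: {e}")
            return []
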
    def extract_words(self, sentences: List[str]) -> List[str]:
        """
        Extract all words from sentences.

        Args:
            sentences: List of Persian sentences

        Returns:
            List of all words
        """
        all_words = []

        for sentence in sentences:
            # Preprocess sentence
            processed_sentence = self.preprocess_text(sentence)

            # Tokenize
            words = word_tokenize(processed_sentence)
            # words = processed_sentence.split()

            # Filter out empty strings and very short words
            words = [word for word in words if len(word) > 1]

            all_words.extend(words)

        logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
        return all_words

    def remove_stop_words(self, words: List[str]) -> List[str]:
        """
        Remove stop words from the word list.

        Args:
            words: List of words

        Returns:
            List of words without stop words
        """
        filtered_words = [word for word in words if word not in self.stop_words]
        logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
        return filtered_words

    def get_unique_words(self, words: List[str]) -> List[str]:
        """
        Get unique words from the list.

        Args:
            words: List of words

        Returns:
            List of unique words
        """
        unique_words = list(set(words))
        logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
        return unique_words

    def compute_word_vectors(self, sentences: List[str]) -> Dict[str, dict]:
        """
        Compute vector representations for the law sections.

        Args:
            sentences: List of section dicts (id, fullpath, qanon-title, section-prefix, content)

        Returns:
            Dictionary mapping each section key to its metadata and embedding
        """
        if self.model is None:
            self.load_model()

        logger.info(f"Computing vectors for {len(sentences)} sections ...")
        # print(sentences[0])
        # create a list of just the section contents
        just_sentences = [sent['content'] for sent in sentences]

        # Compute embeddings
        embeddings = self.model.encode(just_sentences, show_progress_bar=True)

        # Create dictionary
        sentences_vectors = {}
        for i, sent in enumerate(sentences):
            sentences_vectors[f'sentence-{sentences[i]["id"]}'] = {
                'id': sentences[i]['id'],
                'fullpath': sentences[i]['fullpath'],
                'qanon-title': sentences[i]['qanon-title'],
                'section-prefix': sentences[i]['section-prefix'],
                'content': sentences[i]['content'],
                'embeddings': embeddings[i].tolist()
            }
            print(f'section {i} embedded!')

        logger.info("section vectors computed successfully!")
        return sentences_vectors

    def find_closest_words(self, word_vectors: Dict[str, List[float]],
                           key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
        """
        Find the closest words to each key word.

        Args:
            word_vectors: Dictionary of word vectors
            key_words: List of key words to find neighbors for
            top_k: Number of closest words to find

        Returns:
            Dictionary mapping key words to their closest neighbors
        """
        logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")

        # Convert to numpy arrays for faster computation
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        closest_words = {}

        for key_word in key_words:
            if key_word in word_vectors:
                # Get the key word vector
                key_vector = np.array(word_vectors[key_word]).reshape(1, -1)

                # Compute cosine similarities
                similarities = cosine_similarity(key_vector, vectors)[0]

                # Get indices sorted by similarity (descending)
                word_indices = np.argsort(similarities)[::-1]

                # Filter out the key word itself and keep the top k
                closest_indices = []
                for idx in word_indices:
                    if words[idx] != key_word and len(closest_indices) < top_k:
                        closest_indices.append(idx)

                # Get the closest words
                closest_words[key_word] = [words[idx] for idx in closest_indices]
                logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")
            else:
                logger.warning(f"Key word '{key_word}' not found in word vectors")
                closest_words[key_word] = []

        return closest_words

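A minimal usage sketch, not part of the commit: word_vectors maps each word to a plain embedding list, and the sample words and values below are illustrative only.

# Illustrative only: tiny hand-made "embeddings" for three words.
analyzer = PersianVectorAnalyzer()
word_vectors = {
    'قانون': [0.12, 0.80, 0.33],
    'ماده': [0.10, 0.78, 0.35],
    'تبصره': [0.09, 0.75, 0.30],
}
neighbors = analyzer.find_closest_words(word_vectors, key_words=['قانون'], top_k=2)
print(neighbors['قانون'])  # the two remaining words, ordered by cosine similarity
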
    def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
                     method: str = 'tsne') -> Dict[str, List[float]]:
        """
        Reduce word vectors to 3D coordinates.

        Args:
            word_vectors: Dictionary of word vectors
            method: Dimensionality reduction method ('pca' or 'tsne')

        Returns:
            Dictionary mapping words to their 3D coordinates
        """
        logger.info(f"Reducing dimensions to 3D using {method.upper()}...")

        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        if method.lower() == 'pca':
            reducer = PCA(n_components=3, random_state=42)
        elif method.lower() == 'tsne':
            reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors) - 1))
        else:
            raise ValueError("Method must be 'pca' or 'tsne'")

        # Reduce dimensions
        reduced_vectors = reducer.fit_transform(vectors)

        # Create dictionary
        word_vectors_3d = {}
        for i, word in enumerate(words):
            word_vectors_3d[word] = reduced_vectors[i].tolist()

        logger.info("Dimensionality reduction completed!")
        return word_vectors_3d

    def save_json(self, data: dict, file_path: str):
        """
        Save data to JSON file.

        Args:
            data: Data to save
            file_path: Output file path
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Data saved to {file_path}")
        except Exception as e:
            logger.error(f"Error saving to {file_path}: {e}")
            raise

    # def create_3d_visualization(self, word_vectors_3d: Dict[str, List[float]],
    #                             selected_words: Dict[str, List[str]],
    #                             output_path: str = "persian_words_3d.html"):
    #     """
    #     Create 3D visualization of words.
    #
    #     Args:
    #         word_vectors_3d: Dictionary of 3D word coordinates
    #         selected_words: Dictionary of selected words for each key word
    #         output_path: Output file path for the visualization
    #     """
    #     logger.info("Creating 3D visualization...")
    #
    #     # Prepare data for plotting
    #     words = list(word_vectors_3d.keys())
    #     coords = np.array(list(word_vectors_3d.values()))
    #
    #     # Create color mapping for key words and their neighbors
    #     colors = []
    #     sizes = []
    #     hover_texts = []
    #
    #     for word in words:
    #         # Check if word is a key word
    #         is_key_word = word in self.key_words
    #
    #         # Check if word is in selected words
    #         in_selected = False
    #         key_word_group = None
    #         for key_word, selected_list in selected_words.items():
    #             if word in selected_list:
    #                 in_selected = True
    #                 key_word_group = key_word
    #                 break
    #
    #         if is_key_word:
    #             colors.append('red')
    #             sizes.append(15)
    #             hover_texts.append(f"کلیدواژه: {word}")
    #         elif in_selected:
    #             colors.append('blue')
    #             sizes.append(10)
    #             hover_texts.append(f"کلمه مرتبط با '{key_word_group}': {word}")
    #         else:
    #             colors.append('lightgray')
    #             sizes.append(5)
    #             hover_texts.append(f"کلمه: {word}")
    #
    #     # Create 3D scatter plot
    #     fig = go.Figure()
    #
    #     # Add scatter plot
    #     fig.add_trace(go.Scatter3d(
    #         x=coords[:, 0],
    #         y=coords[:, 1],
    #         z=coords[:, 2],
    #         mode='markers+text',
    #         marker=dict(
    #             size=sizes,
    #             color=colors,
    #             opacity=0.8
    #         ),
    #         text=words,
    #         textposition="middle center",
    #         hovertext=hover_texts,
    #         hoverinfo='text'
    #     ))
    #
    #     # Update layout
    #     fig.update_layout(
    #         title={
    #             'text': 'نمایش سهبعدی کلمات فارسی',
    #             'x': 0.5,
    #             'xanchor': 'center',
    #             'font': {'size': 20}
    #         },
    #         scene=dict(
    #             xaxis_title='محور X',
    #             yaxis_title='محور Y',
    #             zaxis_title='محور Z',
    #             camera=dict(
    #                 eye=dict(x=1.5, y=1.5, z=1.5)
    #             )
    #         ),
    #         width=1000,
    #         height=800,
    #         showlegend=False
    #     )
    #
    #     # Save the plot
    #     fig.write_html(output_path)
    #     logger.info(f"3D visualization saved to {output_path}")
    #
    #     return fig

def process_pipeline(self, input_file: str, output_dir: str = "output"):
|
||||||
|
"""
|
||||||
|
Run the complete processing pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_file(str): Path to input JSON file
|
||||||
|
output_dir(str): Output directory for results
|
||||||
|
"""
|
||||||
|
# Create output directory
|
||||||
|
Path(output_dir).mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
logger.info("Starting Persian Vector Analysis Pipeline...")
|
||||||
|
|
||||||
|
# Step 1: Load data
|
||||||
|
# sentences = self.load_json_data(input_file)
|
||||||
|
sentences = ALL_SECTIONS
|
||||||
|
|
||||||
|
# for s in sentences:
|
||||||
|
# s_len = len(self.tokenize_sentence(s))
|
||||||
|
# if s_len > 512:
|
||||||
|
# print(f'long: {s}')
|
||||||
|
# Step 2: Extract words
|
||||||
|
# all_words = self.extract_words(sentences)
|
||||||
|
|
||||||
|
# Step 3: Remove stop words
|
||||||
|
# filtered_words = self.remove_stop_words(all_words)
|
||||||
|
# filtered_words = all_words
|
||||||
|
|
||||||
|
# Step 4: Get unique words
|
||||||
|
# unique_words = self.get_unique_words(filtered_words)
|
||||||
|
|
||||||
|
# Step 5: Compute word vectors
|
||||||
|
sentences_vectors = self.compute_word_vectors(sentences)
|
||||||
|
|
||||||
|
# Step 6: Save word vectors
|
||||||
|
self.save_json(sentences_vectors, f"{output_dir}/sections-vec-285k.json")
|
||||||
|
|
||||||
|
# Step 7: Find closest words to key words
|
||||||
|
# selected_words = self.find_closest_words(word_vectors, self.key_words)
|
||||||
|
|
||||||
|
# Step 8: Save selected words
|
||||||
|
# self.save_json(selected_words, f"{output_dir}/selected_words.json")
|
||||||
|
|
||||||
|
# Step 9: Reduce to 3D
|
||||||
|
# word_vectors_3d = self.reduce_to_3d(word_vectors, method='tsne')
|
||||||
|
|
||||||
|
# Step 10: Save 3D vectors
|
||||||
|
# self.save_json(word_vectors_3d, f"{output_dir}/words_vector_3d.json")
|
||||||
|
|
||||||
|
# Step 11: Create visualization
|
||||||
|
# self.create_3d_visualization(word_vectors_3d, selected_words,
|
||||||
|
# f"{output_dir}/persian_words_3d.html")
|
||||||
|
|
||||||
|
logger.info("Pipeline completed successfully!")
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("PIPELINE SUMMARY")
|
||||||
|
print("="*50)
|
||||||
|
print(f"Input sentences: {len(sentences)}")
|
||||||
|
# print(f"Total words extracted: {len(all_words)}")
|
||||||
|
# print(f"Unique words after preprocessing: {len(unique_words)}")
|
||||||
|
# print(f"Word vectors computed: {len(word_vectors)}")
|
||||||
|
# print(f"Key words processed: {len(self.key_words)}")
|
||||||
|
print(f"Output files saved to: {output_dir}/")
|
||||||
|
print("="*50)
|
||||||
|
|
||||||
|
def full_path_text_maker(full_path):
    """
    Take the tree path of a section and rebuild a text that orders its parts
    from the most specific to the most general, then return it.

    Args:
        full_path (list): List of elements identifying the tree path of this section

    Returns:
        full_path_text (str): Text reconstructed from the section's path
    """
    full_path_text = ""
    for i, path_item in enumerate(reversed(full_path)):
        if i == len(full_path) - 1:
            full_path_text += ''.join(f'{path_item}')
            break
        full_path_text += ''.join(f'{path_item} از ')
    full_path_text = full_path_text.strip()
    return full_path_text

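For example (the path elements below are made-up sample values, not from the dataset):

# Illustrative only: hypothetical section path.
path = ['قانون مدنی', 'باب اول', 'ماده 1']
print(full_path_text_maker(path))
# -> 'ماده 1 از باب اول از قانون مدنی'
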
def main():
    """
    Main function to run the Persian Vector Analysis.
    """
    # Initialize analyzer
    analyzer = PersianVectorAnalyzer()

    # Define input and output paths
    # input_file = "./output-speechs/nahj_speechs_sentences.json"
    # output_dir = "output-speechs"
    # input_file = "./majles/data/sections.json"
    input_file = ""
    output_dir = "./data/majles-output"

    # Run the complete pipeline
    analyzer.process_pipeline(input_file, output_dir)

if __name__ == "__main__":
|
||||||
|
eh_obj = ElasticHelper()
|
||||||
|
path = "/home/gpu/data_11/14040611/mj_qa_section.zip"
|
||||||
|
sections_elastic = eh_obj.iterateJsonFile(path, True)
|
||||||
|
all_count = 0
|
||||||
|
dont_cares = []
|
||||||
|
ALL_SECTIONS = []
|
||||||
|
for index, item in enumerate(sections_elastic):
|
||||||
|
all_count +=1
|
||||||
|
source = item['source']
|
||||||
|
section_path = source['other_info']['full_path']
|
||||||
|
id = item['id']
|
||||||
|
|
||||||
|
filtered_keys = ['فصل','موخره','امضاء','عنوان']
|
||||||
|
section_path = source['other_info']['full_path']
|
||||||
|
flag = False
|
||||||
|
if '>' in section_path:
|
||||||
|
path_parts = section_path.split('>')
|
||||||
|
for key in filtered_keys:
|
||||||
|
if key in path_parts[-1]:
|
||||||
|
dont_cares.append(id)
|
||||||
|
flag = True
|
||||||
|
break
|
||||||
|
if flag:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
for key in filtered_keys:
|
||||||
|
if key in section_path:
|
||||||
|
dont_cares.append(id)
|
||||||
|
flag = True
|
||||||
|
break
|
||||||
|
if flag:
|
||||||
|
continue
|
||||||
|
|
||||||
|
qanon_title = source['qanon_title']
|
||||||
|
full_path_text = full_path_text_maker(section_path.split('>'))
|
||||||
|
section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "
|
||||||
|
|
||||||
|
try:
|
||||||
|
content = cleaning(item['source']['content'])
|
||||||
|
# کنار گذاشتن سکشن های خیلی کوچک که عملا محتوا ندارند
|
||||||
|
if len(content.split()) <= 10:
|
||||||
|
continue
|
||||||
|
except Exception as error:
|
||||||
|
print(error)
|
||||||
|
continue
|
||||||
|
data = {
|
||||||
|
'id': id,
|
||||||
|
'fullpath': section_path,
|
||||||
|
'qanon-title': qanon_title,
|
||||||
|
'section-prefix': section_prefix,
|
||||||
|
'content': content
|
||||||
|
}
|
||||||
|
ALL_SECTIONS.append(data)
|
||||||
|
print(f'all_count: {all_count}')
|
||||||
|
print(f'dont_cares: {len(dont_cares)}')
|
||||||
|
print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')
|
||||||
|
|
||||||
|
main()
|
||||||
|
|
||||||
|
"""
|
||||||
|
:: *** نکته مهم *** ::
|
||||||
|
NOTE !!! after this process run convert_qavanin_json_to_faiss.py due to create faiss index which is used in RAG process
|
||||||
|
"""
|
||||||
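convert_qavanin_json_to_faiss.py itself is not part of this diff. A minimal sketch of what that conversion step could look like, assuming the JSON layout written by save_json above; the file names and the flat inner-product index are illustrative choices, not the project's actual ones:

# Illustrative sketch: build a FAISS index from the saved section embeddings.
import json
import numpy as np
import faiss

with open("./data/majles-output/sections-vec-285k.json", encoding="utf-8") as f:
    sections = json.load(f)

keys = list(sections.keys())
vectors = np.array([sections[k]["embeddings"] for k in keys], dtype="float32")
faiss.normalize_L2(vectors)                  # cosine similarity via inner product
index = faiss.IndexFlatIP(vectors.shape[1])
index.add(vectors)
faiss.write_index(index, "./data/majles-output/sections-285k.index")
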
76
util/normalizer.py
Executable file
76
util/normalizer.py
Executable file
|
|
@ -0,0 +1,76 @@
#import hazm
from cleantext import clean
import re

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

#normalizer = hazm.Normalizer()
wierd_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

def cleaning(text):
    text = text.strip()

    # regular cleaning
    # text = clean(text,
    #     fix_unicode=True,
    #     to_ascii=False,
    #     lower=True,
    #     no_line_breaks=True,
    #     no_urls=True,
    #     no_emails=True,
    #     no_phone_numbers=True,
    #     no_numbers=False,
    #     no_digits=False,
    #     no_currency_symbols=True,
    #     no_punct=False,
    #     replace_with_url="",
    #     replace_with_email="",
    #     replace_with_phone_number="",
    #     replace_with_number="",
    #     replace_with_digit="0",
    #     replace_with_currency_symbol="",
    # )
    text = clean(text,
                 extra_spaces=True,
                 lowercase=True
                 )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    #text = normalizer.normalize(text)

    # removing weird patterns (emoji, directional marks, etc.)
    text = wierd_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)

    return text
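A short usage sketch of cleaning(); the sample string is made up, and the exact output depends on the installed cleantext version:

# Illustrative only: HTML tags, emoji and '#' are removed, whitespace collapsed.
raw = "<p>ماده  ۱  #قانون 😀</p>"
print(cleaning(raw))
# roughly -> 'ماده ۱ قانون'
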