ai_dataset/import_data/diff_result_objects.py

151 lines
5.4 KiB
Python
Raw Permalink Normal View History

2024-09-17 16:44:55 +00:00
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError
import re
import requests
import json
from decimal import Decimal
# توکن و هدر برای ارسال درخواست به API
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg3MTY3MTMsImp0aSI6Im1aY0MwSEdIV3dxb1ppWVwvb2VqMlwvT2FWc3FTOFIwSTkiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIwMDE2NzEyLCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6NywiZmlyc3RfbmFtZSI6Ilx1MDY0NVx1MDYzNVx1MDYzN1x1MDY0MVx1MDZjYyIsImxhc3RfbmFtZSI6Ilx1MDYyOFx1MDY0N1x1MDYyZlx1MDYyN1x1MDY0Nlx1MDZjYyIsImVtYWlsIjoiamFtdXNiMjc0NzRAZ21haWwuY29tIiwidXNlcm5hbWUiOiJtYmVoZGFuaSIsInVzZXJfbGV2ZWwiOjF9fQ.NhxbjdXMCEb_ninOBKpzbUsaAmxva1zpShuesXrVpEQ'
ACCEPT = "application/json"
HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
# اطلاعات اتصال به پایگاه داده
db_host = 'DESKTOP-0STSURA\\MUSTAFA'
db_name = 'Qavanin'
db_driver = 'ODBC Driver 17 for SQL Server'
db_trusted_connection = 'yes'
# ایجاد یک رشته اتصال (connection string)
connection_string = f"mssql+pyodbc://@{db_host}/{db_name}?driver={db_driver}&Trusted_Connection={db_trusted_connection}"
# ایجاد یک engine با استفاده از SQLAlchemy
engine = create_engine(connection_string)
# بررسی اتصال به پایگاه داده
try:
with engine.connect() as connection:
print("اتصال به سرور موفقیت‌آمیز بود.")
except OperationalError as e:
print(f"خطا در اتصال به سرور: {e}")
raise
# اجرای کوئری SQL و خواندن داده‌ها به یک DataFrame
query = """
SELECT
(select Top(1) gl5.SECTIONTEXT from lwSectionLog gl5 where gl1.[F_LWSECTIONLOGID_EFFECTED]=gl5.ID ) as effect_prev_text,
gl2.[SECTIONTEXT] as effected_text
FROM [Qavanin].[dbo].[lwSectionChange] gl1
LEFT JOIN lwSection gl2 on gl2.ID=gl1.[F_LWSECTIONID_EFFECTED]
LEFT JOIN lwLaw gl4 on gl1.F_LWLAWID_EFFECTIVE=gl4.ID
WHERE
gl4.ISLAW = 1
AND gl1.[F_LWSECTIONID_EFFECTED] is not null
ORDER BY gl1.[F_LWSECTIONID_EFFECTED], gl4.APPROVEDATE
"""
df = pd.read_sql(query, engine)
# تابع برای حذف کاراکترهای غیرمجاز
def remove_illegal_chars(value):
if isinstance(value, str):
return re.sub(r'[\000-\010]|[\013-\014]|[\016-\037]', '', value)
return value
df = df.applymap(remove_illegal_chars)
df = df[['effect_prev_text', 'effected_text']]
num_rows = len(df)
num_chunks = 15
chunk_size = num_rows // num_chunks + (1 if num_rows % num_chunks != 0 else 0)
BASE_URL = "https://api.tavasi.ir/repo/dataset/multi/add/qasection/keyword"
class JSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, Decimal):
return float(obj)
return json.JSONEncoder.default(self, obj)
def isNeedHtml(html):
if "<TABLE" in html or "<table" in html or "</TR" in html or "</tr" in html :
return True
return False
def removeHtmlTags(html, exeptionTag=[]):
if exeptionTag.__len__():
exceptTags = ''
for tag in exeptionTag:
if exceptTags != '':
exceptTags += '|'
exceptTags += '(' + tag + ')'
reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + ')(?P<class>[^>]+)*>'
html = re.sub(reg1, '', html)
return html
def createIndex(content, values):
result_objects = [{
"task": "diff",
"key": "qasection_ref_state_diff",
"label": "تغییرات در قانون",
"values": [{"text": values, "score": 4}] # مثال امتیاز تشابه
}]
output = {
"content": content,
"domain": "تغییرات قانون",
"ref_id": "",
"ref_url": "",
"task": "diff",
"result_objects": result_objects,
}
return output
def send_data_to_api(data_list):
bulk_data = []
for item in data_list:
content = item["effect_prev_text"]
values = item["effected_text"]
# بررسی خالی نبودن content و values
if not content or not values:
continue
# بررسی وجود تگ HTML خاص
if isNeedHtml(content) or isNeedHtml(values):
continue
# حذف تگ‌های HTML از content و values
content = removeHtmlTags(content)
values = removeHtmlTags(values)
data = createIndex(content, values)
bulk_data.append(data)
if len(bulk_data) > 10:
payload = json.dumps(bulk_data, cls=JSONEncoder)
response = requests.post(BASE_URL, headers=HEADERS, data=payload)
bulk_data = []
if bulk_data:
payload = json.dumps(bulk_data, cls=JSONEncoder)
response = requests.post(BASE_URL, headers=HEADERS, data=payload)
# for i, line in mainList :
for i in range(num_chunks):
start_row = i * chunk_size
end_row = start_row + chunk_size
chunk_df = df.iloc[start_row:end_row]
data_list = []
for index, row in chunk_df.iterrows():
data_list.append({
"effect_prev_text": row['effect_prev_text'],
"effected_text": row['effected_text']
})
send_data_to_api(data_list)
print("تمام داده‌ها با موفقیت ارسال شدند.")