# Reads law-section change records (previous vs. effected text) from a local
# SQL Server database and pushes them in batches to the Tavasi dataset API.
import json
import re
from decimal import Decimal
from urllib.parse import quote_plus

import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError
# Token and headers for API requests.
# NOTE(security): this is a hard-coded bearer token committed to source.
# Move it to an environment variable or a secrets store before sharing.
TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg3MTY3MTMsImp0aSI6Im1aY0MwSEdIV3dxb1ppWVwvb2VqMlwvT2FWc3FTOFIwSTkiLCJpc3MiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZXhwIjoxNzIwMDE2NzEyLCJhdWQiOiJodHRwczpcL1wvY3AudGF2YXNpLmlyIiwiZGF0YSI6eyJpZCI6NywiZmlyc3RfbmFtZSI6Ilx1MDY0NVx1MDYzNVx1MDYzN1x1MDY0MVx1MDZjYyIsImxhc3RfbmFtZSI6Ilx1MDYyOFx1MDY0N1x1MDYyZlx1MDYyN1x1MDY0Nlx1MDZjYyIsImVtYWlsIjoiamFtdXNiMjc0NzRAZ21haWwuY29tIiwidXNlcm5hbWUiOiJtYmVoZGFuaSIsInVzZXJfbGV2ZWwiOjF9fQ.NhxbjdXMCEb_ninOBKpzbUsaAmxva1zpShuesXrVpEQ'
ACCEPT = "application/json"
HEADERS = dict(Authorization=TOKEN, Accept=ACCEPT)
# Database connection settings (Windows integrated auth, local SQL Server).
db_host = 'DESKTOP-0STSURA\\MUSTAFA'
db_name = 'Qavanin'
db_driver = 'ODBC Driver 17 for SQL Server'
db_trusted_connection = 'yes'

# Build the SQLAlchemy connection URL.
# Fix: the driver name contains spaces and was previously interpolated raw
# into the URL query string; URL-encode it so SQLAlchemy/pyodbc parse it
# reliably (per the SQLAlchemy mssql+pyodbc hostname-connection docs).
connection_string = (
    f"mssql+pyodbc://@{db_host}/{db_name}"
    f"?driver={quote_plus(db_driver)}"
    f"&Trusted_Connection={db_trusted_connection}"
)

# Create the SQLAlchemy engine (connections are opened lazily on first use).
engine = create_engine(connection_string)
# Smoke-test the database connection before doing any real work; re-raise
# so the script stops immediately when the server is unreachable.
try:
    with engine.connect():
        print("اتصال به سرور موفقیتآمیز بود.")
except OperationalError as e:
    print(f"خطا در اتصال به سرور: {e}")
    raise
# Run the SQL query and load the result set into a DataFrame.
# For every section change on a law, fetch the previous text of the affected
# section (latest matching log entry) alongside its current text.
query = """
SELECT
(select Top(1) gl5.SECTIONTEXT from lwSectionLog gl5 where gl1.[F_LWSECTIONLOGID_EFFECTED]=gl5.ID ) as effect_prev_text,
gl2.[SECTIONTEXT] as effected_text
FROM [Qavanin].[dbo].[lwSectionChange] gl1
LEFT JOIN lwSection gl2 on gl2.ID=gl1.[F_LWSECTIONID_EFFECTED]
LEFT JOIN lwLaw gl4 on gl1.F_LWLAWID_EFFECTIVE=gl4.ID
WHERE
gl4.ISLAW = 1
AND gl1.[F_LWSECTIONID_EFFECTED] is not null
ORDER BY gl1.[F_LWSECTIONID_EFFECTED], gl4.APPROVEDATE
"""

df = pd.read_sql(query, engine)
# Control characters to strip, excluding \t (\011), \n (\012) and \r (\015).
# Fix: compiled once at module level (the original re-ran re.sub's pattern
# lookup per DataFrame cell) and the three alternations merged into a
# single character class.
_ILLEGAL_CHARS_RE = re.compile(r'[\000-\010\013\014\016-\037]')


def remove_illegal_chars(value):
    """Strip ASCII control characters (except tab/newline/CR) from *value*.

    Non-string values (numbers, None/NaN) are returned unchanged, so the
    function is safe to apply cell-wise over a DataFrame of mixed content.
    """
    if isinstance(value, str):
        return _ILLEGAL_CHARS_RE.sub('', value)
    return value
# Cell-wise cleanup of illegal control characters.
df = df.applymap(remove_illegal_chars)

# Keep only the two text columns the API consumes.
df = df[['effect_prev_text', 'effected_text']]

# Split the frame into 15 roughly equal chunks (ceiling division so the
# last partial chunk is not dropped).
num_rows = len(df)
num_chunks = 15
chunk_size = -(-num_rows // num_chunks)

# Endpoint for bulk-adding keyword records to the dataset.
BASE_URL = "https://api.tavasi.ir/repo/dataset/multi/add/qasection/keyword"
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes :class:`decimal.Decimal` as ``float``.

    SQL Server numeric columns come back as ``Decimal``, which the stock
    encoder rejects; everything else defers to the base class (and so still
    raises ``TypeError`` for unserializable objects).
    """

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        # Fix (idiom): super() instead of an explicit base-class call.
        return super().default(obj)
def isNeedHtml(html):
    """Return True when *html* contains table markup that must stay as HTML.

    Matches opening ``<table`` tags and closing ``</tr`` tags.
    Fix: the original tested only all-upper and all-lower spellings and
    missed mixed-case tags such as ``<Table``; lowercasing once makes the
    check case-insensitive (a backward-compatible superset).
    """
    lowered = html.lower()
    return "<table" in lowered or "</tr" in lowered
def removeHtmlTags(html, exeptionTag=None):
    """Remove HTML tags from *html* and return the cleaned string.

    With no ``exeptionTag`` (the default) every tag is stripped.
    Fix: the original returned the text untouched in this case, making the
    call sites that invoke ``removeHtmlTags(content)`` for cleanup silent
    no-ops, contrary to their stated intent.

    When ``exeptionTag`` lists tag names, only those tags are removed; the
    original regex is kept verbatim for compatibility, including its quirk
    that a name also matches tags it merely prefixes (e.g. ``"b"`` also
    strips ``<br>``).
    Fix: the mutable default argument ``[]`` was replaced with ``None``.
    """
    if exeptionTag:
        # Same alternation the original built: (tag1)|(tag2)|...
        exceptTags = '|'.join('(' + tag + ')' for tag in exeptionTag)
        reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + ')(?P<class>[^>]+)*>'
        return re.sub(reg1, '', html)
    # No tag list given: strip every tag.
    return re.sub(r'</?[^>]+>', '', html)
def createIndex(content, values):
    """Build the payload dict the dataset API expects for one diff record.

    *content* is the previous section text, *values* the effected text; the
    latter is wrapped as a single scored value inside ``result_objects``.
    """
    diff_values = [{"text": values, "score": 4}]  # example similarity score
    return {
        "content": content,
        "domain": "تغییرات قانون",
        "ref_id": "",
        "ref_url": "",
        "task": "diff",
        "result_objects": [
            {
                "task": "diff",
                "key": "qasection_ref_state_diff",
                "label": "تغییرات در قانون",
                "values": diff_values,
            }
        ],
    }
def _post_batch(batch):
    """POST one batch of records to the API; raise on HTTP error status."""
    payload = json.dumps(batch, cls=JSONEncoder)
    response = requests.post(BASE_URL, headers=HEADERS, data=payload)
    # Fix: the original ignored the response entirely, silently losing data
    # on server errors; fail loudly instead.
    response.raise_for_status()


def send_data_to_api(data_list):
    """Filter, clean, and send (effect_prev_text, effected_text) records.

    Rows with an empty side are skipped, as are rows containing table
    markup (which must stay as HTML); remaining HTML tags are stripped.
    Records are POSTed in batches of 11 (the original ``> 10`` threshold
    is preserved), plus one final partial batch.

    Raises :class:`requests.HTTPError` when the API returns an error.
    """
    bulk_data = []
    for item in data_list:
        content = item["effect_prev_text"]
        values = item["effected_text"]

        # Skip rows where either side is missing/empty.
        if not content or not values:
            continue

        # Skip rows that must remain HTML (contain table markup).
        if isNeedHtml(content) or isNeedHtml(values):
            continue

        # Strip HTML tags from both texts.
        content = removeHtmlTags(content)
        values = removeHtmlTags(values)

        bulk_data.append(createIndex(content, values))

        if len(bulk_data) > 10:
            _post_batch(bulk_data)
            bulk_data = []

    # Flush the final partial batch, if any.
    if bulk_data:
        _post_batch(bulk_data)
# Send the DataFrame to the API one chunk at a time.
for chunk_index in range(num_chunks):
    start_row = chunk_index * chunk_size
    chunk_df = df.iloc[start_row:start_row + chunk_size]

    data_list = [
        {
            "effect_prev_text": row['effect_prev_text'],
            "effected_text": row['effected_text'],
        }
        for _, row in chunk_df.iterrows()
    ]

    send_data_to_api(data_list)

print("تمام دادهها با موفقیت ارسال شدند.")