Merge branch 'md83' of https://git2.tavasi.ir/ajokar/Nlp_models
Commit: 049311691e

persian_nlp_model.db: new binary file (not shown)

persian_nlp_model_sqlite.py: new file, 326 lines:

# In the name of God
from huggingface_hub import HfApi
from datetime import date
from fpdf import FPDF
import random
import sqlite3
import string

list1 = ["ID", "model_id", "url", "downloads", "private", "author", "tags", "tag_dataset",
         "tag_base_model", "tag_license", "tag_region", "pipeline_tag", "Likes", "languages",
         "library", "datasets", "license", "just_persian", "deleted", "date_added"]

cnt = sqlite3.connect("persian_nlp_model.db")
c = cnt.cursor()

today = date.today()
d1 = today.strftime("%d-%m-%Y")

# Use this code only the first time, when the table needs to be created
# c.execute("""CREATE TABLE PersianNlp(
#     ID INT PRIMARY KEY,
#     model_id TEXT,
#     url TEXT,
#     downloads INT,
#     private TEXT,
#     author TEXT,
#     tags TEXT,
#     tag_dataset TEXT,
#     tag_base_model TEXT,
#     tag_license TEXT,
#     tag_region TEXT,
#     pipeline_tag TEXT,
#     Likes INT,
#     languages TEXT,
#     library TEXT,
#     datasets TEXT,
#     license TEXT,
#     just_persian TEXT,
#     deleted TEXT,
#     date_added TEXT
# );""")


# Use this code to create the download-count table
# c.execute("""CREATE TABLE downloadCountHistory(
#     ID INT PRIMARY KEY,
#     key_id INT,
#     downloads INT,
#     date TEXT
# );""")
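
# A minimal alternative sketch (my suggestion, not part of the original script):
# CREATE TABLE IF NOT EXISTS makes the setup idempotent, so the statements could stay
# enabled instead of being commented out after the first run, e.g.:
#
# c.execute("""CREATE TABLE IF NOT EXISTS downloadCountHistory(
#     ID INT PRIMARY KEY,
#     key_id INT,
#     downloads INT,
#     date TEXT
# );""")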


# Common NLP tasks
nlp_task_list = [
    "text-classification",
    "token-classification",
    "question-answering",
    "summarization",
    "translation",
    "text-generation",
    "fill-mask",
    "zero-shot-classification",
    "feature-extraction",
    "sentence-similarity",
    "text2text-generation",
    "conversational"
]

def generate_random_id(length=10, chars=string.ascii_letters + string.digits):
    """
    Generates a random ID of the given length from the given characters.

    :param length: length of the ID (default: 10)
    :param chars: string of allowed characters (default: lowercase and uppercase English letters + digits)
    :return: the random ID string
    """
    return ''.join(random.choice(chars) for _ in range(length))

def persian_model_finder(nlp_tasks, idx):

    today = date.today()
    download_date = today.strftime("%d/%m/%Y")
    idX = idx  # pass in the last ID present in the database so new entries do not clash with earlier IDs
    api = HfApi()
    all_persian_nlp_models_data = []
    seen_model_ids = set()  # to avoid adding duplicate models

    print("Searching for and extracting information on Persian NLP models...")

    # Filter and iterate over the models.
    # For each NLP task, we search for Persian models.
    # A limit of e.g. 500 models per task can be set to avoid excessive downloads;
    # extracting all models (limit=None) may require pagination.

    try:
        allModel = c.execute('''SELECT *
                                FROM PersianNlp''')
        for model in allModel:
            seen_model_ids.add(model[1])
    except sqlite3.OperationalError:
        print("database table not found!")

    for task in nlp_tasks:

        print(f"  Searching for task: {task} (language: Persian)...")
        models_for_task = api.list_models(
            language="fa",
            task=task,
            sort="downloads",
            direction=-1,  # descending (most downloads first)
            limit=None  # you can change this number
        )

        for model_info in models_for_task:
            if model_info.id not in seen_model_ids:
                idX += 1
                model_ = api.model_info(model_info.id)  # fetch the full model info by id
                # build a dictionary from the model card, which contains the model's metadata
                # (guarded, since some models have no card data at all)
                card_data_dict = model_.card_data.to_dict() if model_.card_data else {}
                model_data = {
                    "model_id": model_info.id,
                    "url": f"https://huggingface.co/{model_info.id}",
                    "downloads": model_info.downloads,
                    "private": model_info.private,
                    "author": model_info.author,
                    "tags": model_info.tags,  # includes languages, tasks, libraries, etc.
                    "tag_dataset": "-",
                    "tag_base_model": "-",
                    "tag_license": "-",
                    "tag_region": "-",
                    "pipeline_tag": model_info.pipeline_tag,  # the model's main task as assigned by the Hub
                    "Likes": model_info.likes,
                    # the four entries below are read from the model-card dictionary
                    "languages": card_data_dict.get('language', 'N/A'),  # supported languages
                    "library": card_data_dict.get('library', 'N/A'),  # libraries used
                    "datasets": card_data_dict.get('datasets', 'N/A'),  # datasets used
                    "license": card_data_dict.get('license', 'N/A'),
                    "just_persian": "False",
                    "deleted": "False",
                    "date_added": f"{download_date}"
                }

                if model_data["library"] == 'N/A':  # in some cases the library is stored under 'library_name' in the model-card dictionary
                    model_data["library"] = card_data_dict.get('library_name', 'N/A')
                # the condition below flags the models that are Persian-only
                if ((len(model_data["languages"]) == 2 and "multilingual" in model_data["languages"])
                        or (len(model_data["languages"]) == 2 and "persian" in model_data["languages"])
                        or (len(model_data["languages"]) == 2 and "farsi" in model_data["languages"])
                        or (len(model_data["languages"]) == 2 and "fas" in model_data["languages"])
                        or model_data["languages"] == "fa"
                        or model_data["languages"] == "persian"
                        or model_data["languages"] == "farsi"
                        or model_data["languages"] == "fas"
                        or model_data["languages"] == "pes"
                        or len(model_data["languages"]) == 1):
                    model_data["just_persian"] = "True"

                for value in model_data["tags"]:

                    if "dataset:" in value:
                        if isinstance(model_data["tag_dataset"], str):
                            model_data["tag_dataset"] = []  # swap the "-" placeholder for a list
                        model_data["tag_dataset"].append(str(value).replace("dataset:", ""))

                    if "base_model:" in value:
                        if isinstance(model_data["tag_base_model"], str):
                            model_data["tag_base_model"] = []  # swap the "-" placeholder for a list
                        model_data["tag_base_model"].append(str(value).replace("base_model:", ""))

                    if "region:" in value:
                        model_data["tag_region"] = str(value).replace("region:", "")

                    if "license:" in value:
                        model_data["tag_license"] = str(value).replace("license:", "")

                # all_persian_nlp_models_data.append(model_data)
                # a parameterized INSERT, so values containing quotes cannot break the SQL
                c.execute("""INSERT INTO PersianNlp (ID,model_id,url,downloads,private,author,tags,tag_dataset,tag_base_model,tag_license,tag_region,pipeline_tag,Likes,languages,library,datasets,license,just_persian,deleted,date_added)
                             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
                          (idX, model_data["model_id"], model_data["url"], model_data["downloads"],
                           str(model_data["private"]), model_data["author"], str(model_data["tags"]),
                           str(model_data["tag_dataset"]), str(model_data["tag_base_model"]),
                           model_data["tag_license"], model_data["tag_region"], model_data["pipeline_tag"],
                           model_data["Likes"], str(model_data["languages"]), str(model_data["library"]),
                           str(model_data["datasets"]), str(model_data["license"]),
                           model_data["just_persian"], model_data["deleted"], model_data["date_added"]))
                cnt.commit()
                seen_model_ids.add(model_info.id)

    print(f"\nTotal number of unique Persian NLP models found: {len(seen_model_ids)}")

# First pass the list of tasks to search for, then the last ID present in the models table of the database
# persian_model_finder(nlp_task_list, 8288)
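
# A small sketch (my addition, not in the original script): instead of hard-coding the
# last ID, it can be read from the table itself; MAX(ID) is None when the table is empty.
#
# row = c.execute("SELECT MAX(ID) FROM PersianNlp").fetchone()
# last_id = row[0] if row[0] is not None else 0
# persian_model_finder(nlp_task_list, last_id)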


def search(name, search_by):

    X = "------------------------------------------------------------------------------------\n+-+-+-+- FOUND MODEL +-+-+-+-\n------------------------------------------------------------------------------------\n\n"
    n = 0

    if search_by == "name":

        # parameterized query, so quotes in the name cannot break the SQL
        model = c.execute('''SELECT *
                             FROM PersianNlp
                             WHERE model_id=?''', (name,))
        for x in model:
            for y in x:
                X += f"{list1[n]} : {y}\n-----------------------------------------------------------------\n"
                n += 1
            X += "\n\n"
            n = 0
        print(X)

    if search_by == "task":

        model = c.execute('''SELECT *
                             FROM PersianNlp
                             WHERE pipeline_tag=?''', (name,))

        for x in model:
            for y in x:
                X += f"{list1[n]} : {y}\n-----------------------------------------------------------------\n"
                n += 1
            X += "\n\n"
            n = 0
        print(X)

# search("text-ranking","task")
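
# A hedged variant (my addition, not part of the original script): both branches above
# require an exact match; a LIKE pattern would allow partial-name search, e.g.:
#
# model = c.execute('''SELECT * FROM PersianNlp
#                      WHERE model_id LIKE ?''', (f"%{name}%",))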


def pdf_saver(name):

    X = "------------------------------------------------------------------------------------\n+-+-+-+- FOUND MODEL +-+-+-+-\n------------------------------------------------------------------------------------\n\n"
    model = c.execute('''SELECT *
                         FROM PersianNlp
                         WHERE model_id=?''', (name,))
    n = 0
    for x in model:
        for y in x:
            X += f"{list1[n]} : {y}\n-----------------------------------------------------------------\n"
            n += 1
    print(X)
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, X)
    pdf.output("found_model.pdf")
    print("PDF generated successfully!")

# pdf_saver("Alibaba-NLP/gte-multilingual-reranker-base")
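
# Note (an assumption, not from the original script): FPDF's built-in "Arial" font only
# covers Latin-1, so Persian text in X would fail to render or raise an encoding error.
# With the fpdf2 package, a Unicode TTF can be registered instead; "Vazirmatn-Regular.ttf"
# below is a hypothetical local font file:
#
# pdf = FPDF()
# pdf.add_page()
# pdf.add_font("Vazirmatn", fname="Vazirmatn-Regular.ttf")
# pdf.set_font("Vazirmatn", size=12)
# pdf.multi_cell(0, 10, X)
# pdf.output("found_model.pdf")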




# To find models by date, for example from three months ago
# (more guidance below)
def find_by_date(month_later, year_later):

    today = date.today()
    date_year = today.strftime("%Y")
    date_month = today.strftime("%m")
    month = int(date_month)
    year = int(date_year)
    allModel = c.execute('''SELECT *
                            FROM PersianNlp''')
    n = 0
    for model in allModel:
        if int(model[19].split("/")[1]) >= month - month_later and int(model[19].split("/")[2]) >= year - year_later:
            X = ""
            for y in model:
                X += f"{list1[n]} : {y}\n-----------------------------------------------------------------\n"
                n += 1
            n = 0
            print(X)

# For example, to see the models from a year and six months ago until now,
# enter the month first and then the year, like this:
# find_by_date(6,1)

# Or, to see the models from the past three months:
# find_by_date(3,0)
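
# A hedged sketch (my addition, not from the original script): the month/year comparison
# above can miss models across a year boundary (e.g. three months back from February).
# Parsing date_added (column 19, format "%d/%m/%Y") into a real date makes the cutoff
# exact; find_since is a hypothetical helper:
#
# from datetime import datetime, timedelta
#
# def find_since(days_back):
#     cutoff = datetime.today() - timedelta(days=days_back)
#     for model in c.execute("SELECT * FROM PersianNlp"):
#         if datetime.strptime(model[19], "%d/%m/%Y") >= cutoff:
#             print(model[1], model[19])
#
# find_since(90)  # roughly the past three months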




def add_download_count():

    count = 1
    api = HfApi()
    allModel = c.execute('''SELECT *
                            FROM PersianNlp''')
    all_model_id = []
    for model in allModel:
        all_model_id.append([model[0], model[1]])

    for id_ in all_model_id:
        # try:
        print(count)
        count += 1
        # note: a random 12-digit key can (rarely) collide with an existing primary key
        id_12_digits = generate_random_id(length=12, chars=string.digits)
        model_details = api.model_info(repo_id=id_[1])
        c.execute("""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
                     VALUES (?, ?, ?, ?);""",
                  (int(id_12_digits), int(id_[0]), int(model_details.downloads), str(d1)))
        cnt.commit()
        # except:
        #     print("Error!!")

# add_download_count()
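
# A short usage sketch (my addition, not part of the original script): reading back the
# recorded download history for one model by joining the two tables on key_id; the
# model_id below is only an example value:
#
# for row in c.execute("""SELECT h.date, h.downloads
#                         FROM downloadCountHistory AS h
#                         JOIN PersianNlp AS p ON p.ID = h.key_id
#                         WHERE p.model_id = ?
#                         ORDER BY h.date""", ("HooshvareLab/bert-fa-base-uncased",)):
#     print(row)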