Compare commits
No commits in common. "792e7b6ba1c5274e4eb22e1ac921f648f6ada5d9" and "a61e9d68cae1d616b20de10116bbf25606423ab6" have entirely different histories.
792e7b6ba1
...
a61e9d68ca
@@ -1,32 +1,28 @@
from transformers import AutoTokenizer
import json

file = open('./data/models_ner_info.txt', 'r')
models = file.readlines()
file.close()
file = open('./data/models_info.json', 'r')
models = json.load(file)

# Strips the newline character
text = 'جمهوری موافقتنامه معاملات قانون بودجه اساسی قضائی بینالمللی تأسیس منطقهای لازمالاجراء دامپروری راهآهن کمیسیونهای جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'
results = []
for line in models:
    model_checkpoint = line.split(' -- ')[1].rstrip('\n')
    # model_checkpoint = line['model_name']
    model_checkpoint = line['model_name']
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        print(model_checkpoint)
        tokens = tokenizer.tokenize(text)
        print(tokens)
        print(f'len(tokens): {len(tokens)}')
        results.append({ 'model': model_checkpoint, 'len': len(tokens), 'tokens': tokens})

        results.append({ 'model': model_checkpoint, 'tokens': tokens})
        if len(tokens) == 2 :
            file.write( '{'+model_checkpoint + " : [ " + ','.join(tokens) + ' ] }\n' )
        #result = tokenizer(text)
        #print(result)
        #print(tokenizer.decode(result['input_ids']))
    except:
        error = "An exception occurred in tokenizer : " + model_checkpoint
        #file.write( error + '\n' )
        print(error)

with open('./data/models_tokenizer_info.json', 'w', encoding='utf-8') as file:
    jsondata = json.dumps(results, ensure_ascii=False, indent=2)
    file.write(jsondata)

print('finished!')

#tokenizer.save_pretrained(model_checkpoint+'-tokenizer')
file.close()
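A minimal sketch of consuming the ./data/models_tokenizer_info.json written above, ranking models by how finely they split the test sentence (fewer pieces usually means better Persian vocabulary coverage); falling back to len(tokens) for entries saved without a 'len' field is an assumption:

import json

with open('./data/models_tokenizer_info.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

# Ascending sort: models that keep words whole come first.
for entry in sorted(results, key=lambda r: r.get('len', len(r['tokens']))):
    print(f"{entry.get('len', len(entry['tokens'])):4d}  {entry['model']}")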
@@ -1,8 +0,0 @@
HooshvareLab/bert-base-parsbert-ner-uncased
HooshvareLab/bert-fa-base-uncased-ner-peyma
HooshvareLab/bert-base-parsbert-armanner-uncased
HooshvareLab/bert-fa-base-uncased-ner-arman
HooshvareLab/bert-base-parsbert-peymaner-uncased
nicolauduran45/affilgood-ner-multilingual-v2
Amirmerfan/bert-base-uncased-persian-ner-50k-base
AliFartout/Roberta-fa-en-ner
data/models_info.json (11498): File diff suppressed because it is too large.
data/models_info0.json (10498): File diff suppressed because it is too large.
@@ -1,258 +0,0 @@
[
  { "model_name": "HooshvareLab/bert-base-parsbert-ner-uncased", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 4060, "likes": 5, "language": "fa" },
  { "model_name": "jplu/tf-xlm-r-ner-40-lang", "task": "token-classification", "last_modified": "2022-10-6", "downloads": 874, "likes": 27, "language": "fa" },
  { "model_name": "nicolauduran45/affilgood-ner-multilingual-v2", "task": "token-classification", "last_modified": "2025-4-16", "downloads": 668, "likes": 0, "language": "fa" },
  { "model_name": "HooshvareLab/bert-fa-zwnj-base-ner", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 407, "likes": 4, "language": "fa" },
  { "model_name": "igorsterner/xlmr-multilingual-sentence-segmentation", "task": "token-classification", "last_modified": "2024-10-5", "downloads": 256, "likes": 4, "language": "fa" },
  { "model_name": "hamedkhaledi/persian-flair-ner", "task": "token-classification", "last_modified": "2022-4-3", "downloads": 230, "likes": 1, "language": "fa" },
  { "model_name": "HooshvareLab/bert-fa-base-uncased-ner-peyma", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 138, "likes": 8, "language": "fa" },
  { "model_name": "mradermacher/persian-question-generator-GGUF", "task": "Unknown", "last_modified": "2024-12-8", "downloads": 110, "likes": 0, "language": "fa" },
  { "model_name": "HooshvareLab/bert-base-parsbert-armanner-uncased", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 103, "likes": 3, "language": "fa" },
  { "model_name": "HooshvareLab/distilbert-fa-zwnj-base-ner", "task": "token-classification", "last_modified": "2021-3-21", "downloads": 101, "likes": 4, "language": "fa" },
  { "model_name": "HooshvareLab/roberta-fa-zwnj-base-ner", "task": "token-classification", "last_modified": "2021-5-20", "downloads": 92, "likes": 1, "language": "fa" },
  { "model_name": "HooshvareLab/bert-fa-base-uncased-ner-arman", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 71, "likes": 0, "language": "fa" },
  { "model_name": "HooshvareLab/albert-fa-zwnj-base-v2-ner", "task": "token-classification", "last_modified": "2021-3-21", "downloads": 58, "likes": 0, "language": "fa" },
  { "model_name": "artificial-nerds/pmnet", "task": "sentence-similarity", "last_modified": "2024-10-29", "downloads": 33, "likes": 0, "language": "fa" },
  { "model_name": "HooshvareLab/bert-base-parsbert-peymaner-uncased", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 28, "likes": 0, "language": "fa" },
  { "model_name": "Erfan/mT5-base_Farsi_Title_Generator", "task": "text-generation", "last_modified": "2022-1-30", "downloads": 24, "likes": 2, "language": "fa" },
  { "model_name": "myrkur/persian-question-generator", "task": "summarization", "last_modified": "2024-12-10", "downloads": 24, "likes": 2, "language": "fa" },
  { "model_name": "m3hrdadfi/albert-fa-base-v2-ner-arman", "task": "token-classification", "last_modified": "2020-12-26", "downloads": 21, "likes": 3, "language": "fa" },
  { "model_name": "hezarai/bert-fa-ner-arman", "task": "token-classification", "last_modified": "2024-11-14", "downloads": 17, "likes": 0, "language": "fa" },
  { "model_name": "m3hrdadfi/albert-fa-base-v2-ner-peyma", "task": "token-classification", "last_modified": "2020-12-26", "downloads": 16, "likes": 1, "language": "fa" },
  { "model_name": "Amirmerfan/bert-base-uncased-persian-ner-50k-base", "task": "token-classification", "last_modified": "2025-3-18", "downloads": 16, "likes": 0, "language": "fa" },
  { "model_name": "tum-nlp/neural-news-generator-bloomz-7b1-fa", "task": "text-generation", "last_modified": "2024-8-15", "downloads": 14, "likes": 0, "language": "fa" },
  { "model_name": "dadashzadeh/test-trainer-persian", "task": "text-classification", "last_modified": "2023-11-13", "downloads": 11, "likes": 0, "language": "fa" },
  { "model_name": "tum-nlp/neural-news-generator-llama-7b-fa", "task": "text-generation", "last_modified": "2024-8-15", "downloads": 11, "likes": 0, "language": "fa" },
  { "model_name": "AliFartout/Roberta-fa-en-ner", "task": "token-classification", "last_modified": "2023-9-1", "downloads": 9, "likes": 0, "language": "fa" },
  { "model_name": "myrkur/persian-title-generator", "task": "summarization", "last_modified": "2024-6-13", "downloads": 9, "likes": 2, "language": "fa" },
  { "model_name": "NLPclass/mt5-title-generation", "task": "text-generation", "last_modified": "2024-7-24", "downloads": 9, "likes": 0, "language": "fa" },
  { "model_name": "SinaRp/Question_generator_persian", "task": "text-generation", "last_modified": "2024-11-8", "downloads": 9, "likes": 1, "language": "fa" },
  { "model_name": "mansoorhamidzadeh/mt5-title-generation", "task": "text-generation", "last_modified": "2024-7-24", "downloads": 8, "likes": 0, "language": "fa" },
  { "model_name": "dz5035/bicleaner-ai-full-large-de-xx", "task": "Unknown", "last_modified": "2024-6-3", "downloads": 4, "likes": 0, "language": "fa" },
  { "model_name": "m0javad/conversational_Persian_paraphrase_generation", "task": "Unknown", "last_modified": "2024-2-17", "downloads": 0, "likes": 0, "language": "fa" },
  { "model_name": "PranavKeshav/event-planner-gemma-4bit", "task": "text-generation", "last_modified": "2025-5-17", "downloads": 0, "likes": 0, "language": "fa" }
]
@@ -1,17 +0,0 @@
4060 -- HooshvareLab/bert-base-parsbert-ner-uncased
874 -- jplu/tf-xlm-r-ner-40-lang
668 -- nicolauduran45/affilgood-ner-multilingual-v2
407 -- HooshvareLab/bert-fa-zwnj-base-ner
230 -- hamedkhaledi/persian-flair-ner
138 -- HooshvareLab/bert-fa-base-uncased-ner-peyma
103 -- HooshvareLab/bert-base-parsbert-armanner-uncased
101 -- HooshvareLab/distilbert-fa-zwnj-base-ner
92 -- HooshvareLab/roberta-fa-zwnj-base-ner
71 -- HooshvareLab/bert-fa-base-uncased-ner-arman
58 -- HooshvareLab/albert-fa-zwnj-base-v2-ner
28 -- HooshvareLab/bert-base-parsbert-peymaner-uncased
21 -- m3hrdadfi/albert-fa-base-v2-ner-arman
17 -- hezarai/bert-fa-ner-arman
16 -- m3hrdadfi/albert-fa-base-v2-ner-peyma
16 -- Amirmerfan/bert-base-uncased-persian-ner-50k-base
9 -- AliFartout/Roberta-fa-en-ner
@@ -1,465 +0,0 @@
[
  { "model": "HooshvareLab/bert-base-parsbert-ner-uncased", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
  { "model": "nicolauduran45/affilgood-ner-multilingual-v2", "len": 39, "tokens": ["▁جمهوری", "▁موافقت", "▁نامه", "▁معاملات", "▁قانون", "▁بودجه", "▁اساسی", "▁قضا", "ئی", "▁بین", "▁المللی", "▁تأسیس", "▁منطقه", "▁ای", "▁لازم", "▁الاج", "راء", "▁دام", "پر", "وری", "▁راه", "▁آهن", "▁کمیسیون", "▁های", "▁جدید", "الا", "حد", "اث", "▁مسئول", "▁فر", "آور", "ده", "▁زائد", "▁اس", "ق", "اط", "▁پنج", "سال", "ه"] },
  { "model": "HooshvareLab/bert-fa-zwnj-base-ner", "len": 37, "tokens": ["جمهوری", "موافقت", "[ZWNJ]", "نامه", "معاملات", "قانون", "بودجه", "اساسی", "[UNK]", "بین", "[ZWNJ]", "المللی", "[UNK]", "منطقه", "[ZWNJ]", "ای", "لازم", "[ZWNJ]", "الاجرا", "##ء", "دامپروری", "راه", "[ZWNJ]", "آ", "##هن", "کمیسیون", "[ZWNJ]", "های", "جدیدا", "##لاح", "##داث", "[UNK]", "[UNK]", "[UNK]", "اسقاط", "پنج", "##ساله"] },
  { "model": "HooshvareLab/bert-fa-base-uncased-ner-peyma", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
  { "model": "HooshvareLab/bert-base-parsbert-armanner-uncased", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
  { "model": "HooshvareLab/distilbert-fa-zwnj-base-ner", "len": 37, "tokens": ["جمهوری", "موافقت", "[ZWNJ]", "نامه", "معاملات", "قانون", "بودجه", "اساسی", "[UNK]", "بین", "[ZWNJ]", "المللی", "[UNK]", "منطقه", "[ZWNJ]", "ای", "لازم", "[ZWNJ]", "الاجرا", "##ء", "دامپروری", "راه", "[ZWNJ]", "آ", "##هن", "کمیسیون", "[ZWNJ]", "های", "جدیدا", "##لاح", "##داث", "[UNK]", "[UNK]", "[UNK]", "اسقاط", "پنج", "##ساله"] },
  { "model": "HooshvareLab/roberta-fa-zwnj-base-ner", "len": 37, "tokens": ["ĠجÙħÙĩÙĪØ±ÛĮ", "ĠÙħÙĪØ§ÙģÙĤت", "âĢĮ", "ÙĨاÙħÙĩ", "ĠÙħعاÙħÙĦات", "ĠÙĤاÙĨÙĪÙĨ", "ĠبÙĪØ¯Ø¬Ùĩ", "ĠاساسÛĮ", "ĠÙĤضائÛĮ", "ĠبÛĮÙĨ", "âĢĮ", "اÙĦÙħÙĦÙĦÛĮ", "ĠتأسÛĮس", "ĠÙħÙĨØ·ÙĤÙĩ", "âĢĮ", "اÛĮ", "ĠÙĦازÙħ", "âĢĮ", "اÙĦاج", "راء", "ĠداÙħپرÙĪØ±ÛĮ", "ĠراÙĩ", "âĢĮ", "Ø¢ÙĩÙĨ", "ĠÚ©ÙħÛĮسÛĮÙĪÙĨ", "âĢĮ", "ÙĩاÛĮ", "ĠجدÛĮد", "اÙĦ", "اØŃ", "داث", "ĠÙħسئÙĪÙĦ", "ĠÙģØ±Ø¢ÙĪØ±Ø¯Ùĩ", "Ġزائد", "ĠاسÙĤاط", "ĠÙ¾ÙĨج", "ساÙĦÙĩ"] },
  { "model": "HooshvareLab/bert-fa-base-uncased-ner-arman", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
  { "model": "HooshvareLab/albert-fa-zwnj-base-v2-ner", "len": 46, "tokens": ["▁جمهوری", "▁موافقت", "[ZWNJ]", "نامه", "▁معاملات", "▁قانون", "▁بودجه", "▁اساسی", "▁قضا", "ي", "ی", "▁بین", "[ZWNJ]", "الم", "لل", "ی", "▁تاسیس", "▁منطقه", "[ZWNJ]", "ای", "▁لازم", "[ZWNJ]", "ال", "اجرا", "ء", "▁دامپروری", "▁راه", "[ZWNJ]", "اهن", "▁کمیسیون", "[ZWNJ]", "های", "▁جدید", "الا", "حد", "اث", "▁مس", "ي", "ول", "▁فراورده", "▁زا", "ي", "د", "▁اسقاط", "▁پنج", "ساله"] },
  { "model": "HooshvareLab/bert-base-parsbert-peymaner-uncased", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
  { "model": "Amirmerfan/bert-base-uncased-persian-ner-50k-base", "len": 56, "tokens": ["جمهوری", "م", "##وا", "##فق", "##تن", "##ام", "##ه", "مع", "##امل", "##ات", "قانون", "بود", "##جه", "اساسی", "ق", "##ضا", "##يی", "بینالمللی", "تاسیس", "منطقه", "##ای", "لازم", "##ال", "##اج", "##راء", "دا", "##م", "##پر", "##وری", "راه", "##اه", "##ن", "کمی", "##سی", "##ون", "##های", "جدید", "##ال", "##اح", "##دا", "##ث", "م", "##سي", "##ول", "ف", "##را", "##ورد", "##ه", "ز", "##ايد", "اس", "##قا", "##ط", "پنج", "##سال", "##ه"] },
  { "model": "AliFartout/Roberta-fa-en-ner", "len": 39, "tokens": ["▁جمهوری", "▁موافقت", "▁نامه", "▁معاملات", "▁قانون", "▁بودجه", "▁اساسی", "▁قضا", "ئی", "▁بین", "▁المللی", "▁تأسیس", "▁منطقه", "▁ای", "▁لازم", "▁الاج", "راء", "▁دام", "پر", "وری", "▁راه", "▁آهن", "▁کمیسیون", "▁های", "▁جدید", "الا", "حد", "اث", "▁مسئول", "▁فر", "آور", "ده", "▁زائد", "▁اس", "ق", "اط", "▁پنج", "سال", "ه"] }
]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,11 +1,7 @@
from huggingface_hub import HfApi
from huggingface_hub import ModelCard
from huggingface_hub import ModelFilter
from huggingface_hub import HfApi, ModelFilter
import json


api = HfApi()
# card = ModelCard.load('model-name')

# Persian language tags
languages = ['pes', 'fas', 'fa']  # Language codes for Persian
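The import churn above tracks ModelFilter's deprecation and removal in recent huggingface_hub releases; filtering now goes through keyword arguments on list_models. A minimal sketch of the replacement style (the exact version threshold is an assumption):

from huggingface_hub import HfApi

api = HfApi()
# Keyword arguments replace the old ModelFilter(language=..., task=...) object.
for m in api.list_models(language="fa", task="token-classification",
                         sort="downloads", direction=-1, limit=5):
    print(m.id, m.downloads)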
hugingFace_persianModelList.py (173 lines, Normal file)
@@ -0,0 +1,173 @@
from huggingface_hub import HfApi
import json
import datetime



# Common NLP tasks
nlp_task_list = [
    "text-classification",
    "token-classification",
    "question-answering",
    "summarization",
    "translation",
    "text-generation",
    "fill-mask",
    "zero-shot-classification",
    "feature-extraction",
    "sentence-similarity",
    "text2text-generation",
    "conversational"
]

def persian_model_finder(nlp_tasks, json_file_name):

    api = HfApi()
    all_persian_nlp_models_data = []
    seen_model_ids = set()  # to keep duplicate models from being added

    print("Searching for and extracting information on Persian NLP models...")

    # Filter and iterate over the models
    # For each NLP task, search for Persian models.
    # A limit of 500 models per task was considered to avoid excessive downloads.
    # If you want to extract all models, pagination may be needed.
    for task in nlp_tasks:
        print(f"  Searching for task: {task} (language: Persian)...")
        try:
            models_for_task = api.list_models(
                language="fa",
                task=task,
                sort="downloads",
                direction=-1,  # descending (most downloads to fewest)
                limit=None  # you can change this number
            )

            for model_info in models_for_task:
                if model_info.id not in seen_model_ids:
                    try:  # if extra information can be read from the model card:
                        model_ = api.model_info(model_info.id)  # fetch the model record
                        card_data_dict = model_.card_data.to_dict()  # build a dict from the model card, which holds the model's metadata
                        model_data = {
                            "model_id": model_info.id,
                            "url": f"https://huggingface.co/{model_info.id}",
                            "downloads": model_info.downloads,
                            "private": model_info.private,
                            "author": model_info.author,
                            "tags": model_info.tags,  # includes languages, tasks, libraries, ...
                            "tag_dataset": "-",
                            "tag_base_model": "-",
                            "tag_license": "-",
                            "tag_region": "-",
                            "pipeline_tag": model_info.pipeline_tag,  # the model's main task as assigned by the Hub
                            "Likes": model_info.likes,
                            # the four fields below are read from the model-card dict
                            "languages": card_data_dict.get('language', 'N/A'),  # supported languages
                            "library": card_data_dict.get('library', 'N/A'),  # libraries used
                            "datasets": card_data_dict.get('datasets', 'N/A'),  # datasets used
                            "license": card_data_dict.get('license', 'N/A'),
                            "just_persian": False

                        }

                        if model_data["library"] == 'N/A':  # in some cases the library is stored under 'library_name' in the card dict
                            model_data["library"] = card_data_dict.get('library_name', 'N/A')
                        # the condition below marks the entries that are Persian-only
                        if len(model_data["languages"]) == 2 and "multilingual" in model_data["languages"] or\
                           len(model_data["languages"]) == 2 and "persian" in model_data["languages"] or\
                           len(model_data["languages"]) == 2 and "farsi" in model_data["languages"] or\
                           len(model_data["languages"]) == 2 and "fas" in model_data["languages"] or\
                           len(model_data["languages"]) == 2 and model_data["languages"]=="fa" or\
                           model_data["languages"] == "persian" or\
                           model_data["languages"] == "farsi" or\
                           model_data["languages"] == "fas" or\
                           model_data["languages"] == "pes" or\
                           len(model_data["languages"]) == 1 :
                            model_data["just_persian"] = True

                        for value in model_data["tags"]:

                            if "dataset:" in value :
                                if type(model_data["tag_dataset"]) == type(""):
                                    model_data["tag_dataset"] = list(model_data["tag_dataset"])
                                    model_data["tag_dataset"].pop(0)
                                model_data["tag_dataset"].append(f"{str(value).replace("dataset:","")}")

                            if "base_model:" in value :
                                if type(model_data["tag_base_model"]) == type(""):
                                    model_data["tag_base_model"] = list(model_data["tag_base_model"])
                                    model_data["tag_base_model"].pop(0)
                                model_data["tag_base_model"].append(f"{str(value).replace("base_model:","")}")

                            if "region:" in value :
                                model_data["tag_region"]=f"{str(value).replace("region:","")}"

                            if "license:" in value :
                                model_data["tag_license"]=f"{str(value).replace("license:","")}"


                        all_persian_nlp_models_data.append(model_data)
                        seen_model_ids.add(model_info.id)

                    except :  # if reading the model card failed:
                        model_data = {
                            "model_id": model_info.id,
                            "url": f"https://huggingface.co/{model_info.id}",
                            "downloads": model_info.downloads,
                            "private": model_info.private,
                            "author": model_info.author,
                            "tags": model_info.tags,  # includes languages, tasks, libraries, ...
                            "pipeline_tag": model_info.pipeline_tag,  # the model's main task as assigned by the Hub
                            "Likes": model_info.likes,
                            "library": model_info.library_name,
                            # add the license if it is available
                            "license": model_info.card_data.license if model_info.card_data and model_info.card_data.license else "N/A"
                        }

                        for value in model_data["tags"]:

                            if "dataset:" in value :
                                if type(model_data["tag_dataset"]) == type(""):
                                    model_data["tag_dataset"] = list(model_data["tag_dataset"])
                                    model_data["tag_dataset"].pop(0)
                                model_data["tag_dataset"].append(f"{str(value).replace("dataset:","")}")

                            if "base_model:" in value :
                                if type(model_data["tag_base_model"]) == type(""):
                                    model_data["tag_base_model"] = list(model_data["tag_base_model"])
                                    model_data["tag_base_model"].pop(0)
                                model_data["tag_base_model"].append(f"{str(value).replace("base_model:","")}")

                            if "region:" in value :
                                model_data["tag_region"]=f"{str(value).replace("region:","")}"

                            if "license:" in value :
                                model_data["tag_license"]=f"{str(value).replace("license:","")}"

                        all_persian_nlp_models_data.append(model_data)
                        seen_model_ids.add(model_info.id)

            print(f"  Models found for task '{task}': {len(models_for_task)}")

        except Exception as e:
            print(f"  Error while searching for task {task}: {e}")

    # Final sort of the models by download count (whole list)
    # This step makes sure that even if the models were collected from different tasks,
    # they end up sorted by downloads.
    all_persian_nlp_models_data_sorted = sorted(all_persian_nlp_models_data, key=lambda x: x['downloads'], reverse=True)

    print(f"\nTotal number of unique Persian NLP models found: {len(all_persian_nlp_models_data_sorted)}")

    # Save the information to a JSON file
    output_json_file = f"{json_file_name}"
    with open(output_json_file, "w", encoding="utf-8") as f:
        json.dump(all_persian_nlp_models_data_sorted, f, ensure_ascii=False, indent=4)

    print(f"Information on {len(all_persian_nlp_models_data_sorted)} models was saved to '{output_json_file}'.")




persian_model_finder(nlp_task_list, "persian_nlp_models_info.json")
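For downstream use, a minimal sketch (assuming the persian_nlp_models_info.json written above) of listing the ten most-downloaded Persian-only models:

import json

with open("persian_nlp_models_info.json", encoding="utf-8") as f:
    models = json.load(f)

# The file is already sorted by downloads, so filtering preserves the ranking.
for m in [m for m in models if m.get("just_persian")][:10]:
    print(f'{m["downloads"]:>9}  {m["model_id"]}')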
@@ -1,3 +0,0 @@
facebook/xlm-v-base
sharif-dal/dal-bert
lifeweb-ai/shiraz
@@ -1,30 +0,0 @@
import json

with open('./data/models_info.json', 'r', encoding='utf-8') as file:
    models = json.load(file)

ner_models = []
for mdl in models:
    if mdl['model_name'].__contains__('ner') or mdl['model_name'].__contains__('Ner') or mdl['model_name'].__contains__('Ner'):
        ner_models.append(mdl)
        # [mdl['model_name']]= {
        #     "task": mdl['task'],
        #     "last_modified": mdl['last_modified'],
        #     "downloads": int(mdl['downloads']),
        #     "likes": mdl['likes'],
        #     "language": mdl['language']
        # }
sorted_ner_models = sorted(ner_models, key=lambda x: x["downloads"], reverse=True)

with open('./data/models_ner_info.json', 'w', encoding='utf-8') as file:
    jsondata = json.dumps(sorted_ner_models, ensure_ascii=False, indent=2)
    file.write(jsondata)

ner_models_text = ''
for mdl in sorted_ner_models:
    ner_models_text += ''.join(f"{mdl['downloads']} -- {mdl['model_name']}\n")

with open('./data/models_ner_info.txt', 'w', encoding='utf-8') as file:
    file.write(ner_models_text)

print(len(ner_models))
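Note the name filter in the deleted script tests 'ner' and 'Ner' (the third clause repeats 'Ner', so all-caps 'NER' slips through); a single case-insensitive test covers every variant, e.g.:

ner_models = [mdl for mdl in models if 'ner' in mdl['model_name'].lower()]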
@@ -1,83 +1,58 @@
# In the name of God


from apscheduler.schedulers.background import BackgroundScheduler
from transformers import AutoTokenizer
from bidi.algorithm import get_display
from huggingface_hub import HfApi
import matplotlib.pyplot as plt
from fastapi import FastAPI
from datetime import date
import arabic_reshaper
from fpdf import FPDF
import threading
import random
import logging
import sqlite3
import string
import os



# Configure logging so the scheduler output is visible
logging.basicConfig(level=logging.INFO)
logging.getLogger('apscheduler').setLevel(logging.INFO)


first_id = 6600
text = 'جمهوری موافقتنامه معاملات قانون بودجه اساسی قضائی بینالمللی تأسیس منطقهای لازمالاجراء دامپروری راهآهن کمیسیونهای جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'
list1 = ["ID","model_id","url","downloads","private","author","tags","tag_dataset",\
"tag_base_model","tag_license","tag_region","pipeline_tag","Likes","languages",\
"library","datasets","license","just_persian","deleted","date_added","last_modified"]
cnt = sqlite3.connect(".\\db\\persian_nlp_model.db", check_same_thread=False)
"library","datasets","license","just_persian","deleted","date_added"]
cnt = sqlite3.connect("persian_nlp_model.db")
c = cnt.cursor()
today = date.today()
d1 = today.strftime("%d-%m-%Y")


create_tables=True
if create_tables == True :
    try:

        # Use this code only the first time, when the table is to be created
        c.execute("""CREATE TABLE PersianNlp(
        ID INT PRIMARY KEY ,
        model_id TEXT ,
        url TEXT ,
        downloads INT,
        private TEXT,
        author TEXT,
        tags TEXT,
        tag_dataset TEXT,
        tag_base_model TEXT,
        tag_license TEXT,
        tag_region TEXT,
        pipeline_tag TEXT,
        Likes INT,
        languages TEXT,
        library TEXT,
        datasets TEXT,
        license TEXT,
        just_persian TEXT,
        deleted TEXT,
        date_added TEXT,
        last_modified TEXT
        );""")
        # c.execute("""CREATE TABLE PersianNlp(
        # ID INT PRIMARY KEY ,
        # model_id TEXT ,
        # url TEXT ,
        # downloads INT,
        # private TEXT,
        # author TEXT,
        # tags TEXT,
        # tag_dataset TEXT,
        # tag_base_model TEXT,
        # tag_license TEXT,
        # tag_region TEXT,
        # pipeline_tag TEXT,
        # Likes INT,
        # languages TEXT,
        # library TEXT,
        # datasets TEXT,
        # license TEXT,
        # just_persian TEXT,
        # deleted TEXT,
        # date_added TEXT
        # );""")


        # Use this code to create the download-count table
        c.execute("""CREATE TABLE downloadCountHistory(
        ID INT PRIMARY KEY ,
        key_id INT ,
        downloads INT,
        date TEXT
        );""")
        create_tables = False

    except Exception as e:

        print("--- An unexpected error occurred while creating the table ---")
        print(f"Error text: {e}")
        # c.execute("""CREATE TABLE downloadCountHistory(
        # ID INT PRIMARY KEY ,
        # key_id INT ,
        # downloads INT,
        # date TEXT
        # );""")
@@ -114,357 +89,13 @@ def generate_random_id(length=10, chars=string.ascii_letters + string.digits):


# 3. Helper function for preparing Persian text
def process_text_for_fpdf(text):
    # Step 1: reshape the letters (joining and correct glyph forms)
    reshaped_text = arabic_reshaper.reshape(text)
    # Step 2: reorder for right-to-left display
    bidi_text = get_display(reshaped_text)
    return bidi_text




def add_download_count():

    count = 1
    api = HfApi()
    allModel = c.execute(f'''SELECT * FROM "PersianNlp"
WHERE deleted != 'True' ;''')
    all_model_id = []
    for model in allModel:
        all_model_id.append([model[0],model[1]])

    for id_ in all_model_id:
        # try:
        print(count)
        count+=1
        id_12_digits = generate_random_id(length=12, chars=string.digits)
        model_details = api.model_info(repo_id=id_[1])
        c.execute(f"""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
VALUES ({id_12_digits},"{int(id_[0])}","{int(model_details.downloads)}","{str(d1)}");""")
        # c.execute(f"""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
        # VALUES ({id_12_digits},"{int(id_[0])}","{int(model_details.downloads)+1}","22-08-2025");""")
        cnt.commit()
        # except:
        #     print("Error!!")

# add_download_count()
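Since the INSERT above splices values into the SQL with an f-string, here is a minimal sketch of the parameterized equivalent (same downloadCountHistory schema and variable names as in add_download_count above), which avoids quoting and injection pitfalls:

c.execute(
    "INSERT INTO downloadCountHistory (ID, key_id, downloads, date) VALUES (?, ?, ?, ?)",
    (int(id_12_digits), int(id_[0]), int(model_details.downloads), str(d1)),
)
cnt.commit()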



def MultiModelInfo(limit_number=10):

    today = date.today()
    date_year = today.strftime("%Y")
    date_month = today.strftime("%m")
    month = int(date_month)
    year = int(date_year)
    Models_added_this_month=[]
    Models_deleted=[]
    all_id_download = []
    all_download = []
    growth_slope_list_info = []
    growth_slope_list = []
    model_info = c.execute(f'''SELECT *
FROM PersianNlp''')

    for model in model_info:

        if int(model[19].split("/")[1]) == month and int(model[19].split("/")[2]) == year :
            Models_added_this_month.append(model[1])
        if str(model[18]) == "True":
            Models_deleted.append(model[1])

        all_id_download.append([model[0],model[1],model[3],model[11],model[20]])


    listX=[]
    for model in all_id_download:

        downloadCountHistory = c.execute(f'''SELECT *
FROM downloadCountHistory
WHERE key_id = {model[0]}''')

        for models in downloadCountHistory :
            if int(models[3].split("-")[1]) == month and int(models[3].split("-")[2]) == year :
                model[2]=models[2]
                listX.append(model)
    all_id_download = listX


    for model in all_id_download:
        all_download.append(model[2])

    all_download.sort(reverse=True)
    maximum_download_list = all_download[0:limit_number]
    maximum_download_info_list = []
    n=0
    for DCount in maximum_download_list:

        for model in all_id_download:
            if DCount == model[2]:
                if n < limit_number :
                    maximum_download_info_list.append(model)
                    n+=1


    # Find the steepest download growth over the last several months:

    for model in all_id_download:
        growth_slope = []
        DHList =c.execute(f'''SELECT *
FROM "downloadCountHistory"
WHERE key_id = {model[0]}''')

        for data in DHList:
            growth_slope.append(data[2])

        growth_slopee , lenM = find_growth_slope(growth_slope)  # compute each model's growth percentage
        growth_slope_list.append(growth_slopee)
        growth_slope_list_info.append([model[1],growth_slopee,lenM,model[2],model[3]])


    growth_slope_list.sort(reverse=True)
    maximum_growth_slope_list = growth_slope_list[0:limit_number]
    maximum_growth_slope_info_list = []
    n=0
    for DCount in maximum_growth_slope_list:

        for model in growth_slope_list_info:
            if DCount == model[1]:
                if n < limit_number :
                    maximum_growth_slope_info_list.append(model)
                    n+=1

    # End of the slope computation


    model_id_list = []
    model_download_count_list = []
    for info in maximum_download_info_list:

        model_download_count_list.append(info[2])
        model_id_list.append(str(info[1]))

    listA = []
    for x in model_download_count_list:
        listA.append(int(x)/1000000)
    model_download_count_list = listA

    plt.plot(model_id_list,model_download_count_list,marker='o', linestyle='-')
    plt.xticks(rotation=30, ha='right', fontsize=10)
    plt.xlabel("Model Name", color='blue')
    plt.ylabel("Download Count (million)" ,color='red')
    plt.tight_layout()
    plt.savefig(f'Top_{limit_number}_download_rate.png', dpi=300)
    pdf = FPDF()
    pdf.add_page()
    pdf.add_font('B Nazanin', '', '.\\fonts\\B Nazanin.ttf', uni=True)
    pdf.set_font("Arial", size=16)
    # Add the text
    pdf.multi_cell(0, 10, f"Top {limit_number} Model Information : \n ----------------------------------------------------------------")
    pdf.set_font("Arial", size=12)

    pdf.ln()
    pdf.multi_cell(0, 5, f" Download Rate Chart :")
    pdf.ln()

    # Add the image
    pdf.image(f'Top_{limit_number}_download_rate.png', x=10, y=pdf.get_y() + 5, w=70)
    pdf.ln(20)  # one blank line after the image
    # Add the table
    pdf.ln(50)
    pdf.ln()
    pdf.multi_cell(0, 5, f" Download Rate Table :")
    pdf.ln()
    # Table header
    for header in ['Count', 'Model_name','Download-rate','Task','Last_modified']:
        if header == 'Model_name':
            pdf.cell(80, 10, header, 1, 0, 'C')
        else:
            pdf.cell(40, 10, header, 1, 0, 'C')
    pdf.ln()

    # Data rows
    x=1
    for row in maximum_download_info_list:
        n=0
        row[0] = x
        x+=1
        for item in row:
            if n == 1 :
                pdf.set_font("Arial", size=6)
                pdf.cell(80, 10, f"https://huggingface.co/{str(item)}", 1, 0, 'C')
                n+=1
            else:
                pdf.set_font("Arial", size=10)
                pdf.cell(40, 10, str(item), 1, 0, 'C')
                n+=1
        pdf.ln()




    pdf.add_page()
    model_name_list = []
    model_growth = []
    for info in maximum_growth_slope_info_list:

        model_name_list.append(info[0])
        model_growth.append(round(info[1], 2) )

    pdf.ln()
    pdf.multi_cell(0, 5, f" Download Growth Chart :")
    pdf.ln()
    plt.figure(figsize=(8, 6))  # overall figure size (width and height in inches)
    plt.bar(model_name_list,model_growth,color='lightgreen', width=0.4)
    plt.xticks(rotation=30, ha='right', fontsize=10)
    plt.xlabel("Model Name", color='blue')
    plt.ylabel("Model Growth (%)" ,color='red')
    plt.tight_layout()
    plt.savefig(f'Top_{limit_number}_growth_rate.png', dpi=300)
    pdf.image(f'Top_{limit_number}_growth_rate.png', x=10, y=pdf.get_y() + 5, w=70)
    pdf.ln(80)  # one blank line after the image

    pdf.ln()
    pdf.multi_cell(0, 5, f" Download Growth Table :")
    pdf.ln()
    # Table header
    for header in [ 'Model_name','Growth-rate','Length-month','Download-rate','Task']:
        if header == 'Model_name':
            pdf.cell(80, 10, header, 1, 0, 'C')
        else:
            pdf.cell(30, 10, header, 1, 0, 'C')
    pdf.ln()

    # Data rows
    x=1
    for row in maximum_growth_slope_info_list:
        n=0
        x+=1
        for item in row:
            if n == 0 :
                pdf.set_font("Arial", size=6)
                pdf.cell(80, 10, f"https://huggingface.co/{str(item)}", 1, 0, 'C')
                n+=1
            else:
                pdf.set_font("Arial", size=6)
                pdf.cell(30, 10, str(item), 1, 0, 'C')
                n+=1
        pdf.ln()




    pdf.add_page()
    pdf.ln()
    pdf.set_font("Arial", size=14)
    pdf.multi_cell(0, 5, f"Models added this month :")
    pdf.set_font("Arial", size=6)
    pdf.ln()

    txt=''
    n=1
    for model_name in Models_added_this_month:
        txt +=f"{n} --> {model_name}\n"
        n+=1
    pdf.multi_cell(0, 5, f"{txt}")

    pdf.ln()
    pdf.set_font("Arial", size=14)
    pdf.multi_cell(0, 5, f"Models deleted :")
    pdf.set_font("Arial", size=6)
    pdf.ln()
    txt=''
    n=1
    for model_name in Models_deleted:
        txt +=f"{n} --> {model_name}\n"
        n+=1
    pdf.multi_cell(0, 5, f"{txt}")

    pdf.output(f"MultiModelInfo_{today}.pdf")
    os.remove(f'Top_{limit_number}_download_rate.png')
    os.remove(f'Top_{limit_number}_growth_rate.png')

# MultiModelInfo(5)




def find_growth_slope(number_list):

    try:
        n = -1
        index = 0
        # if n == -1:
        last_num = number_list[-1]
        first_num = number_list[-1]

        if number_list[-1] <= number_list[-2]:
            for num in range(len(number_list)-1):
                if number_list[n-num] <= number_list[n-num-1]:
                    last_num = number_list[n-num-1]
                    index = n-num-1
                else:
                    break


        if number_list[-1] >= number_list[-2] and last_num >= first_num:
            for num in range(len(number_list)-1):
                if number_list[n-num] >= number_list[n-num-1]:
                    last_num = number_list[n-num-1]
                    index = n-num-1
                else:
                    break
        percentage_growth = ((first_num - last_num) / last_num) * 100

    except:
        percentage_growth = 0


    return percentage_growth ,abs(index)

# find_growth_slope([15,16,17,16,10,8])
# x = find_growth_slope([1,2,3,4,5,6,7,8,10])
# c = find_growth_slope([8,8,9,10,8,8,7])
# v = find_growth_slope([8,8,7,5,4,6,7,7])
# print("finish!")
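To make the walk-back concrete, tracing the first commented call above (my trace of the code, worth re-checking): for [15, 16, 17, 16, 10, 8] the series ends in a decline, so the first loop walks back from 8 through 10 and 16 to the local peak 17 and stops, giving ((8 - 17) / 17) * 100, about -52.94 percent, over abs(-4) = 4 steps:

growth, months = find_growth_slope([15, 16, 17, 16, 10, 8])
print(round(growth, 2), months)  # expected: -52.94 4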



def persian_model_finder(nlp_tasks, idx):
    today = date.today()
    download_date = today.strftime("%d/%m/%Y")
    idX = idx  # pass in the last ID present in the database so new rows do not clash with earlier IDs
    api = HfApi()
    # all_persian_nlp_models_data = []
    all_persian_nlp_models_data = []
    seen_model_ids = set()  # to keep duplicate models from being added
    new_seen_ids = set()


    for task in nlp_tasks:

        models_for_task = api.list_models(
            language="fa",
            task=task,
            sort="downloads",
            direction=-1,  # descending (most downloads to fewest)
            limit=None  # you can change this number
        )

        for model_info in models_for_task:
            new_seen_ids.add(model_info.id)



    print("Searching for and extracting information on Persian NLP models...")
@@ -476,12 +107,8 @@ def persian_model_finder(nlp_tasks,idx):
    try:
        allModel = c.execute(f'''SELECT *
FROM PersianNlp''')

        idX+=1

        for model in allModel:
            seen_model_ids.add(model[1])
            idX+=1
    except:
        print("database not found!")
@@ -502,7 +129,6 @@ FROM PersianNlp''')
                    idX+=1
                    # try :  # if extra information can be read from the model card:
                    model_ = api.model_info(model_info.id)  # fetch the model record
                    lastModified = api.model_info(repo_id=model_info.id).last_modified
                    card_data_dict = model_.card_data.to_dict()  # build a dict from the model card, which holds the model's metadata
                    model_data = {
                        "model_id": model_info.id,
@@ -524,8 +150,7 @@ FROM PersianNlp''')
                        "license":card_data_dict.get('license', 'N/A'),
                        "just_persian" : "False",
                        "deleted" : "False",
                        "date_added" : f"{download_date}",
                        "last_modified" : f"{str(lastModified.strftime("%d-%m-%Y"))}"
                        "date_added" : f"{download_date}"

                    }
@@ -566,29 +191,16 @@ FROM PersianNlp''')


                        # all_persian_nlp_models_data.append(model_data)
                        c.execute(f"""INSERT INTO PersianNlp (ID,model_id,url,downloads,private,author,tags,tag_dataset,tag_base_model,tag_license,tag_region,pipeline_tag,Likes,languages,library,datasets,license,just_persian,deleted,date_added,last_modified)
VALUES ({idX},"{model_data["model_id"]}","{model_data["url"]}",{model_data["downloads"]},"{model_data["private"]}","{model_data["author"]}","{model_data["tags"]}","{model_data["tag_dataset"]}","{model_data["tag_base_model"]}","{model_data["tag_license"]}","{model_data["tag_region"]}","{model_data["pipeline_tag"]}",{model_data["Likes"]},"{model_data["languages"]}","{model_data["library"]}","{model_data["datasets"]}","{model_data["license"]}","{model_data["just_persian"]}","{model_data["deleted"]}","{model_data["date_added"]}","{model_data['last_modified']}");""")
                        c.execute(f"""INSERT INTO PersianNlp (ID,model_id,url,downloads,private,author,tags,tag_dataset,tag_base_model,tag_license,tag_region,pipeline_tag,Likes,languages,library,datasets,license,just_persian,deleted,date_added)
VALUES ({idX},"{model_data["model_id"]}","{model_data["url"]}",{model_data["downloads"]},"{model_data["private"]}","{model_data["author"]}","{model_data["tags"]}","{model_data["tag_dataset"]}","{model_data["tag_base_model"]}","{model_data["tag_license"]}","{model_data["tag_region"]}","{model_data["pipeline_tag"]}",{model_data["Likes"]},"{model_data["languages"]}","{model_data["library"]}","{model_data["datasets"]}","{model_data["license"]}","{model_data["just_persian"]}","{model_data["deleted"]}","{model_data["date_added"]}");""")
                        cnt.commit()
                        seen_model_ids.add(model_info.id)


    print(f"\nTotal number of unique Persian NLP models found: {len(seen_model_ids)}")

    for modelID in seen_model_ids:
        if modelID not in new_seen_ids:
            # First pass the task list to search with, then the last ID present in the models table of the database
            # persian_model_finder(nlp_task_list,8288)

            c.execute(f'''UPDATE PersianNlp
SET deleted = 'True'
WHERE model_id = '{modelID}';''')

            cnt.commit()


add_download_count()
MultiModelInfo()

# First pass the task list to search with, then the first ID present in the models table of the database
# persian_model_finder(nlp_task_list,6600)
@@ -687,165 +299,28 @@ FROM PersianNlp''')



def singleModelInfo( model_id_ ,month_later = 6 , year_later = 0 ):


    today = date.today()
    date_year = today.strftime("%Y")
    date_month = today.strftime("%m")
    month = int(date_month)
    year = int(date_year)
    model_info = c.execute(f'''SELECT *
FROM PersianNlp
WHERE model_id = "{model_id_}"''')

    for model in model_info:
        m = model
    model_id = m[0]
    last_modyfied = m[20]
    Likes = m[12]
    task = m[11]


    downloadCountHistory = c.execute(f'''SELECT *
FROM downloadCountHistory
WHERE key_id = {model_id}''')


    n=0
    downloads_list = []
    download_count_list = []
    download_date_list = []
    for model in downloadCountHistory :
        if int(model[3].split("-")[1]) >= month-month_later and int(model[3].split("-")[2]) >= year-year_later :

            download_count_list.append(model[2])
            download_date_list.append(model[3])
            downloads_list.append([model[3],model[2]])

    growth_slope , lenM = find_growth_slope(download_count_list)

    plt.plot(download_date_list,download_count_list, marker='o', linestyle='-')
    plt.savefig('Download_rate_chart.png', dpi=300)

    pdf = FPDF()
    pdf.add_page()
    pdf.add_font('B Nazanin', '', '.\\fonts\\B Nazanin.ttf', uni=True)
    pdf.set_font("Arial", size=12)

    # Add the text
    pdf.multi_cell(0, 10, f"Model -<< https://huggingface.co/{model_id_} >>- Information :")

    pdf.ln()
    pdf.multi_cell(0, 5, f"Download rate chart : ")
    pdf.ln()
    # Add the image
    pdf.image("Download_rate_chart.png", x=10, y=pdf.get_y() + 5, w=70)
    pdf.ln(20)  # one blank line after the image
    # pdf.cell(0, 10, " Download history chart ", ln=True, align='C')

    # Add the table
    pdf.ln(50)

    pdf.ln()
    pdf.multi_cell(0, 5, f"Download rate table : ")
    pdf.ln()
    # Table header
    for header in ['Date', 'Download-rate']:
        pdf.cell(40, 10, header, 1, 0, 'C')
    pdf.ln()
    pdf.set_font("Arial", size=10)
    # Data rows
    for row in downloads_list:
        for item in row:
            pdf.cell(40, 10, str(item), 1, 0, 'C')
        pdf.ln()

    pdf.ln()

    tokenizer = AutoTokenizer.from_pretrained(model_id_)
    tokens = tokenizer.tokenize(text)
    print(tokens)
    print(f'len(tokens): {len(tokens)}')
    results = {'model': model_id_, 'len': len(tokens), 'tokens': tokens }
    # results = str(results).encode('latin-1', 'replace').decode('latin-1')
    # results = str(results).replace("▁","")
    pdf.multi_cell(0, 3, f"Likes : {str(Likes)}")
    pdf.ln()
    pdf.multi_cell(0, 3, f"last_modyfied : {str(last_modyfied)}")
    pdf.ln()
    pdf.multi_cell(0, 3, f"Growth slope : {round(growth_slope, 2)} in {lenM} month .")
    pdf.ln()
    pdf.multi_cell(0, 7, f"task : {task}")
    pdf.ln()
    pdf.set_font("Arial", size=16)
    pdf.multi_cell(0, 5, f"Tokenize info : ")
    pdf.ln()
    pdf.set_font("Arial", size=10)
    pdf.multi_cell(0, 3, f"len : {str(results['len'])}")
    pdf.ln()
    pdf.multi_cell(0, 3, "tokenized list : ")
    pdf.ln()
    txt = ''
    for token in results["tokens"]:
        txt += f' [ {token} ] '
    pdf.set_font("B Nazanin", size=10)
    pdf.multi_cell(0, 5, f"{process_text_for_fpdf(txt)}",align='R')
    pdf.output("singleModelInfo.pdf")
    print("The PDF file was created with FPDF.")
    os.remove("Download_rate_chart.png")

# singleModelInfo("amberoad/bert-multilingual-passage-reranking-msmarco")




# --- FastAPI and APScheduler integration section ---
app = FastAPI()
scheduler = BackgroundScheduler()

# A function to fetch the latest ID before running the job
# def get_last_id():
#     try:
#         last_id = c.execute("SELECT MAX(ID) FROM PersianNlp").fetchone()[0]
#         return last_id if last_id is not None else 0
#     except sqlite3.OperationalError:
#         return 0

# Define the scheduled job: run persian_model_finder on the last day of each month
def scheduled_job():
    idx = first_id
    print(f"Starting the scheduled job. Last ID: {idx}")
    persian_model_finder(nlp_task_list, idx)
    print("The scheduled job finished successfully.")

# scheduler.add_job(scheduled_job, 'cron', day='1', hour='0', minute='0')
# scheduler.add_job(scheduled_job, 'interval', minutes=5)  # runs the job every 5 minutes, for testing only; otherwise keep it commented out
scheduler.add_job(scheduled_job, 'cron', day='last', hour='0', minute='0')  # schedules the job for the last day of each month

# startup event: start the scheduler when the server comes up
@app.on_event("startup")
def startup_event():
    print("startup event: the server is starting and the scheduler is being started.")
    scheduler.start()
    print("Scheduler enabled.")

# shutdown event: stop the scheduler when the server shuts down
@app.on_event("shutdown")
def shutdown_event():
    print("shutdown event: the server is shutting down and the scheduler is being stopped.")
    scheduler.shutdown()

@app.get("/")
def read_root():
    return {"message": "FastAPI service is running and the scheduler is active."}

@app.get("/run_monthly_job_now")
def run_job_manually():
    print("Request to run the monthly job manually was received...")
    # Run the job in a separate thread so the server is not blocked
    threading.Thread(target=scheduled_job).start()
    return {"message": "Monthly job has been triggered manually."}
def add_download_count():

    count = 1
    api = HfApi()
    allModel = c.execute(f'''SELECT *
FROM PersianNlp''')
    all_model_id = []
    for model in allModel:
        all_model_id.append([model[0],model[1]])

    for id_ in all_model_id:
        # try:
        print(count)
        count+=1
        id_12_digits = generate_random_id(length=12, chars=string.digits)
        model_details = api.model_info(repo_id=id_[1])
        c.execute(f"""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
VALUES ({id_12_digits},"{int(id_[0])}","{int(model_details.downloads)}","{str(d1)}");""")
        cnt.commit()
        # except:
        #     print("Error!!")

# add_download_count()
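Finally, a minimal usage sketch for the manual trigger endpoint, assuming the module is served locally with uvicorn (module name and port are placeholders) and the requests package is available:

# shell: uvicorn main:app --port 8000
import requests

print(requests.get("http://127.0.0.1:8000/run_monthly_job_now").json())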