Compare commits
7 Commits: a61e9d68ca ... 792e7b6ba1

SHA1
792e7b6ba1
5513c211df
20f42d5346
0c2d07892e
f8f90f10a7
049311691e
1ffdd6b62c
@@ -1,28 +1,32 @@
 from transformers import AutoTokenizer
 import json

-file = open('./data/models_info.json', 'r')
-models = json.load(file)
+file = open('./data/models_ner_info.txt', 'r')
+models = file.readlines()
+file.close()

 # Strips the newline character
 text = 'جمهوری موافقتنامه معاملات قانون بودجه اساسی قضائی بینالمللی تأسیس منطقهای لازمالاجراء دامپروری راهآهن کمیسیونهای جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'
 results = []
 for line in models:
-    model_checkpoint = line['model_name']
+    model_checkpoint = line.split(' -- ')[1].rstrip('\n')
+    # model_checkpoint = line['model_name']
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
         print(model_checkpoint)
         tokens = tokenizer.tokenize(text)
         print(tokens)
-        results.append({ 'model': model_checkpoint, 'tokens': tokens})
-        if len(tokens) == 2 :
-            file.write( '{'+model_checkpoint + " : [ " + ','.join(tokens) + ' ] }\n' )
-        #result = tokenizer(text)
-        #print(result)
-        #print(tokenizer.decode(result['input_ids']))
+        print(f'len(tokens): {len(tokens)}')
+        results.append({ 'model': model_checkpoint, 'len': len(tokens), 'tokens': tokens})
     except:
         error = "An exception occurred in tokenizer : " + model_checkpoint
         #file.write( error + '\n' )
         print(error)
-    #tokenizer.save_pretrained(model_checkpoint+'-tokenizer')
-file.close()
+with open('./data/models_tokenizer_info.json', 'w', encoding='utf-8') as file:
+    jsondata = json.dumps(results, ensure_ascii=False, indent=2)
+    file.write(jsondata)

+print('finished!')
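Note (not part of the diff): the new input file data/models_ner_info.txt stores one model per line in a "downloads -- model_name" format, so the split-based parsing introduced above behaves like this minimal sketch:

    # Parse one line of data/models_ner_info.txt ("downloads -- model_name").
    line = '4060 -- HooshvareLab/bert-base-parsbert-ner-uncased\n'
    model_checkpoint = line.split(' -- ')[1].rstrip('\n')
    print(model_checkpoint)  # HooshvareLab/bert-base-parsbert-ner-uncased

One caveat in the script itself: the bare except also swallows KeyboardInterrupt and SystemExit; except Exception as e: would report the tokenizer failure without hiding those.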
8  data/final_ner_selected.txt  Normal file
@@ -0,0 +1,8 @@
+HooshvareLab/bert-base-parsbert-ner-uncased
+HooshvareLab/bert-fa-base-uncased-ner-peyma
+HooshvareLab/bert-base-parsbert-armanner-uncased
+HooshvareLab/bert-fa-base-uncased-ner-arman
+HooshvareLab/bert-base-parsbert-peymaner-uncased
+nicolauduran45/affilgood-ner-multilingual-v2
+Amirmerfan/bert-base-uncased-persian-ner-50k-base
+AliFartout/Roberta-fa-en-ner
10950  data/models_info.json
File diff suppressed because it is too large
10498  data/models_info0.json  Normal file
File diff suppressed because it is too large
258  data/models_ner_info.json  Normal file
@@ -0,0 +1,258 @@
+[
+  { "model_name": "HooshvareLab/bert-base-parsbert-ner-uncased", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 4060, "likes": 5, "language": "fa" },
+  { "model_name": "jplu/tf-xlm-r-ner-40-lang", "task": "token-classification", "last_modified": "2022-10-6", "downloads": 874, "likes": 27, "language": "fa" },
+  { "model_name": "nicolauduran45/affilgood-ner-multilingual-v2", "task": "token-classification", "last_modified": "2025-4-16", "downloads": 668, "likes": 0, "language": "fa" },
+  { "model_name": "HooshvareLab/bert-fa-zwnj-base-ner", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 407, "likes": 4, "language": "fa" },
+  { "model_name": "igorsterner/xlmr-multilingual-sentence-segmentation", "task": "token-classification", "last_modified": "2024-10-5", "downloads": 256, "likes": 4, "language": "fa" },
+  { "model_name": "hamedkhaledi/persian-flair-ner", "task": "token-classification", "last_modified": "2022-4-3", "downloads": 230, "likes": 1, "language": "fa" },
+  { "model_name": "HooshvareLab/bert-fa-base-uncased-ner-peyma", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 138, "likes": 8, "language": "fa" },
+  { "model_name": "mradermacher/persian-question-generator-GGUF", "task": "Unknown", "last_modified": "2024-12-8", "downloads": 110, "likes": 0, "language": "fa" },
+  { "model_name": "HooshvareLab/bert-base-parsbert-armanner-uncased", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 103, "likes": 3, "language": "fa" },
+  { "model_name": "HooshvareLab/distilbert-fa-zwnj-base-ner", "task": "token-classification", "last_modified": "2021-3-21", "downloads": 101, "likes": 4, "language": "fa" },
+  { "model_name": "HooshvareLab/roberta-fa-zwnj-base-ner", "task": "token-classification", "last_modified": "2021-5-20", "downloads": 92, "likes": 1, "language": "fa" },
+  { "model_name": "HooshvareLab/bert-fa-base-uncased-ner-arman", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 71, "likes": 0, "language": "fa" },
+  { "model_name": "HooshvareLab/albert-fa-zwnj-base-v2-ner", "task": "token-classification", "last_modified": "2021-3-21", "downloads": 58, "likes": 0, "language": "fa" },
+  { "model_name": "artificial-nerds/pmnet", "task": "sentence-similarity", "last_modified": "2024-10-29", "downloads": 33, "likes": 0, "language": "fa" },
+  { "model_name": "HooshvareLab/bert-base-parsbert-peymaner-uncased", "task": "token-classification", "last_modified": "2021-5-18", "downloads": 28, "likes": 0, "language": "fa" },
+  { "model_name": "Erfan/mT5-base_Farsi_Title_Generator", "task": "text-generation", "last_modified": "2022-1-30", "downloads": 24, "likes": 2, "language": "fa" },
+  { "model_name": "myrkur/persian-question-generator", "task": "summarization", "last_modified": "2024-12-10", "downloads": 24, "likes": 2, "language": "fa" },
+  { "model_name": "m3hrdadfi/albert-fa-base-v2-ner-arman", "task": "token-classification", "last_modified": "2020-12-26", "downloads": 21, "likes": 3, "language": "fa" },
+  { "model_name": "hezarai/bert-fa-ner-arman", "task": "token-classification", "last_modified": "2024-11-14", "downloads": 17, "likes": 0, "language": "fa" },
+  { "model_name": "m3hrdadfi/albert-fa-base-v2-ner-peyma", "task": "token-classification", "last_modified": "2020-12-26", "downloads": 16, "likes": 1, "language": "fa" },
+  { "model_name": "Amirmerfan/bert-base-uncased-persian-ner-50k-base", "task": "token-classification", "last_modified": "2025-3-18", "downloads": 16, "likes": 0, "language": "fa" },
+  { "model_name": "tum-nlp/neural-news-generator-bloomz-7b1-fa", "task": "text-generation", "last_modified": "2024-8-15", "downloads": 14, "likes": 0, "language": "fa" },
+  { "model_name": "dadashzadeh/test-trainer-persian", "task": "text-classification", "last_modified": "2023-11-13", "downloads": 11, "likes": 0, "language": "fa" },
+  { "model_name": "tum-nlp/neural-news-generator-llama-7b-fa", "task": "text-generation", "last_modified": "2024-8-15", "downloads": 11, "likes": 0, "language": "fa" },
+  { "model_name": "AliFartout/Roberta-fa-en-ner", "task": "token-classification", "last_modified": "2023-9-1", "downloads": 9, "likes": 0, "language": "fa" },
+  { "model_name": "myrkur/persian-title-generator", "task": "summarization", "last_modified": "2024-6-13", "downloads": 9, "likes": 2, "language": "fa" },
+  { "model_name": "NLPclass/mt5-title-generation", "task": "text-generation", "last_modified": "2024-7-24", "downloads": 9, "likes": 0, "language": "fa" },
+  { "model_name": "SinaRp/Question_generator_persian", "task": "text-generation", "last_modified": "2024-11-8", "downloads": 9, "likes": 1, "language": "fa" },
+  { "model_name": "mansoorhamidzadeh/mt5-title-generation", "task": "text-generation", "last_modified": "2024-7-24", "downloads": 8, "likes": 0, "language": "fa" },
+  { "model_name": "dz5035/bicleaner-ai-full-large-de-xx", "task": "Unknown", "last_modified": "2024-6-3", "downloads": 4, "likes": 0, "language": "fa" },
+  { "model_name": "m0javad/conversational_Persian_paraphrase_generation", "task": "Unknown", "last_modified": "2024-2-17", "downloads": 0, "likes": 0, "language": "fa" },
+  { "model_name": "PranavKeshav/event-planner-gemma-4bit", "task": "text-generation", "last_modified": "2025-5-17", "downloads": 0, "likes": 0, "language": "fa" }
+]
17  data/models_ner_info.txt  Normal file
@@ -0,0 +1,17 @@
+4060 -- HooshvareLab/bert-base-parsbert-ner-uncased
+874 -- jplu/tf-xlm-r-ner-40-lang
+668 -- nicolauduran45/affilgood-ner-multilingual-v2
+407 -- HooshvareLab/bert-fa-zwnj-base-ner
+230 -- hamedkhaledi/persian-flair-ner
+138 -- HooshvareLab/bert-fa-base-uncased-ner-peyma
+103 -- HooshvareLab/bert-base-parsbert-armanner-uncased
+101 -- HooshvareLab/distilbert-fa-zwnj-base-ner
+92 -- HooshvareLab/roberta-fa-zwnj-base-ner
+71 -- HooshvareLab/bert-fa-base-uncased-ner-arman
+58 -- HooshvareLab/albert-fa-zwnj-base-v2-ner
+28 -- HooshvareLab/bert-base-parsbert-peymaner-uncased
+21 -- m3hrdadfi/albert-fa-base-v2-ner-arman
+17 -- hezarai/bert-fa-ner-arman
+16 -- m3hrdadfi/albert-fa-base-v2-ner-peyma
+16 -- Amirmerfan/bert-base-uncased-persian-ner-50k-base
+9 -- AliFartout/Roberta-fa-en-ner
465  data/models_tokenizer_info.json  Normal file
@@ -0,0 +1,465 @@
+[
+  { "model": "HooshvareLab/bert-base-parsbert-ner-uncased", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
+  { "model": "nicolauduran45/affilgood-ner-multilingual-v2", "len": 39, "tokens": ["▁جمهوری", "▁موافقت", "▁نامه", "▁معاملات", "▁قانون", "▁بودجه", "▁اساسی", "▁قضا", "ئی", "▁بین", "▁المللی", "▁تأسیس", "▁منطقه", "▁ای", "▁لازم", "▁الاج", "راء", "▁دام", "پر", "وری", "▁راه", "▁آهن", "▁کمیسیون", "▁های", "▁جدید", "الا", "حد", "اث", "▁مسئول", "▁فر", "آور", "ده", "▁زائد", "▁اس", "ق", "اط", "▁پنج", "سال", "ه"] },
+  { "model": "HooshvareLab/bert-fa-zwnj-base-ner", "len": 37, "tokens": ["جمهوری", "موافقت", "[ZWNJ]", "نامه", "معاملات", "قانون", "بودجه", "اساسی", "[UNK]", "بین", "[ZWNJ]", "المللی", "[UNK]", "منطقه", "[ZWNJ]", "ای", "لازم", "[ZWNJ]", "الاجرا", "##ء", "دامپروری", "راه", "[ZWNJ]", "آ", "##هن", "کمیسیون", "[ZWNJ]", "های", "جدیدا", "##لاح", "##داث", "[UNK]", "[UNK]", "[UNK]", "اسقاط", "پنج", "##ساله"] },
+  { "model": "HooshvareLab/bert-fa-base-uncased-ner-peyma", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
+  { "model": "HooshvareLab/bert-base-parsbert-armanner-uncased", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
+  { "model": "HooshvareLab/distilbert-fa-zwnj-base-ner", "len": 37, "tokens": ["جمهوری", "موافقت", "[ZWNJ]", "نامه", "معاملات", "قانون", "بودجه", "اساسی", "[UNK]", "بین", "[ZWNJ]", "المللی", "[UNK]", "منطقه", "[ZWNJ]", "ای", "لازم", "[ZWNJ]", "الاجرا", "##ء", "دامپروری", "راه", "[ZWNJ]", "آ", "##هن", "کمیسیون", "[ZWNJ]", "های", "جدیدا", "##لاح", "##داث", "[UNK]", "[UNK]", "[UNK]", "اسقاط", "پنج", "##ساله"] },
+  { "model": "HooshvareLab/roberta-fa-zwnj-base-ner", "len": 37, "tokens": ["ĠجÙħÙĩÙĪØ±ÛĮ", "ĠÙħÙĪØ§ÙģÙĤت", "âĢĮ", "ÙĨاÙħÙĩ", "ĠÙħعاÙħÙĦات", "ĠÙĤاÙĨÙĪÙĨ", "ĠبÙĪØ¯Ø¬Ùĩ", "ĠاساسÛĮ", "ĠÙĤضائÛĮ", "ĠبÛĮÙĨ", "âĢĮ", "اÙĦÙħÙĦÙĦÛĮ", "ĠتأسÛĮس", "ĠÙħÙĨØ·ÙĤÙĩ", "âĢĮ", "اÛĮ", "ĠÙĦازÙħ", "âĢĮ", "اÙĦاج", "راء", "ĠداÙħپرÙĪØ±ÛĮ", "ĠراÙĩ", "âĢĮ", "Ø¢ÙĩÙĨ", "ĠÚ©ÙħÛĮسÛĮÙĪÙĨ", "âĢĮ", "ÙĩاÛĮ", "ĠجدÛĮد", "اÙĦ", "اØŃ", "داث", "ĠÙħسئÙĪÙĦ", "ĠÙģØ±Ø¢ÙĪØ±Ø¯Ùĩ", "Ġزائد", "ĠاسÙĤاط", "ĠÙ¾ÙĨج", "ساÙĦÙĩ"] },
+  { "model": "HooshvareLab/bert-fa-base-uncased-ner-arman", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
+  { "model": "HooshvareLab/albert-fa-zwnj-base-v2-ner", "len": 46, "tokens": ["▁جمهوری", "▁موافقت", "[ZWNJ]", "نامه", "▁معاملات", "▁قانون", "▁بودجه", "▁اساسی", "▁قضا", "ي", "ی", "▁بین", "[ZWNJ]", "الم", "لل", "ی", "▁تاسیس", "▁منطقه", "[ZWNJ]", "ای", "▁لازم", "[ZWNJ]", "ال", "اجرا", "ء", "▁دامپروری", "▁راه", "[ZWNJ]", "اهن", "▁کمیسیون", "[ZWNJ]", "های", "▁جدید", "الا", "حد", "اث", "▁مس", "ي", "ول", "▁فراورده", "▁زا", "ي", "د", "▁اسقاط", "▁پنج", "ساله"] },
+  { "model": "HooshvareLab/bert-base-parsbert-peymaner-uncased", "len": 20, "tokens": ["جمهوری", "موافقتنامه", "معاملات", "قانون", "بودجه", "اساسی", "قضايی", "بینالمللی", "تاسیس", "منطقهای", "لازمالاجراء", "دامپروری", "راهاهن", "کمیسیونهای", "جدیدالاحداث", "مسيول", "فراورده", "زايد", "اسقاط", "پنجساله"] },
+  { "model": "Amirmerfan/bert-base-uncased-persian-ner-50k-base", "len": 56, "tokens": ["جمهوری", "م", "##وا", "##فق", "##تن", "##ام", "##ه", "مع", "##امل", "##ات", "قانون", "بود", "##جه", "اساسی", "ق", "##ضا", "##يی", "بینالمللی", "تاسیس", "منطقه", "##ای", "لازم", "##ال", "##اج", "##راء", "دا", "##م", "##پر", "##وری", "راه", "##اه", "##ن", "کمی", "##سی", "##ون", "##های", "جدید", "##ال", "##اح", "##دا", "##ث", "م", "##سي", "##ول", "ف", "##را", "##ورد", "##ه", "ز", "##ايد", "اس", "##قا", "##ط", "پنج", "##سال", "##ه"] },
+  { "model": "AliFartout/Roberta-fa-en-ner", "len": 39, "tokens": ["▁جمهوری", "▁موافقت", "▁نامه", "▁معاملات", "▁قانون", "▁بودجه", "▁اساسی", "▁قضا", "ئی", "▁بین", "▁المللی", "▁تأسیس", "▁منطقه", "▁ای", "▁لازم", "▁الاج", "راء", "▁دام", "پر", "وری", "▁راه", "▁آهن", "▁کمیسیون", "▁های", "▁جدید", "الا", "حد", "اث", "▁مسئول", "▁فر", "آور", "ده", "▁زائد", "▁اس", "ق", "اط", "▁پنج", "سال", "ه"] }
+]
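Note (not part of the diff): the roberta-fa-zwnj-base-ner tokens above ("ĠجÙħÙĩÙĪØ±ÛĮ", "âĢĮ", ...) are not extraction damage. Byte-level BPE tokenizers (the GPT-2/RoBERTa family) store tokens as remapped bytes, with "Ġ" standing for a leading space and "âĢĮ" for the ZWNJ character, so Persian text surfaces in this Latin-looking form. A minimal sketch of recovering readable text from such tokens:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained('HooshvareLab/roberta-fa-zwnj-base-ner')
    tokens = tok.tokenize('جمهوری موافقتنامه')
    print(tokens)                                # byte-level surface forms
    print(tok.convert_tokens_to_string(tokens))  # readable Persian again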
BIN  db/persian_nlp_model.db  Normal file (binary file not shown)
BIN  fonts/B Nazanin Bold.ttf  Normal file (binary file not shown)
BIN  fonts/B Nazanin.ttf  Normal file (binary file not shown)
BIN  fonts/B-NAZANIN.TTF  Normal file (binary file not shown)
BIN  fonts/NotoSansArabic.ttf  Normal file (binary file not shown)
BIN  fonts/consola.ttf  Normal file (binary file not shown)
BIN  fonts/consolab.ttf  Normal file (binary file not shown)
BIN  fonts/consolai.ttf  Normal file (binary file not shown)
BIN  fonts/consolaz.ttf  Normal file (binary file not shown)
@@ -1,7 +1,11 @@
-from huggingface_hub import HfApi, ModelFilter
+from huggingface_hub import HfApi
+from huggingface_hub import ModelCard
+from huggingface_hub import ModelFilter
 import json


 api = HfApi()
+# card = ModelCard.load('model-name')

 # persian languages tags
 languages = ['pes', 'fas', 'fa'] # ISO language codes for Persian (fa, fas, pes)
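Note (not part of the diff): the import split above looks like compatibility churn; ModelFilter was deprecated and, to my knowledge, later removed from huggingface_hub, while list_models accepts the filters directly, which is the style the rest of this compare uses. A sketch under that assumption:

    from huggingface_hub import HfApi

    api = HfApi()
    # Top Persian token-classification models by downloads, no ModelFilter needed.
    for m in api.list_models(language="fa", task="token-classification",
                             sort="downloads", direction=-1, limit=5):
        print(m.id, m.downloads)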
@@ -1,173 +0,0 @@
-from huggingface_hub import HfApi
-import json
-import datetime
-
-
-# Define the common NLP tasks
-nlp_task_list = [
-    "text-classification",
-    "token-classification",
-    "question-answering",
-    "summarization",
-    "translation",
-    "text-generation",
-    "fill-mask",
-    "zero-shot-classification",
-    "feature-extraction",
-    "sentence-similarity",
-    "text2text-generation",
-    "conversational"
-]
-
-def persian_model_finder(nlp_tasks, json_file_name):
-
-    api = HfApi()
-    all_persian_nlp_models_data = []
-    seen_model_ids = set()  # to keep duplicate models from being added
-
-    print("Searching for and extracting information on Persian NLP models...")
-
-    # Filter and iterate over the models.
-    # For each NLP task we search for the Persian models.
-    # A limit of 500 models per task was considered, to avoid excessive downloads.
-    # To extract every model, pagination may be required.
-    for task in nlp_tasks:
-        print(f"  Searching for task: {task} (language: Persian)...")
-        try:
-            models_for_task = api.list_models(
-                language="fa",
-                task=task,
-                sort="downloads",
-                direction=-1,  # descending (most downloads first)
-                limit=None  # this number can be changed
-            )
-
-            for model_info in models_for_task:
-                if model_info.id not in seen_model_ids:
-                    try:  # if extra information could be read from the model card:
-                        model_ = api.model_info(model_info.id)  # fetch the model's record
-                        card_data_dict = model_.card_data.to_dict()  # build a dict from the model card, which holds the model's metadata
-                        model_data = {
-                            "model_id": model_info.id,
-                            "url": f"https://huggingface.co/{model_info.id}",
-                            "downloads": model_info.downloads,
-                            "private": model_info.private,
-                            "author": model_info.author,
-                            "tags": model_info.tags,  # includes languages, tasks, libraries, etc.
-                            "tag_dataset": "-",
-                            "tag_base_model": "-",
-                            "tag_license": "-",
-                            "tag_region": "-",
-                            "pipeline_tag": model_info.pipeline_tag,  # the model's main task, as assigned by the Hub
-                            "Likes": model_info.likes,
-                            # the four fields below are read from the model-card dict
-                            "languages": card_data_dict.get('language', 'N/A'),  # supported languages
-                            "library": card_data_dict.get('library', 'N/A'),  # libraries used
-                            "datasets": card_data_dict.get('datasets', 'N/A'),  # datasets used
-                            "license": card_data_dict.get('license', 'N/A'),
-                            "just_persian": False
-                        }
-
-                        if model_data["library"] == 'N/A':  # in some cases the library is stored under the name 'library_name' in the model-card dict
-                            model_data["library"] = card_data_dict.get('library_name', 'N/A')
-                        # the condition below marks the entries that are Persian-only
-                        if len(model_data["languages"]) == 2 and "multilingual" in model_data["languages"] or \
-                           len(model_data["languages"]) == 2 and "persian" in model_data["languages"] or \
-                           len(model_data["languages"]) == 2 and "farsi" in model_data["languages"] or \
-                           len(model_data["languages"]) == 2 and "fas" in model_data["languages"] or \
-                           len(model_data["languages"]) == 2 and model_data["languages"] == "fa" or \
-                           model_data["languages"] == "persian" or \
-                           model_data["languages"] == "farsi" or \
-                           model_data["languages"] == "fas" or \
-                           model_data["languages"] == "pes" or \
-                           len(model_data["languages"]) == 1:
-                            model_data["just_persian"] = True
-
-                        for value in model_data["tags"]:
-
-                            if "dataset:" in value:
-                                if type(model_data["tag_dataset"]) == type(""):
-                                    model_data["tag_dataset"] = list(model_data["tag_dataset"])
-                                    model_data["tag_dataset"].pop(0)
-                                model_data["tag_dataset"].append(f"{str(value).replace('dataset:', '')}")
-
-                            if "base_model:" in value:
-                                if type(model_data["tag_base_model"]) == type(""):
-                                    model_data["tag_base_model"] = list(model_data["tag_base_model"])
-                                    model_data["tag_base_model"].pop(0)
-                                model_data["tag_base_model"].append(f"{str(value).replace('base_model:', '')}")
-
-                            if "region:" in value:
-                                model_data["tag_region"] = f"{str(value).replace('region:', '')}"
-
-                            if "license:" in value:
-                                model_data["tag_license"] = f"{str(value).replace('license:', '')}"
-
-                        all_persian_nlp_models_data.append(model_data)
-                        seen_model_ids.add(model_info.id)
-
-                    except:  # if reading the model card ran into a problem:
-                        model_data = {
-                            "model_id": model_info.id,
-                            "url": f"https://huggingface.co/{model_info.id}",
-                            "downloads": model_info.downloads,
-                            "private": model_info.private,
-                            "author": model_info.author,
-                            "tags": model_info.tags,  # includes languages, tasks, libraries, etc.
-                            "pipeline_tag": model_info.pipeline_tag,  # the model's main task, as assigned by the Hub
-                            "Likes": model_info.likes,
-                            "library": model_info.library_name,
-                            # add the license if it is available
-                            "license": model_info.card_data.license if model_info.card_data and model_info.card_data.license else "N/A"
-                        }
-
-                        for value in model_data["tags"]:
-
-                            if "dataset:" in value:
-                                if type(model_data["tag_dataset"]) == type(""):
-                                    model_data["tag_dataset"] = list(model_data["tag_dataset"])
-                                    model_data["tag_dataset"].pop(0)
-                                model_data["tag_dataset"].append(f"{str(value).replace('dataset:', '')}")
-
-                            if "base_model:" in value:
-                                if type(model_data["tag_base_model"]) == type(""):
-                                    model_data["tag_base_model"] = list(model_data["tag_base_model"])
-                                    model_data["tag_base_model"].pop(0)
-                                model_data["tag_base_model"].append(f"{str(value).replace('base_model:', '')}")
-
-                            if "region:" in value:
-                                model_data["tag_region"] = f"{str(value).replace('region:', '')}"
-
-                            if "license:" in value:
-                                model_data["tag_license"] = f"{str(value).replace('license:', '')}"
-
-                        all_persian_nlp_models_data.append(model_data)
-                        seen_model_ids.add(model_info.id)
-
-            print(f"  Number of models found for task '{task}': {len(models_for_task)}")
-
-        except Exception as e:
-            print(f"  Error while searching for task {task}: {e}")
-
-    # Final sort of the models by download count (over the whole list).
-    # This step makes sure that even if the models were gathered from different tasks,
-    # they end up sorted by downloads.
-    all_persian_nlp_models_data_sorted = sorted(all_persian_nlp_models_data, key=lambda x: x['downloads'], reverse=True)
-
-    print(f"\nTotal number of unique Persian NLP models found: {len(all_persian_nlp_models_data_sorted)}")
-
-    # Save the information to a JSON file
-    output_json_file = f"{json_file_name}"
-    with open(output_json_file, "w", encoding="utf-8") as f:
-        json.dump(all_persian_nlp_models_data_sorted, f, ensure_ascii=False, indent=4)
-
-    print(f"Information on {len(all_persian_nlp_models_data_sorted)} models was saved to '{output_json_file}'.")
-
-
-persian_model_finder(nlp_task_list, "persian_nlp_models_info.json")
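Note (not part of the diff): in the deleted just_persian test, `and` binds tighter than `or`, so the clauses group as written, but len(model_data["languages"]) == 2 is ambiguous: it is true both for a two-item list like ["fa", "en"] and for the two-character string "fa", because the model card's language field may be either a string or a list. A hedged re-sketch of the assumed intent, normalizing to a list first:

    PERSIAN_CODES = {"fa", "fas", "pes", "persian", "farsi"}

    def is_just_persian(languages):
        # The model card's language field can be a plain string or a list.
        if isinstance(languages, str):
            languages = [languages]
        non_persian = [l for l in languages
                       if l not in PERSIAN_CODES and l != "multilingual"]
        return len(non_persian) == 0

    print(is_just_persian("fa"))          # True
    print(is_just_persian(["fa", "en"]))  # False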
3  important_tokenizers.txt  Normal file
@@ -0,0 +1,3 @@
+facebook/xlm-v-base
+sharif-dal/dal-bert
+lifeweb-ai/shiraz
30  model_filter.py  Normal file
@@ -0,0 +1,30 @@
+import json
+
+with open('./data/models_info.json', 'r', encoding='utf-8') as file:
+    models = json.load(file)
+
+ner_models = []
+for mdl in models:
+    if mdl['model_name'].__contains__('ner') or mdl['model_name'].__contains__('Ner') or mdl['model_name'].__contains__('Ner'):
+        ner_models.append(mdl)
+        # [mdl['model_name']] = {
+        #     "task": mdl['task'],
+        #     "last_modified": mdl['last_modified'],
+        #     "downloads": int(mdl['downloads']),
+        #     "likes": mdl['likes'],
+        #     "language": mdl['language']
+        # }
+sorted_ner_models = sorted(ner_models, key=lambda x: x["downloads"], reverse=True)
+
+with open('./data/models_ner_info.json', 'w', encoding='utf-8') as file:
+    jsondata = json.dumps(sorted_ner_models, ensure_ascii=False, indent=2)
+    file.write(jsondata)
+
+ner_models_text = ''
+for mdl in sorted_ner_models:
+    ner_models_text += ''.join(f"{mdl['downloads']} -- {mdl['model_name']}\n")
+
+with open('./data/models_ner_info.txt', 'w', encoding='utf-8') as file:
+    file.write(ner_models_text)
+
+print(len(ner_models))
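Note (not part of the diff): the filter above tests __contains__('Ner') twice (the third clause repeats the second), so all-caps names containing 'NER' slip through, and calling the dunder method directly is unidiomatic. A case-insensitive sketch of the same filter:

    # 'ner', 'Ner' and 'NER' are all caught by one lowercase substring test.
    ner_models = [mdl for mdl in models if 'ner' in mdl['model_name'].lower()]

The substring test still over-matches names that merely contain "ner", which is presumably why the sentence-similarity model artificial-nerds/pmnet shows up in data/models_ner_info.json above.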
@@ -1,58 +1,83 @@
 # In the name of God

+from apscheduler.schedulers.background import BackgroundScheduler
+from transformers import AutoTokenizer
+from bidi.algorithm import get_display
 from huggingface_hub import HfApi
+import matplotlib.pyplot as plt
+from fastapi import FastAPI
 from datetime import date
+import arabic_reshaper
 from fpdf import FPDF
+import threading
 import random
+import logging
 import sqlite3
 import string
+import os
+
+
+# Set up logging so the scheduler's output can be seen
+logging.basicConfig(level=logging.INFO)
+logging.getLogger('apscheduler').setLevel(logging.INFO)
+
+first_id = 6600
+text = 'جمهوری موافقتنامه معاملات قانون بودجه اساسی قضائی بینالمللی تأسیس منطقهای لازمالاجراء دامپروری راهآهن کمیسیونهای جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'
 list1 = ["ID","model_id","url","downloads","private","author","tags","tag_dataset",\
 "tag_base_model","tag_license","tag_region","pipeline_tag","Likes","languages",\
-"library","datasets","license","just_persian","deleted","date_added"]
+"library","datasets","license","just_persian","deleted","date_added","last_modified"]
-cnt = sqlite3.connect("persian_nlp_model.db")
+cnt = sqlite3.connect(".\\db\\persian_nlp_model.db", check_same_thread=False)
 c = cnt.cursor()
 today = date.today()
 d1 = today.strftime("%d-%m-%Y")
+
+
+create_tables = True
+if create_tables == True :
+    try:
         # Use these lines only the first time, when the table is to be created
-# c.execute("""CREATE TABLE PersianNlp(
-# ID INT PRIMARY KEY ,
-# model_id TEXT ,
-# url TEXT ,
-# downloads INT,
-# private TEXT,
-# author TEXT,
-# tags TEXT,
-# tag_dataset TEXT,
-# tag_base_model TEXT,
-# tag_license TEXT,
-# tag_region TEXT,
-# pipeline_tag TEXT,
-# Likes INT,
-# languages TEXT,
-# library TEXT,
-# datasets TEXT,
-# license TEXT,
-# just_persian TEXT,
-# deleted TEXT,
-# date_added TEXT
-# );""")
+        c.execute("""CREATE TABLE PersianNlp(
+        ID INT PRIMARY KEY ,
+        model_id TEXT ,
+        url TEXT ,
+        downloads INT,
+        private TEXT,
+        author TEXT,
+        tags TEXT,
+        tag_dataset TEXT,
+        tag_base_model TEXT,
+        tag_license TEXT,
+        tag_region TEXT,
+        pipeline_tag TEXT,
+        Likes INT,
+        languages TEXT,
+        library TEXT,
+        datasets TEXT,
+        license TEXT,
+        just_persian TEXT,
+        deleted TEXT,
+        date_added TEXT,
+        last_modified TEXT
+        );""")

         # Use this code to create the download-count table
-# c.execute("""CREATE TABLE downloadCountHistory(
-# ID INT PRIMARY KEY ,
-# key_id INT ,
-# downloads INT,
-# date TEXT
-# );""")
+        c.execute("""CREATE TABLE downloadCountHistory(
+        ID INT PRIMARY KEY ,
+        key_id INT ,
+        downloads INT,
+        date TEXT
+        );""")
+        create_tables = False
+
+    except Exception as e:
+
+        print("--- An unexpected error occurred while creating the tables ---")
+        print(f"Error text: {e}")
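Note (not part of the diff): on a second run the uncommented CREATE TABLE statements raise "table ... already exists" and land in the except branch. SQLite's built-in guard makes the create_tables flag and the try/except unnecessary; a sketch:

    # CREATE TABLE IF NOT EXISTS is a no-op when the table already exists,
    # so the script can be re-run without special-casing the first launch.
    c.execute("""CREATE TABLE IF NOT EXISTS downloadCountHistory(
        ID INT PRIMARY KEY,
        key_id INT,
        downloads INT,
        date TEXT
    );""")

Also worth noting: check_same_thread=False only disables sqlite3's thread check; the connection shared by the scheduler and FastAPI threads is not otherwise synchronized, so guarding writes with a threading.Lock is the usual companion to that flag.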
@@ -89,14 +114,358 @@ def generate_random_id(length=10, chars=string.ascii_letters + string.digits):

+# 3. Define a helper function for preparing Persian text
+def process_text_for_fpdf(text):
+    # Step 1: reshape the letters (correct joined forms)
+    reshaped_text = arabic_reshaper.reshape(text)
+    # Step 2: reorder for right-to-left display
+    bidi_text = get_display(reshaped_text)
+    return bidi_text
+
+
+def add_download_count():
+
+    count = 1
+    api = HfApi()
+    allModel = c.execute(f'''SELECT * FROM "PersianNlp"
+        WHERE deleted != 'True' ;''')
+    all_model_id = []
+    for model in allModel:
+        all_model_id.append([model[0],model[1]])
+
+    for id_ in all_model_id:
+        # try:
+        print(count)
+        count+=1
+        id_12_digits = generate_random_id(length=12, chars=string.digits)
+        model_details = api.model_info(repo_id=id_[1])
+        c.execute(f"""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
+        VALUES ({id_12_digits},"{int(id_[0])}","{int(model_details.downloads)}","{str(d1)}");""")
+        # c.execute(f"""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
+        # VALUES ({id_12_digits},"{int(id_[0])}","{int(model_details.downloads)+1}","22-08-2025");""")
+        cnt.commit()
+        # except:
+        #     print("Error!!")
+
+# add_download_count()
+
+
+def MultiModelInfo(limit_number=10):
+
+    today = date.today()
+    date_year = today.strftime("%Y")
+    date_month = today.strftime("%m")
+    month = int(date_month)
+    year = int(date_year)
+    Models_added_this_month=[]
+    Models_deleted=[]
+    all_id_download = []
+    all_download = []
+    growth_slope_list_info = []
+    growth_slope_list = []
+    model_info = c.execute(f'''SELECT *
+        FROM PersianNlp''')
+
+    for model in model_info:
+        if int(model[19].split("/")[1]) == month and int(model[19].split("/")[2]) == year :
+            Models_added_this_month.append(model[1])
+        if str(model[18]) == "True":
+            Models_deleted.append(model[1])
+        all_id_download.append([model[0],model[1],model[3],model[11],model[20]])
+
+    listX=[]
+    for model in all_id_download:
+        downloadCountHistory = c.execute(f'''SELECT *
+            FROM downloadCountHistory
+            WHERE key_id = {model[0]}''')
+        for models in downloadCountHistory :
+            if int(models[3].split("-")[1]) == month and int(models[3].split("-")[2]) == year :
+                model[2]=models[2]
+                listX.append(model)
+    all_id_download = listX
+
+    for model in all_id_download:
+        all_download.append(model[2])
+
+    all_download.sort(reverse=True)
+    maximum_download_list = all_download[0:limit_number]
+    maximum_download_info_list = []
+    n=0
+    for DCount in maximum_download_list:
+        for model in all_id_download:
+            if DCount == model[2]:
+                if n < limit_number :
+                    maximum_download_info_list.append(model)
+                    n+=1
+
+    # Find the steepest download growth over the last several months:
+    for model in all_id_download:
+        growth_slope = []
+        DHList = c.execute(f'''SELECT *
+            FROM "downloadCountHistory"
+            WHERE key_id = {model[0]}''')
+        for data in DHList:
+            growth_slope.append(data[2])
+        growth_slopee , lenM = find_growth_slope(growth_slope)  # compute each model's growth percentage
+        growth_slope_list.append(growth_slopee)
+        growth_slope_list_info.append([model[1],growth_slopee,lenM,model[2],model[3]])
+
+    growth_slope_list.sort(reverse=True)
+    maximum_growth_slope_list = growth_slope_list[0:limit_number]
+    maximum_growth_slope_info_list = []
+    n=0
+    for DCount in maximum_growth_slope_list:
+        for model in growth_slope_list_info:
+            if DCount == model[1]:
+                if n < limit_number :
+                    maximum_growth_slope_info_list.append(model)
+                    n+=1
+    # End of the growth-slope search
+
+    model_id_list = []
+    model_download_count_list = []
+    for info in maximum_download_info_list:
+        model_download_count_list.append(info[2])
+        model_id_list.append(str(info[1]))
+
+    listA = []
+    for x in model_download_count_list:
+        listA.append(int(x)/1000000)
+    model_download_count_list = listA
+
+    plt.plot(model_id_list,model_download_count_list,marker='o', linestyle='-')
+    plt.xticks(rotation=30, ha='right', fontsize=10)
+    plt.xlabel("Model Name", color='blue')
+    plt.ylabel("Download Count (million)" ,color='red')
+    plt.tight_layout()
+    plt.savefig(f'Top_{limit_number}_download_rate.png', dpi=300)
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.add_font('B Nazanin', '', '.\\fonts\\B Nazanin.ttf', uni=True)
+    pdf.set_font("Arial", size=16)
+    # Add the text
+    pdf.multi_cell(0, 10, f"Top {limit_number} Model Information : \n ----------------------------------------------------------------")
+    pdf.set_font("Arial", size=12)
+
+    pdf.ln()
+    pdf.multi_cell(0, 5, f" Download Rate Chart :")
+    pdf.ln()
+
+    # Add the image
+    pdf.image(f'Top_{limit_number}_download_rate.png', x=10, y=pdf.get_y() + 5, w=70)
+    pdf.ln(20)  # one blank line after the image
+    # Add the table
+    pdf.ln(50)
+    pdf.ln()
+    pdf.multi_cell(0, 5, f" Download Rate Table :")
+    pdf.ln()
+    # Table header
+    for header in ['Count', 'Model_name','Download-rate','Task','Last_modified']:
+        if header == 'Model_name':
+            pdf.cell(80, 10, header, 1, 0, 'C')
+        else:
+            pdf.cell(40, 10, header, 1, 0, 'C')
+    pdf.ln()
+
+    # Data rows
+    x=1
+    for row in maximum_download_info_list:
+        n=0
+        row[0] = x
+        x+=1
+        for item in row:
+            if n == 1 :
+                pdf.set_font("Arial", size=6)
+                pdf.cell(80, 10, f"https://huggingface.co/{str(item)}", 1, 0, 'C')
+                n+=1
+            else:
+                pdf.set_font("Arial", size=10)
+                pdf.cell(40, 10, str(item), 1, 0, 'C')
+                n+=1
+        pdf.ln()
+
+    pdf.add_page()
+    model_name_list = []
+    model_growth = []
+    for info in maximum_growth_slope_info_list:
+        model_name_list.append(info[0])
+        model_growth.append(round(info[1], 2) )
+
+    pdf.ln()
+    pdf.multi_cell(0, 5, f" Download Growth Chart :")
+    pdf.ln()
+    plt.figure(figsize=(8, 6))  # set the overall figure size (width and height in inches)
+    plt.bar(model_name_list,model_growth,color='lightgreen', width=0.4)
+    plt.xticks(rotation=30, ha='right', fontsize=10)
+    plt.xlabel("Model Name", color='blue')
+    plt.ylabel("Model Growth (%)" ,color='red')
+    plt.tight_layout()
+    plt.savefig(f'Top_{limit_number}_growth_rate.png', dpi=300)
+    pdf.image(f'Top_{limit_number}_growth_rate.png', x=10, y=pdf.get_y() + 5, w=70)
+    pdf.ln(80)  # one blank line after the image
+
+    pdf.ln()
+    pdf.multi_cell(0, 5, f" Download Growth Table :")
+    pdf.ln()
+    # Table header
+    for header in [ 'Model_name','Growth-rate','Length-month','Download-rate','Task']:
+        if header == 'Model_name':
+            pdf.cell(80, 10, header, 1, 0, 'C')
+        else:
+            pdf.cell(30, 10, header, 1, 0, 'C')
+    pdf.ln()
+
+    # Data rows
+    x=1
+    for row in maximum_growth_slope_info_list:
+        n=0
+        x+=1
+        for item in row:
+            if n == 0 :
+                pdf.set_font("Arial", size=6)
+                pdf.cell(80, 10, f"https://huggingface.co/{str(item)}", 1, 0, 'C')
+                n+=1
+            else:
+                pdf.set_font("Arial", size=6)
+                pdf.cell(30, 10, str(item), 1, 0, 'C')
+                n+=1
+        pdf.ln()
+
+    pdf.add_page()
+    pdf.ln()
+    pdf.set_font("Arial", size=14)
+    pdf.multi_cell(0, 5, f"Models added this month :")
+    pdf.set_font("Arial", size=6)
+    pdf.ln()
+
+    txt=''
+    n=1
+    for model_name in Models_added_this_month:
+        txt +=f"{n} --> {model_name}\n"
+        n+=1
+    pdf.multi_cell(0, 5, f"{txt}")
+
+    pdf.ln()
+    pdf.set_font("Arial", size=14)
+    pdf.multi_cell(0, 5, f"Models deleted :")
+    pdf.set_font("Arial", size=6)
+    pdf.ln()
+    txt=''
+    n=1
+    for model_name in Models_deleted:
+        txt +=f"{n} --> {model_name}\n"
+        n+=1
+    pdf.multi_cell(0, 5, f"{txt}")
+
+    pdf.output(f"MultiModelInfo_{today}.pdf")
+    os.remove(f'Top_{limit_number}_download_rate.png')
+    os.remove(f'Top_{limit_number}_growth_rate.png')
+
+# MultiModelInfo(5)
+
+
+def find_growth_slope(number_list):
+
+    try:
+        n = -1
+        index = 0
+        # if n == -1:
+        last_num = number_list[-1]
+        first_num = number_list[-1]
+
+        if number_list[-1] <= number_list[-2]:
+            for num in range(len(number_list)-1):
+                if number_list[n-num] <= number_list[n-num-1]:
+                    last_num = number_list[n-num-1]
+                    index = n-num-1
+                else:
+                    break
+
+        if number_list[-1] >= number_list[-2] and last_num >= first_num:
+            for num in range(len(number_list)-1):
+                if number_list[n-num] >= number_list[n-num-1]:
+                    last_num = number_list[n-num-1]
+                    index = n-num-1
+                else:
+                    break
+        percentage_growth = ((first_num - last_num) / last_num) * 100
+
+    except:
+        percentage_growth = 0
+
+    return percentage_growth ,abs(index)
+
+# find_growth_slope([15,16,17,16,10,8])
+# x = find_growth_slope([1,2,3,4,5,6,7,8,10])
+# c = find_growth_slope([8,8,9,10,8,8,7])
+# v = find_growth_slope([8,8,7,5,4,6,7,7])
+# print("finish!")
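Note (not part of the diff): find_growth_slope walks backward over the trailing monotonic run of monthly download counts, but it initializes both first_num and last_num to number_list[-1] and divides by last_num, so a run that ends at zero downloads raises and falls into the bare except. A hedged re-sketch of the assumed intent:

    def growth_over_trailing_run(counts):
        """Percent change over the trailing monotonic run, plus its length in months."""
        if len(counts) < 2:
            return 0.0, 0
        rising = counts[-1] >= counts[-2]
        i = len(counts) - 1
        # Extend the run backward while the direction stays the same.
        while i > 0 and (counts[i] >= counts[i - 1] if rising else counts[i] <= counts[i - 1]):
            i -= 1
        start, end = counts[i], counts[-1]
        if start == 0:
            return 0.0, len(counts) - 1 - i
        return (end - start) / start * 100.0, len(counts) - 1 - i

    print(growth_over_trailing_run([8, 8, 9, 10, 8, 8, 7]))        # (-30.0, 3): falling tail
    print(growth_over_trailing_run([1, 2, 3, 4, 5, 6, 7, 8, 10]))  # (900.0, 8): rising tail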
 def persian_model_finder(nlp_tasks,idx):
     today = date.today()
     download_date = today.strftime("%d/%m/%Y")
     idX = idx  # pass in the last ID present in the database, so that new rows do not collide with the earlier IDs
     api = HfApi()
-    all_persian_nlp_models_data = []
+    # all_persian_nlp_models_data = []
     seen_model_ids = set()  # to keep duplicate models from being added
+    new_seen_ids = set()
+
+    for task in nlp_tasks:
+
+        models_for_task = api.list_models(
+            language="fa",
+            task=task,
+            sort="downloads",
+            direction=-1,  # descending (most downloads first)
+            limit=None  # this number can be changed
+        )
+
+        for model_info in models_for_task:
+            new_seen_ids.add(model_info.id)
+
     print("Searching for and extracting information on Persian NLP models...")

     # Filter and iterate over the models
@@ -107,8 +476,12 @@ def persian_model_finder(nlp_tasks,idx):
     try:
         allModel = c.execute(f'''SELECT *
 FROM PersianNlp''')

+        idX+=1
+
         for model in allModel:
             seen_model_ids.add(model[1])
+            idX+=1
     except:
         print("database not find!")

@@ -129,6 +502,7 @@ FROM PersianNlp''')
                     idX+=1
                     # try :  # if extra information could be read from the model card:
                     model_ = api.model_info(model_info.id)  # fetch the model's record
+                    lastModified = api.model_info(repo_id=model_info.id).last_modified
                     card_data_dict = model_.card_data.to_dict()  # build a dict from the model card, which holds the model's metadata
                     model_data = {
                         "model_id": model_info.id,

@@ -150,7 +524,8 @@ FROM PersianNlp''')
                         "license":card_data_dict.get('license', 'N/A'),
                         "just_persian" : "False",
                         "deleted" : "False",
-                        "date_added" : f"{download_date}"
+                        "date_added" : f"{download_date}",
+                        "last_modified" : f"{str(lastModified.strftime('%d-%m-%Y'))}"
                     }

@@ -191,19 +566,32 @@ FROM PersianNlp''')
                     # all_persian_nlp_models_data.append(model_data)
-                    c.execute(f"""INSERT INTO PersianNlp (ID,model_id,url,downloads,private,author,tags,tag_dataset,tag_base_model,tag_license,tag_region,pipeline_tag,Likes,languages,library,datasets,license,just_persian,deleted,date_added)
-                    VALUES ({idX},"{model_data["model_id"]}","{model_data["url"]}",{model_data["downloads"]},"{model_data["private"]}","{model_data["author"]}","{model_data["tags"]}","{model_data["tag_dataset"]}","{model_data["tag_base_model"]}","{model_data["tag_license"]}","{model_data["tag_region"]}","{model_data["pipeline_tag"]}",{model_data["Likes"]},"{model_data["languages"]}","{model_data["library"]}","{model_data["datasets"]}","{model_data["license"]}","{model_data["just_persian"]}","{model_data["deleted"]}","{model_data["date_added"]}");""")
+                    c.execute(f"""INSERT INTO PersianNlp (ID,model_id,url,downloads,private,author,tags,tag_dataset,tag_base_model,tag_license,tag_region,pipeline_tag,Likes,languages,library,datasets,license,just_persian,deleted,date_added,last_modified)
+                    VALUES ({idX},"{model_data["model_id"]}","{model_data["url"]}",{model_data["downloads"]},"{model_data["private"]}","{model_data["author"]}","{model_data["tags"]}","{model_data["tag_dataset"]}","{model_data["tag_base_model"]}","{model_data["tag_license"]}","{model_data["tag_region"]}","{model_data["pipeline_tag"]}",{model_data["Likes"]},"{model_data["languages"]}","{model_data["library"]}","{model_data["datasets"]}","{model_data["license"]}","{model_data["just_persian"]}","{model_data["deleted"]}","{model_data["date_added"]}","{model_data['last_modified']}");""")
                     cnt.commit()
                     seen_model_ids.add(model_info.id)

     print(f"\nTotal number of unique Persian NLP models found: {len(seen_model_ids)}")

-# First we pass the list of tasks to search with, then the last ID present in the models table of the database
-# persian_model_finder(nlp_task_list,8288)
+    for modelID in seen_model_ids:
+        if modelID not in new_seen_ids:
+            c.execute(f'''UPDATE PersianNlp
+                SET deleted = 'True'
+                WHERE model_id = '{modelID}';''')
+            cnt.commit()
+
+    add_download_count()
+    MultiModelInfo()
+
+# First we pass the list of tasks to search with, then the first ID present in the models table of the database
+# persian_model_finder(nlp_task_list,6600)

 def search(name,search_by):

     X = "------------------------------------------------------------------------------------\n+-+-+-+- FOUND MODEL +-+-+-+-\n------------------------------------------------------------------------------------\n\n"

@@ -299,28 +687,165 @@
-def add_download_count():
-
-    count = 1
-    api = HfApi()
-    allModel = c.execute(f'''SELECT *
-FROM PersianNlp''')
-    all_model_id = []
-    for model in allModel:
-        all_model_id.append([model[0],model[1]])
-
-    for id_ in all_model_id:
-        # try:
-        print(count)
-        count+=1
-        id_12_digits = generate_random_id(length=12, chars=string.digits)
-        model_details = api.model_info(repo_id=id_[1])
-        c.execute(f"""INSERT INTO downloadCountHistory(ID,key_id,downloads,date)
-        VALUES ({id_12_digits},"{int(id_[0])}","{int(model_details.downloads)}","{str(d1)}");""")
-        cnt.commit()
-        # except:
-        #     print("Error!!")
-
-# add_download_count()
+def singleModelInfo( model_id_ ,month_later = 6 , year_later = 0 ):
+
+    today = date.today()
+    date_year = today.strftime("%Y")
+    date_month = today.strftime("%m")
+    month = int(date_month)
+    year = int(date_year)
+    model_info = c.execute(f'''SELECT *
+        FROM PersianNlp
+        WHERE model_id = "{model_id_}"''')
+
+    for model in model_info:
+        m = model
+    model_id = m[0]
+    last_modyfied = m[20]
+    Likes = m[12]
+    task = m[11]
+
+    downloadCountHistory = c.execute(f'''SELECT *
+        FROM downloadCountHistory
+        WHERE key_id = {model_id}''')
+
+    n=0
+    downloads_list = []
+    download_count_list = []
+    download_date_list = []
+    for model in downloadCountHistory :
+        if int(model[3].split("-")[1]) >= month-month_later and int(model[3].split("-")[2]) >= year-year_later :
+            download_count_list.append(model[2])
+            download_date_list.append(model[3])
+            downloads_list.append([model[3],model[2]])
+
+    growth_slope , lenM = find_growth_slope(download_count_list)
+
+    plt.plot(download_date_list,download_count_list, marker='o', linestyle='-')
+    plt.savefig('Download_rate_chart.png', dpi=300)
+
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.add_font('B Nazanin', '', '.\\fonts\\B Nazanin.ttf', uni=True)
+    pdf.set_font("Arial", size=12)
+
+    # Add the text
+    pdf.multi_cell(0, 10, f"Model -<< https://huggingface.co/{model_id_} >>- Information :")
+
+    pdf.ln()
+    pdf.multi_cell(0, 5, f"Download rate chart : ")
+    pdf.ln()
+    # Add the image
+    pdf.image("Download_rate_chart.png", x=10, y=pdf.get_y() + 5, w=70)
+    pdf.ln(20)  # one blank line after the image
+    # pdf.cell(0, 10, " Download history chart ", ln=True, align='C')
+
+    # Add the table
+    pdf.ln(50)
+
+    pdf.ln()
+    pdf.multi_cell(0, 5, f"Download rate table : ")
+    pdf.ln()
+    # Table header
+    for header in ['Date', 'Download-rate']:
+        pdf.cell(40, 10, header, 1, 0, 'C')
+    pdf.ln()
+    pdf.set_font("Arial", size=10)
+    # Data rows
+    for row in downloads_list:
+        for item in row:
+            pdf.cell(40, 10, str(item), 1, 0, 'C')
+        pdf.ln()
+
+    pdf.ln()
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id_)
+    tokens = tokenizer.tokenize(text)
+    print(tokens)
+    print(f'len(tokens): {len(tokens)}')
+    results = {'model': model_id_, 'len': len(tokens), 'tokens': tokens }
+    # results = str(results).encode('latin-1', 'replace').decode('latin-1')
+    # results = str(results).replace("▁","")
+    pdf.multi_cell(0, 3, f"Likes : {str(Likes)}")
+    pdf.ln()
+    pdf.multi_cell(0, 3, f"last_modyfied : {str(last_modyfied)}")
+    pdf.ln()
+    pdf.multi_cell(0, 3, f"Growth slope : {round(growth_slope, 2)} in {lenM} month .")
+    pdf.ln()
+    pdf.multi_cell(0, 7, f"task : {task}")
+    pdf.ln()
+    pdf.set_font("Arial", size=16)
+    pdf.multi_cell(0, 5, f"Tokenize info : ")
+    pdf.ln()
+    pdf.set_font("Arial", size=10)
+    pdf.multi_cell(0, 3, f"len : {str(results['len'])}")
+    pdf.ln()
+    pdf.multi_cell(0, 3, "tokenized list : ")
+    pdf.ln()
+    txt = ''
+    for token in results["tokens"]:
+        txt += f' [ {token} ] '
+    pdf.set_font("B Nazanin", size=10)
+    pdf.multi_cell(0, 5, f"{process_text_for_fpdf(txt)}",align='R')
+    pdf.output("singleModelInfo.pdf")
+    print("The PDF file was created with FPDF.")
+    os.remove("Download_rate_chart.png")
+
+# singleModelInfo("amberoad/bert-multilingual-passage-reranking-msmarco")
+
+
+# --- FastAPI and APScheduler integration section ---
+app = FastAPI()
+scheduler = BackgroundScheduler()
+
+# A function to fetch the latest ID before running the job
+# def get_last_id():
+#     try:
+#         last_id = c.execute("SELECT MAX(ID) FROM PersianNlp").fetchone()[0]
+#         return last_id if last_id is not None else 0
+#     except sqlite3.OperationalError:
+#         return 0
+
+# Define the scheduled job: run persian_model_finder on the last day of every month
+def scheduled_job():
+    idx = first_id
+    print(f"Starting the scheduled job. Last ID: {idx}")
+    persian_model_finder(nlp_task_list, idx)
+    print("The scheduled job finished successfully.")
+
+# scheduler.add_job(scheduled_job, 'cron', day='1', hour='0', minute='0')
+# scheduler.add_job(scheduled_job, 'interval', minutes=5)  # runs the job every few minutes; only for testing that it works, otherwise keep it commented out
+scheduler.add_job(scheduled_job, 'cron', day='last', hour='0', minute='0')  # schedule for the last day of every month
+
+# startup event: start the scheduler when the server comes up
+@app.on_event("startup")
+def startup_event():
+    print("startup event: the server is starting and the scheduler is being started.")
+    scheduler.start()
+    print("The scheduler is now active.")
+
+# shutdown event: stop the scheduler when the server goes down
+@app.on_event("shutdown")
+def shutdown_event():
+    print("shutdown event: the server is shutting down and the scheduler is being stopped.")
+    scheduler.shutdown()
+
+@app.get("/")
+def read_root():
+    return {"message": "FastAPI service is running and the scheduler is active."}
+
+@app.get("/run_monthly_job_now")
+def run_job_manually():
+    print("A request to run the monthly job manually was received...")
+    # Run the job in a separate thread to keep the server from blocking
+    threading.Thread(target=scheduled_job).start()
+    return {"message": "Monthly job has been triggered manually."}
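Note (not part of the diff): the INSERT INTO PersianNlp statement above interpolates every value into the SQL text with an f-string, which breaks as soon as a model name, tag list, or card field contains a double quote. sqlite3 placeholders sidestep both that and SQL injection; a hedged sketch, reusing list1 from the top of the script for the column order:

    # columns/placeholders/row are illustrative names, not from the diff.
    columns = ",".join(list1)
    placeholders = ",".join("?" for _ in list1)
    row = [idX if k == "ID" else model_data.get(k, "-") for k in list1]
    row = [v if isinstance(v, (int, float)) else str(v) for v in row]  # stringify lists etc.
    c.execute(f"INSERT INTO PersianNlp ({columns}) VALUES ({placeholders})", row)
    cnt.commit()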
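Note (not part of the diff): @app.on_event("startup") and @app.on_event("shutdown") still work but are deprecated in recent FastAPI releases in favor of a lifespan handler. A sketch of the same scheduler wiring in that style:

    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        scheduler.start()      # runs once at startup
        yield
        scheduler.shutdown()   # runs once at shutdown

    app = FastAPI(lifespan=lifespan)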