evaluate new NER models

ajokar 2025-07-17 20:35:40 +03:30
parent 4cf1d4d776
commit 1ffdd6b62c
9 changed files with 19279 additions and 2979 deletions


@@ -1,28 +1,32 @@
 from transformers import AutoTokenizer
 import json
-file = open('./data/models_info.json', 'r')
-models = json.load(file)
+file = open('./data/models_ner_info.txt', 'r')
+models = file.readlines()
+file.close()
+# Strips the newline character
 text = 'جمهوری موافقت‌نامه معاملات قانون بودجه اساسی قضائی بین‌المللی تأسیس منطقه‌ای لازم‌الاجراء دامپروری راه‌آهن کمیسیون‌های جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'
 results = []
 for line in models:
-    model_checkpoint = line['model_name']
+    model_checkpoint = line.split(' -- ')[1].rstrip('\n')
+    # model_checkpoint = line['model_name']
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
         print(model_checkpoint)
         tokens = tokenizer.tokenize(text)
         print(tokens)
-        results.append({ 'model': model_checkpoint, 'tokens': tokens})
-        if len(tokens) == 2 :
-            file.write( '{'+model_checkpoint + " : [ " + ','.join(tokens) + ' ] }\n' )
         #result = tokenizer(text)
         #print(result)
         #print(tokenizer.decode(result['input_ids']))
+        print(f'len(tokens): {len(tokens)}')
+        results.append({ 'model': model_checkpoint, 'len': len(tokens), 'tokens': tokens})
     except:
         error = "An exception occurred in tokenizer : " + model_checkpoint
         #file.write( error + '\n' )
         print(error)
     #tokenizer.save_pretrained(model_checkpoint+'-tokenizer')
-file.close()
 with open('./data/models_tokenizer_info.json', 'w', encoding='utf-8') as file:
     jsondata = json.dumps(results, ensure_ascii=False, indent=2)
     file.write(jsondata)
 print('finished!')
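The script above only compares tokenizations. To sanity-check what one of the listed checkpoints actually predicts on the test string, it can be loaded into a token-classification pipeline. A minimal sketch, not part of this commit; the checkpoint and aggregation strategy are illustrative choices:

from transformers import pipeline

# Illustrative check with one checkpoint from data/models_ner_info.txt.
ner = pipeline(
    "token-classification",
    model="HooshvareLab/bert-fa-zwnj-base-ner",
    aggregation_strategy="simple",  # merge word-piece predictions into whole entities
)

# Same Persian test string as in the script above.
sample = 'جمهوری موافقت‌نامه معاملات قانون بودجه اساسی قضائی بین‌المللی تأسیس منطقه‌ای لازم‌الاجراء دامپروری راه‌آهن کمیسیون‌های جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'
for entity in ner(sample):
    print(entity["entity_group"], entity["word"], round(entity["score"], 3))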


@@ -0,0 +1,8 @@
HooshvareLab/bert-base-parsbert-ner-uncased
HooshvareLab/bert-fa-base-uncased-ner-peyma
HooshvareLab/bert-base-parsbert-armanner-uncased
HooshvareLab/bert-fa-base-uncased-ner-arman
HooshvareLab/bert-base-parsbert-peymaner-uncased
nicolauduran45/affilgood-ner-multilingual-v2
Amirmerfan/bert-base-uncased-persian-ner-50k-base
AliFartout/Roberta-fa-en-ner

File diff suppressed because it is too large

10498 data/models_info0.json Normal file

File diff suppressed because it is too large

258 data/models_ner_info.json Normal file

@@ -0,0 +1,258 @@
[
{
"model_name": "HooshvareLab/bert-base-parsbert-ner-uncased",
"task": "token-classification",
"last_modified": "2021-5-18",
"downloads": 4060,
"likes": 5,
"language": "fa"
},
{
"model_name": "jplu/tf-xlm-r-ner-40-lang",
"task": "token-classification",
"last_modified": "2022-10-6",
"downloads": 874,
"likes": 27,
"language": "fa"
},
{
"model_name": "nicolauduran45/affilgood-ner-multilingual-v2",
"task": "token-classification",
"last_modified": "2025-4-16",
"downloads": 668,
"likes": 0,
"language": "fa"
},
{
"model_name": "HooshvareLab/bert-fa-zwnj-base-ner",
"task": "token-classification",
"last_modified": "2021-5-18",
"downloads": 407,
"likes": 4,
"language": "fa"
},
{
"model_name": "igorsterner/xlmr-multilingual-sentence-segmentation",
"task": "token-classification",
"last_modified": "2024-10-5",
"downloads": 256,
"likes": 4,
"language": "fa"
},
{
"model_name": "hamedkhaledi/persian-flair-ner",
"task": "token-classification",
"last_modified": "2022-4-3",
"downloads": 230,
"likes": 1,
"language": "fa"
},
{
"model_name": "HooshvareLab/bert-fa-base-uncased-ner-peyma",
"task": "token-classification",
"last_modified": "2021-5-18",
"downloads": 138,
"likes": 8,
"language": "fa"
},
{
"model_name": "mradermacher/persian-question-generator-GGUF",
"task": "Unknown",
"last_modified": "2024-12-8",
"downloads": 110,
"likes": 0,
"language": "fa"
},
{
"model_name": "HooshvareLab/bert-base-parsbert-armanner-uncased",
"task": "token-classification",
"last_modified": "2021-5-18",
"downloads": 103,
"likes": 3,
"language": "fa"
},
{
"model_name": "HooshvareLab/distilbert-fa-zwnj-base-ner",
"task": "token-classification",
"last_modified": "2021-3-21",
"downloads": 101,
"likes": 4,
"language": "fa"
},
{
"model_name": "HooshvareLab/roberta-fa-zwnj-base-ner",
"task": "token-classification",
"last_modified": "2021-5-20",
"downloads": 92,
"likes": 1,
"language": "fa"
},
{
"model_name": "HooshvareLab/bert-fa-base-uncased-ner-arman",
"task": "token-classification",
"last_modified": "2021-5-18",
"downloads": 71,
"likes": 0,
"language": "fa"
},
{
"model_name": "HooshvareLab/albert-fa-zwnj-base-v2-ner",
"task": "token-classification",
"last_modified": "2021-3-21",
"downloads": 58,
"likes": 0,
"language": "fa"
},
{
"model_name": "artificial-nerds/pmnet",
"task": "sentence-similarity",
"last_modified": "2024-10-29",
"downloads": 33,
"likes": 0,
"language": "fa"
},
{
"model_name": "HooshvareLab/bert-base-parsbert-peymaner-uncased",
"task": "token-classification",
"last_modified": "2021-5-18",
"downloads": 28,
"likes": 0,
"language": "fa"
},
{
"model_name": "Erfan/mT5-base_Farsi_Title_Generator",
"task": "text-generation",
"last_modified": "2022-1-30",
"downloads": 24,
"likes": 2,
"language": "fa"
},
{
"model_name": "myrkur/persian-question-generator",
"task": "summarization",
"last_modified": "2024-12-10",
"downloads": 24,
"likes": 2,
"language": "fa"
},
{
"model_name": "m3hrdadfi/albert-fa-base-v2-ner-arman",
"task": "token-classification",
"last_modified": "2020-12-26",
"downloads": 21,
"likes": 3,
"language": "fa"
},
{
"model_name": "hezarai/bert-fa-ner-arman",
"task": "token-classification",
"last_modified": "2024-11-14",
"downloads": 17,
"likes": 0,
"language": "fa"
},
{
"model_name": "m3hrdadfi/albert-fa-base-v2-ner-peyma",
"task": "token-classification",
"last_modified": "2020-12-26",
"downloads": 16,
"likes": 1,
"language": "fa"
},
{
"model_name": "Amirmerfan/bert-base-uncased-persian-ner-50k-base",
"task": "token-classification",
"last_modified": "2025-3-18",
"downloads": 16,
"likes": 0,
"language": "fa"
},
{
"model_name": "tum-nlp/neural-news-generator-bloomz-7b1-fa",
"task": "text-generation",
"last_modified": "2024-8-15",
"downloads": 14,
"likes": 0,
"language": "fa"
},
{
"model_name": "dadashzadeh/test-trainer-persian",
"task": "text-classification",
"last_modified": "2023-11-13",
"downloads": 11,
"likes": 0,
"language": "fa"
},
{
"model_name": "tum-nlp/neural-news-generator-llama-7b-fa",
"task": "text-generation",
"last_modified": "2024-8-15",
"downloads": 11,
"likes": 0,
"language": "fa"
},
{
"model_name": "AliFartout/Roberta-fa-en-ner",
"task": "token-classification",
"last_modified": "2023-9-1",
"downloads": 9,
"likes": 0,
"language": "fa"
},
{
"model_name": "myrkur/persian-title-generator",
"task": "summarization",
"last_modified": "2024-6-13",
"downloads": 9,
"likes": 2,
"language": "fa"
},
{
"model_name": "NLPclass/mt5-title-generation",
"task": "text-generation",
"last_modified": "2024-7-24",
"downloads": 9,
"likes": 0,
"language": "fa"
},
{
"model_name": "SinaRp/Question_generator_persian",
"task": "text-generation",
"last_modified": "2024-11-8",
"downloads": 9,
"likes": 1,
"language": "fa"
},
{
"model_name": "mansoorhamidzadeh/mt5-title-generation",
"task": "text-generation",
"last_modified": "2024-7-24",
"downloads": 8,
"likes": 0,
"language": "fa"
},
{
"model_name": "dz5035/bicleaner-ai-full-large-de-xx",
"task": "Unknown",
"last_modified": "2024-6-3",
"downloads": 4,
"likes": 0,
"language": "fa"
},
{
"model_name": "m0javad/conversational_Persian_paraphrase_generation",
"task": "Unknown",
"last_modified": "2024-2-17",
"downloads": 0,
"likes": 0,
"language": "fa"
},
{
"model_name": "PranavKeshav/event-planner-gemma-4bit",
"task": "text-generation",
"last_modified": "2025-5-17",
"downloads": 0,
"likes": 0,
"language": "fa"
}
]

17 data/models_ner_info.txt Normal file

@@ -0,0 +1,17 @@
4060 -- HooshvareLab/bert-base-parsbert-ner-uncased
874 -- jplu/tf-xlm-r-ner-40-lang
668 -- nicolauduran45/affilgood-ner-multilingual-v2
407 -- HooshvareLab/bert-fa-zwnj-base-ner
230 -- hamedkhaledi/persian-flair-ner
138 -- HooshvareLab/bert-fa-base-uncased-ner-peyma
103 -- HooshvareLab/bert-base-parsbert-armanner-uncased
101 -- HooshvareLab/distilbert-fa-zwnj-base-ner
92 -- HooshvareLab/roberta-fa-zwnj-base-ner
71 -- HooshvareLab/bert-fa-base-uncased-ner-arman
58 -- HooshvareLab/albert-fa-zwnj-base-v2-ner
28 -- HooshvareLab/bert-base-parsbert-peymaner-uncased
21 -- m3hrdadfi/albert-fa-base-v2-ner-arman
17 -- hezarai/bert-fa-ner-arman
16 -- m3hrdadfi/albert-fa-base-v2-ner-peyma
16 -- Amirmerfan/bert-base-uncased-persian-ner-50k-base
9 -- AliFartout/Roberta-fa-en-ner


@@ -0,0 +1,465 @@
[
{
"model": "HooshvareLab/bert-base-parsbert-ner-uncased",
"len": 20,
"tokens": [
"جمهوری",
"موافقتنامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"قضايی",
"بینالمللی",
"تاسیس",
"منطقهای",
"لازمالاجراء",
"دامپروری",
"راهاهن",
"کمیسیونهای",
"جدیدالاحداث",
"مسيول",
"فراورده",
"زايد",
"اسقاط",
"پنجساله"
]
},
{
"model": "nicolauduran45/affilgood-ner-multilingual-v2",
"len": 39,
"tokens": [
"▁جمهوری",
"▁موافقت",
"▁نامه",
"▁معاملات",
"▁قانون",
"▁بودجه",
"▁اساسی",
"▁قضا",
"ئی",
"▁بین",
"▁المللی",
"▁تأسیس",
"▁منطقه",
"▁ای",
"▁لازم",
"▁الاج",
"راء",
"▁دام",
"پر",
"وری",
"▁راه",
"▁آهن",
"▁کمیسیون",
"▁های",
"▁جدید",
"الا",
"حد",
"اث",
"▁مسئول",
"▁فر",
"آور",
"ده",
"▁زائد",
"▁اس",
"ق",
"اط",
"▁پنج",
"سال",
"ه"
]
},
{
"model": "HooshvareLab/bert-fa-zwnj-base-ner",
"len": 37,
"tokens": [
"جمهوری",
"موافقت",
"[ZWNJ]",
"نامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"[UNK]",
"بین",
"[ZWNJ]",
"المللی",
"[UNK]",
"منطقه",
"[ZWNJ]",
"ای",
"لازم",
"[ZWNJ]",
"الاجرا",
"##ء",
"دامپروری",
"راه",
"[ZWNJ]",
"آ",
"##هن",
"کمیسیون",
"[ZWNJ]",
"های",
"جدیدا",
"##لاح",
"##داث",
"[UNK]",
"[UNK]",
"[UNK]",
"اسقاط",
"پنج",
"##ساله"
]
},
{
"model": "HooshvareLab/bert-fa-base-uncased-ner-peyma",
"len": 20,
"tokens": [
"جمهوری",
"موافقتنامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"قضايی",
"بینالمللی",
"تاسیس",
"منطقهای",
"لازمالاجراء",
"دامپروری",
"راهاهن",
"کمیسیونهای",
"جدیدالاحداث",
"مسيول",
"فراورده",
"زايد",
"اسقاط",
"پنجساله"
]
},
{
"model": "HooshvareLab/bert-base-parsbert-armanner-uncased",
"len": 20,
"tokens": [
"جمهوری",
"موافقتنامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"قضايی",
"بینالمللی",
"تاسیس",
"منطقهای",
"لازمالاجراء",
"دامپروری",
"راهاهن",
"کمیسیونهای",
"جدیدالاحداث",
"مسيول",
"فراورده",
"زايد",
"اسقاط",
"پنجساله"
]
},
{
"model": "HooshvareLab/distilbert-fa-zwnj-base-ner",
"len": 37,
"tokens": [
"جمهوری",
"موافقت",
"[ZWNJ]",
"نامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"[UNK]",
"بین",
"[ZWNJ]",
"المللی",
"[UNK]",
"منطقه",
"[ZWNJ]",
"ای",
"لازم",
"[ZWNJ]",
"الاجرا",
"##ء",
"دامپروری",
"راه",
"[ZWNJ]",
"آ",
"##هن",
"کمیسیون",
"[ZWNJ]",
"های",
"جدیدا",
"##لاح",
"##داث",
"[UNK]",
"[UNK]",
"[UNK]",
"اسقاط",
"پنج",
"##ساله"
]
},
{
"model": "HooshvareLab/roberta-fa-zwnj-base-ner",
"len": 37,
"tokens": [
"ĠجÙħÙĩÙĪØ±ÛĮ",
"ĠÙħÙĪØ§ÙģÙĤت",
"âĢĮ",
"ÙĨاÙħÙĩ",
"ĠÙħعاÙħÙĦات",
"ĠÙĤاÙĨÙĪÙĨ",
"ĠبÙĪØ¯Ø¬Ùĩ",
"ĠاساسÛĮ",
"ĠÙĤضائÛĮ",
"ĠبÛĮÙĨ",
"âĢĮ",
"اÙĦÙħÙĦÙĦÛĮ",
"ĠتأسÛĮس",
"ĠÙħÙĨØ·ÙĤÙĩ",
"âĢĮ",
"اÛĮ",
"ĠÙĦازÙħ",
"âĢĮ",
"اÙĦاج",
"راء",
"ĠداÙħپرÙĪØ±ÛĮ",
"ĠراÙĩ",
"âĢĮ",
"Ø¢ÙĩÙĨ",
"ĠÚ©ÙħÛĮسÛĮÙĪÙĨ",
"âĢĮ",
"ÙĩاÛĮ",
"ĠجدÛĮد",
"اÙĦ",
"اØŃ",
"داث",
"ĠÙħسئÙĪÙĦ",
"ĠÙģØ±Ø¢ÙĪØ±Ø¯Ùĩ",
"Ġزائد",
"ĠاسÙĤاط",
"ĠÙ¾ÙĨج",
"ساÙĦÙĩ"
]
},
{
"model": "HooshvareLab/bert-fa-base-uncased-ner-arman",
"len": 20,
"tokens": [
"جمهوری",
"موافقتنامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"قضايی",
"بینالمللی",
"تاسیس",
"منطقهای",
"لازمالاجراء",
"دامپروری",
"راهاهن",
"کمیسیونهای",
"جدیدالاحداث",
"مسيول",
"فراورده",
"زايد",
"اسقاط",
"پنجساله"
]
},
{
"model": "HooshvareLab/albert-fa-zwnj-base-v2-ner",
"len": 46,
"tokens": [
"▁جمهوری",
"▁موافقت",
"[ZWNJ]",
"نامه",
"▁معاملات",
"▁قانون",
"▁بودجه",
"▁اساسی",
"▁قضا",
"ي",
"ی",
"▁بین",
"[ZWNJ]",
"الم",
"لل",
"ی",
"▁تاسیس",
"▁منطقه",
"[ZWNJ]",
"ای",
"▁لازم",
"[ZWNJ]",
"ال",
"اجرا",
"ء",
"▁دامپروری",
"▁راه",
"[ZWNJ]",
"اهن",
"▁کمیسیون",
"[ZWNJ]",
"های",
"▁جدید",
"الا",
"حد",
"اث",
"▁مس",
"ي",
"ول",
"▁فراورده",
"▁زا",
"ي",
"د",
"▁اسقاط",
"▁پنج",
"ساله"
]
},
{
"model": "HooshvareLab/bert-base-parsbert-peymaner-uncased",
"len": 20,
"tokens": [
"جمهوری",
"موافقتنامه",
"معاملات",
"قانون",
"بودجه",
"اساسی",
"قضايی",
"بینالمللی",
"تاسیس",
"منطقهای",
"لازمالاجراء",
"دامپروری",
"راهاهن",
"کمیسیونهای",
"جدیدالاحداث",
"مسيول",
"فراورده",
"زايد",
"اسقاط",
"پنجساله"
]
},
{
"model": "Amirmerfan/bert-base-uncased-persian-ner-50k-base",
"len": 56,
"tokens": [
"جمهوری",
"م",
"##وا",
"##فق",
"##تن",
"##ام",
"##ه",
"مع",
"##امل",
"##ات",
"قانون",
"بود",
"##جه",
"اساسی",
"ق",
"##ضا",
"##يی",
"بینالمللی",
"تاسیس",
"منطقه",
"##ای",
"لازم",
"##ال",
"##اج",
"##راء",
"دا",
"##م",
"##پر",
"##وری",
"راه",
"##اه",
"##ن",
"کمی",
"##سی",
"##ون",
"##های",
"جدید",
"##ال",
"##اح",
"##دا",
"##ث",
"م",
"##سي",
"##ول",
"ف",
"##را",
"##ورد",
"##ه",
"ز",
"##ايد",
"اس",
"##قا",
"##ط",
"پنج",
"##سال",
"##ه"
]
},
{
"model": "AliFartout/Roberta-fa-en-ner",
"len": 39,
"tokens": [
"▁جمهوری",
"▁موافقت",
"▁نامه",
"▁معاملات",
"▁قانون",
"▁بودجه",
"▁اساسی",
"▁قضا",
"ئی",
"▁بین",
"▁المللی",
"▁تأسیس",
"▁منطقه",
"▁ای",
"▁لازم",
"▁الاج",
"راء",
"▁دام",
"پر",
"وری",
"▁راه",
"▁آهن",
"▁کمیسیون",
"▁های",
"▁جدید",
"الا",
"حد",
"اث",
"▁مسئول",
"▁فر",
"آور",
"ده",
"▁زائد",
"▁اس",
"ق",
"اط",
"▁پنج",
"سال",
"ه"
]
}
]
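
With the tokenization results above written to ./data/models_tokenizer_info.json, the candidates can be ranked by how finely they split the test string; fewer pieces and fewer [UNK] tokens usually indicate better native coverage of Persian orthography, though this says nothing about NER accuracy. A small follow-up sketch under that assumption:

import json

# Rank the evaluated tokenizers by token count and unknown-token count.
with open('./data/models_tokenizer_info.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

for entry in sorted(results, key=lambda r: r['len']):
    unknowns = sum(1 for token in entry['tokens'] if token == '[UNK]')
    print(f"{entry['len']:3d} tokens, {unknowns} [UNK]  --  {entry['model']}")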


@@ -1,7 +1,11 @@
-from huggingface_hub import HfApi, ModelFilter
+from huggingface_hub import HfApi
+from huggingface_hub import ModelCard
+from huggingface_hub import ModelFilter
 import json
 api = HfApi()
 # card = ModelCard.load('model-name')
 # Persian language tags
 languages = ['pes', 'fas', 'fa']  # ISO 639 codes for Persian: 'fa' (639-1), 'fas' (639-2/3), 'pes' (Iranian Persian)
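In newer huggingface_hub releases, where ModelFilter is deprecated, the same Persian token-classification query can be expressed directly through keyword arguments on list_models. A hedged sketch; exact parameter support depends on the installed huggingface_hub version:

from huggingface_hub import HfApi

api = HfApi()

# Illustrative query: Persian token-classification models, most downloaded first.
for model in api.list_models(
    language="fa",
    task="token-classification",
    sort="downloads",
    direction=-1,
    limit=20,
):
    print(model.downloads, model.id)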

30 model_filter.py Normal file

@@ -0,0 +1,30 @@
import json

with open('./data/models_info.json', 'r', encoding='utf-8') as file:
    models = json.load(file)

ner_models = []
for mdl in models:
    # Keep models whose name mentions NER in any common casing.
    name = mdl['model_name']
    if 'ner' in name or 'Ner' in name or 'NER' in name:
        ner_models.append(mdl)
        # [mdl['model_name']] = {
        #     "task": mdl['task'],
        #     "last_modified": mdl['last_modified'],
        #     "downloads": int(mdl['downloads']),
        #     "likes": mdl['likes'],
        #     "language": mdl['language']
        # }

sorted_ner_models = sorted(ner_models, key=lambda x: x["downloads"], reverse=True)

with open('./data/models_ner_info.json', 'w', encoding='utf-8') as file:
    jsondata = json.dumps(sorted_ner_models, ensure_ascii=False, indent=2)
    file.write(jsondata)

ner_models_text = ''
for mdl in sorted_ner_models:
    ner_models_text += f"{mdl['downloads']} -- {mdl['model_name']}\n"

with open('./data/models_ner_info.txt', 'w', encoding='utf-8') as file:
    file.write(ner_models_text)

print(len(ner_models))
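
model_filter.py matches on the model name alone, which misses NER checkpoints that do not mention "ner" in their id. Since every record in models_info.json already carries a task field (visible in data/models_ner_info.json above), a complementary filter on the declared task is possible; a small optional sketch under that assumption:

import json

with open('./data/models_info.json', 'r', encoding='utf-8') as file:
    models = json.load(file)

# Alternative filter: rely on the declared task rather than the model name.
token_clf = [m for m in models if m.get('task') == 'token-classification']
token_clf.sort(key=lambda m: m['downloads'], reverse=True)

for m in token_clf[:10]:
    print(f"{m['downloads']} -- {m['model_name']}")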