From ae1a9b11c0a0f28eb1b0e1c34c2e134787daac34 Mon Sep 17 00:00:00 2001 From: ajokar Date: Tue, 15 Jul 2025 17:39:08 +0330 Subject: [PATCH] send editions to git --- check_tokenizers.py | 2 +- .../huggingface-fa-models.txt | 0 models_info.json => data/models_info.json | 0 .../tokenizer-fa-models-logs.txt | 0 get_persian_nlp_models.py | 13 ++++++------- 5 files changed, 7 insertions(+), 8 deletions(-) rename huggingface-fa-models.txt => data/huggingface-fa-models.txt (100%) rename models_info.json => data/models_info.json (100%) rename fokenizer-fa-models-logs.txt => data/tokenizer-fa-models-logs.txt (100%) diff --git a/check_tokenizers.py b/check_tokenizers.py index 573a505..b4e9322 100644 --- a/check_tokenizers.py +++ b/check_tokenizers.py @@ -1,7 +1,7 @@ from transformers import AutoTokenizer import json -file = open('models_info.json', 'r') +file = open('./data/models_info.json', 'r') models = json.load(file) # Strips the newline character diff --git a/huggingface-fa-models.txt b/data/huggingface-fa-models.txt similarity index 100% rename from huggingface-fa-models.txt rename to data/huggingface-fa-models.txt diff --git a/models_info.json b/data/models_info.json similarity index 100% rename from models_info.json rename to data/models_info.json diff --git a/fokenizer-fa-models-logs.txt b/data/tokenizer-fa-models-logs.txt similarity index 100% rename from fokenizer-fa-models-logs.txt rename to data/tokenizer-fa-models-logs.txt diff --git a/get_persian_nlp_models.py b/get_persian_nlp_models.py index 2f5d9ba..619b846 100644 --- a/get_persian_nlp_models.py +++ b/get_persian_nlp_models.py @@ -1,16 +1,15 @@ from huggingface_hub import HfApi, ModelFilter import json -# Initialize the Hugging Face API api = HfApi() -# Define the languages you're interested in +# Persian language tags languages = ['pes', 'fas', 'fa'] # Language codes for Persian, Pashto, and Turkish nlp_tasks = ['text-classification', 
'token-classification','table-question-answering','question-answering','zero-shot-classification','translation','summarization','feature-extraction','text-generation','text2text-generation','fill-mask','sentence-similarity'] -# Initialize a list to store the model information +# List of model information models_info = [] -# Iterate over each language +# Iterate over the target languages for lang in languages: # Filter models by language models = api.list_models(filter=ModelFilter(language=lang)) @@ -30,8 +29,8 @@ for lang in languages: } models_info.append(model_info) -# Save the collected data to a JSON file -with open('models_info.json', 'w', encoding='utf-8') as f: +# Save models_info +with open('./data/models_info.json', 'w', encoding='utf-8') as f: json.dump(models_info, f, ensure_ascii=False, indent=4) -print("Data collection complete. Saved to models_info.json") \ No newline at end of file +print("Finished!") \ No newline at end of file