# Nlp_models/check_tokenizers.py

from transformers import AutoTokenizer
import json

# Load the list of models to check, one entry per line. The context manager
# guarantees the handle is closed even if reading fails part-way.
with open('./data/models_ner_info.txt', 'r') as models_file:
    models = models_file.readlines()

# Fixed sample text fed to every tokenizer so that the resulting token lists
# and token counts can be compared across models.
text = 'جمهوری موافقت‌نامه معاملات قانون بودجه اساسی قضائی بین‌المللی تأسیس منطقه‌ای لازم‌الاجراء دامپروری راه‌آهن کمیسیون‌های جدیدالاحداث مسئول فرآورده زائد اسقاط پنجساله'

# One dict per successfully loaded tokenizer: model name, token count, tokens.
results = []
for line in models:
    # The checkpoint name is the second ' -- '-separated field of the line.
    fields = line.split(' -- ')
    if len(fields) < 2:
        # Blank or malformed lines (e.g. a trailing empty line) carry no
        # checkpoint name; skip them instead of crashing with IndexError.
        if line.strip():
            print("Skipping malformed line : " + line.strip())
        continue

    # Drop the trailing newline and any stray surrounding whitespace, which
    # would otherwise become part of the checkpoint identifier.
    model_checkpoint = fields[1].strip()

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        tokens = tokenizer.tokenize(text)
    except Exception as exc:
        # Best-effort: one broken checkpoint must not abort the whole run.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
        error = "An exception occurred in tokenizer : " + model_checkpoint
        print(error)
        print(f'  reason: {exc!r}')
        continue

    print(model_checkpoint)
    print(tokens)
    print(f'len(tokens): {len(tokens)}')
    results.append({'model': model_checkpoint, 'len': len(tokens), 'tokens': tokens})

# Persist the collected tokenizer results as human-readable JSON.
# ensure_ascii=False keeps non-ASCII tokens as-is instead of \uXXXX escapes.
with open('./data/models_tokenizer_info.json', 'w', encoding='utf-8') as out_file:
    payload = json.dumps(
        results,
        ensure_ascii=False,
        indent=2,
    )
    out_file.write(payload)

print('finished!')