Flair_NER/check_tokenizer.py

from transformers import AutoTokenizer
import os
from general_functions import normalize_content

# Base path to the Flair_NER folder (used for the tokenizer list and results files).
local_address = './jokar/Flair_NER/'

def read_file():
    """Read the law corpus that every tokenizer is tested against."""
    address = os.getcwd() + '/jokar/Flair_NER/data/law.txt'
    text = ''
    with open(address, 'r', encoding='utf-8') as file:
        try:
            text = str(file.read())
        except Exception:
            # Tolerate an unreadable file and fall back to an empty string.
            pass
    return text
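
# By default only one checkpoint is tested; uncommenting the readlines() block
# below would sweep every model listed in farsi_tokenizers.txt instead.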
lines = []
# file = open(local_address + 'farsi_tokenizers.txt', 'r')
# lines = file.readlines()
file = open(local_address + 'farsi_tokenizers_result.txt', 'w', encoding='utf-8')
lines.append('orgcatorg/xlm-v-base-ner')
text = read_file()
text = normalize_content(text)
# token_c = text.split()
# print(str(len(token_c)))
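
# The same normalized text is fed to every tokenizer, so the token counts are comparable.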
for index, line in enumerate(lines):
    # Entries loaded from the tokenizer list may look like '{model_name: ...}',
    # so keep only the model name and strip braces, whitespace, and the trailing newline.
    model_checkpoint = line.split(':')[0]
    model_checkpoint = model_checkpoint.strip('{')
    model_checkpoint = model_checkpoint.strip()
    # Only check NER-related models
    # if not ('ner' in model_checkpoint or 'NER' in model_checkpoint):
    #     continue
    print(str(index + 1) + ' => ' + model_checkpoint)
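    # Downloading a checkpoint can fail (missing repo, gated model, no tokenizer
    # files), so guard each attempt and keep going on errors.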
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        # text = 'فسخ معاملات'  # short sample phrase ("rescission of transactions")
        tokens = tokenizer.tokenize(text)
        # if len(tokens) == 2:
        token_list = ''
        for token in tokens:
            token_list = token_list + '\n' + token
        result = '\n\n' + 'token counts: ' + str(len(tokens)) + token_list
        file.write('*' * 100 + '\n' + str(index + 1) + ' - ' + model_checkpoint + ' : ' + result.strip())
        # file.close()
        # result = tokenizer(text)
        # print(result)
        # print(tokenizer.decode(result['input_ids']))
    except Exception as e:
        error = 'An exception occurred in tokenizer : ' + model_checkpoint
        # file.write(error + '\n')
        print(error)
        print(e)

# tokenizer.save_pretrained(model_checkpoint + '-tokenizer')
file.close()
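
# Usage note: the script builds paths from os.getcwd() + '/jokar/Flair_NER/', so it
# must be run from the directory that contains the jokar/ folder. To compare more
# tokenizers in one run, append additional Hugging Face model ids before the loop,
# e.g. (example id, substitute any tokenizer available on the Hub):
#   lines.append('HooshvareLab/bert-base-parsbert-uncased')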