Flair_NER/check_tokenizer.py

from transformers import AutoTokenizer
import os
from general_functions import normalize_content

# Base path to the Flair_NER folder (used for the tokenizer list and results files).
local_address = './jokar/Flair_NER/'

def read_file():
    """Read the law corpus that every tokenizer is tested against."""
    address = os.getcwd() + '/jokar/Flair_NER/data/law.txt'
    text = ''
    with open(address, 'r', encoding='utf-8') as file:
        try:
            text = str(file.read())
        except Exception:
            # Tolerate an unreadable file and fall back to an empty string.
            pass
    return text
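
# By default only one checkpoint is tested; uncommenting the readlines() block
# below would sweep every model listed in farsi_tokenizers.txt instead.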
lines = []
# file = open(local_address + 'farsi_tokenizers.txt', 'r')
# lines = file.readlines()
file = open(local_address + 'farsi_tokenizers_result.txt', 'w', encoding='utf-8')
lines.append('orgcatorg/xlm-v-base-ner')
text = read_file()
text = normalize_content(text)
# token_c = text.split()
# print(str(len(token_c)))
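
# The same normalized text is fed to every tokenizer, so the token counts are comparable.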
for index, line in enumerate(lines):
    # Entries loaded from the tokenizer list may look like '{model_name: ...}',
    # so keep only the model name and strip braces, whitespace, and the trailing newline.
    model_checkpoint = line.split(':')[0]
    model_checkpoint = model_checkpoint.strip('{')
    model_checkpoint = model_checkpoint.strip()
    # Only check NER-related models
    # if not ('ner' in model_checkpoint or 'NER' in model_checkpoint):
    #     continue
    print(str(index + 1) + ' => ' + model_checkpoint)
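    # Downloading a checkpoint can fail (missing repo, gated model, no tokenizer
    # files), so guard each attempt and keep going on errors.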
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        # text = 'فسخ معاملات'  # short sample phrase ("rescission of transactions")
        tokens = tokenizer.tokenize(text)
        # if len(tokens) == 2:
        token_list = ''
        for token in tokens:
            token_list = token_list + '\n' + token
        result = '\n\n' + 'token counts: ' + str(len(tokens)) + token_list
        file.write('*' * 100 + '\n' + str(index + 1) + ' - ' + model_checkpoint + ' : ' + result.strip())
        # file.close()
        # result = tokenizer(text)
        # print(result)
        # print(tokenizer.decode(result['input_ids']))
    except Exception as e:
        error = 'An exception occurred in tokenizer : ' + model_checkpoint
        # file.write(error + '\n')
        print(error)
        print(e)

# tokenizer.save_pretrained(model_checkpoint + '-tokenizer')
file.close()
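
# Usage note: the script builds paths from os.getcwd() + '/jokar/Flair_NER/', so it
# must be run from the directory that contains the jokar/ folder. To compare more
# tokenizers in one run, append additional Hugging Face model ids before the loop,
# e.g. (example id, substitute any tokenizer available on the Hub):
#   lines.append('HooshvareLab/bert-base-parsbert-uncased')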