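"""Compare how different Hugging Face tokenizers segment a Persian legal text.

Reads data/law.txt, normalizes it, tokenizes it with every checkpoint listed
in `lines`, and writes each model's token count and token list to
farsi_tokenizers_result.txt.
"""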
from transformers import AutoTokenizer
import os

from general_functions import normalize_content

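# `normalize_content` comes from the local `general_functions` module, which
# is not shown here. A minimal sketch of such a Persian normalizer, assuming
# the hazm library (an assumption, not the project's actual implementation):
#
#     from hazm import Normalizer
#
#     def normalize_content(text):
#         # Unify character variants, spacing, and diacritics.
#         return Normalizer().normalize(text)
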
local_address = './jokar/Flair_NER/'

def read_file():
    address = os.getcwd() + '/jokar/Flair_NER/data/law.txt'
    with open(address, 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except Exception:
            pass
        return text


lines = []
# file = open(local_address + 'farsi_tokenizers.txt', 'r')
# lines = file.readlines()

file = open(local_address + 'farsi_tokenizers_result.txt', 'w', encoding='utf-8')

|
lines.append('orgcatorg/xlm-v-base-ner')
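# More checkpoints can be appended the same way; for example (illustrative,
# any tokenizer checkpoint on the Hugging Face Hub works):
# lines.append('HooshvareLab/bert-fa-base-uncased')
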
text = read_file()
text = normalize_content(text)
# token_c = text.split()
# print(str(len(token_c)))

# Strip decoration, whitespace, and the trailing newline from each entry.
for index, line in enumerate(lines):
    model_checkpoint = line.split(':')[0]
    model_checkpoint = model_checkpoint.strip('{')
    model_checkpoint = model_checkpoint.strip()
    # Only check models related to NER
    # if not ('ner' in model_checkpoint or 'NER' in model_checkpoint):
    #     continue
    print(str(index + 1) + ' => ' + model_checkpoint)
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

        # text = 'فسخ معاملات'  # "rescission of transactions" (short sample text)
        tokens = tokenizer.tokenize(text)

        # if len(tokens) == 2:
        token_list = ''
        for token in tokens:
            token_list = token_list + '\n' + token
        result = '\n\n' + 'token counts: ' + str(len(tokens)) + token_list
        file.write('*' * 100 + '\n' + str(index + 1) + ' - ' + model_checkpoint + ' : ' + result.strip())
        # file.close()

        # result = tokenizer(text)
        # print(result)
        # print(tokenizer.decode(result['input_ids']))
    except Exception as e:
        error = 'An exception occurred in tokenizer : ' + model_checkpoint
        # file.write(error + '\n')
        print(error)
        print(e)
    # tokenizer.save_pretrained(model_checkpoint + '-tokenizer')

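# Each entry written to farsi_tokenizers_result.txt is a row of 100 asterisks,
# then "<index> - <checkpoint> : token counts: <N>", then one token per line.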
file.close()