import os

from transformers import AutoTokenizer

from general_functions import normalize_content

local_address = './jokar/Flair_NER/'


def read_file():
    """Read the Persian legal corpus used to exercise each tokenizer."""
    address = os.path.join(os.getcwd(), 'jokar', 'Flair_NER', 'data', 'law.txt')
    try:
        with open(address, 'r', encoding='utf-8') as f:
            return f.read()
    except OSError as e:
        print('Could not read corpus: ' + str(e))
        return ''


# Tokenizer checkpoints to evaluate. Alternatively, load them from disk:
# with open(local_address + 'farsi_tokenizers.txt', 'r', encoding='utf-8') as f:
#     lines = f.readlines()
lines = ['orgcatorg/xlm-v-base-ner']

text = normalize_content(read_file())
# Optional: whitespace token count, for comparison with the subword counts.
# print(len(text.split()))

with open(local_address + 'farsi_tokenizers_result.txt', 'w', encoding='utf-8') as out_file:
    for index, line in enumerate(lines):
        # Entries read from disk may look like '{checkpoint: ...}', so keep
        # only the checkpoint name, stripping braces, whitespace, and newlines.
        model_checkpoint = line.split(':')[0].strip('{').strip()

        # Only check NER-related models:
        # if 'ner' not in model_checkpoint.lower():
        #     continue

        print(str(index + 1) + ' => ' + model_checkpoint)
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
            # text = 'فسخ معاملات'  # short sample phrase ("rescission of transactions")
            tokens = tokenizer.tokenize(text)
            token_list = '\n'.join(tokens)
            result = 'token counts: ' + str(len(tokens)) + '\n' + token_list
            out_file.write('*' * 100 + '\n' + str(index + 1) + ' - '
                           + model_checkpoint + ' : ' + result)
            # Alternatively, inspect the full encoding round-trip:
            # encoded = tokenizer(text)
            # print(tokenizer.decode(encoded['input_ids']))
            # tokenizer.save_pretrained(model_checkpoint + '-tokenizer')
        except Exception as e:
            print('An exception occurred in tokenizer : ' + model_checkpoint)
            print(e)
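# ---------------------------------------------------------------------------
# Note: general_functions.normalize_content is imported above, but its
# implementation is not part of this file. The commented-out sketch below is
# a hypothetical stand-in, assuming the helper performs standard Persian text
# normalization (unifying Arabic/Persian character variants, fixing
# half-spaces) via the hazm library; the real helper may differ.
#
# from hazm import Normalizer
#
# _normalizer = Normalizer()
#
# def normalize_content(text: str) -> str:
#     # Maps Arabic 'ي'/'ك' to Persian 'ی'/'ک' and normalizes spacing and
#     # half-spaces, so every tokenizer sees the same canonical text.
#     return _normalizer.normalize(text)
# ---------------------------------------------------------------------------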