# -*- coding: utf-8 -*-
"""Fine-tune a transformer-based Flair NER tagger, then log, evaluate and test it.

Original file is located at
    https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1

The script runs top to bottom:

1. load the column-formatted corpus from ``./data/`` and build the label dictionary;
2. wrap the selected Hugging Face checkpoint in ``TransformerWordEmbeddings``;
3. build a ``SequenceTagger`` (no CRF, no RNN, no reprojection) and fine-tune it,
   writing trainer output to ``./taggers``;
4. save the hyper-parameters to ``./trained/training.json``;
5. call ``evaluate_model.do_evaluate`` and append a report to
   ``./data/train_log.txt``;
6. call ``inference.inference_main`` as a final smoke test.

Any failure in steps 1-4 is printed and terminates the process with a non-zero
exit status. A failure in step 5 is recorded in the report instead.

Colab setup (run once in a notebook cell):
    !pip install flair
    !pip install 'flair==0.10'
"""

import datetime
import json
import os
import sys
from pathlib import Path

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

from funcs import save_to_file_by_address
# from funcs import remove_signs
# from inference import main


def _describe_error(exc: BaseException) -> str:
    """Return a printable description of *exc*.

    Uses the first exception argument when there is one (the text the script
    has always printed). Exceptions raised without arguments have an empty
    ``args`` tuple, so indexing it would raise ``IndexError`` inside the
    handler; ``repr`` is used for those instead.
    """
    return str(exc.args[0]) if exc.args else repr(exc)


def _abort(exc: BaseException) -> None:
    """Print why the run failed and stop the process with exit status 1."""
    print(_describe_error(exc))
    # sys.exit is always available (the bare ``exit`` helper is injected by
    # ``site`` for interactive use) and a non-zero status signals the failure.
    sys.exit(1)


# --------------------------------------------------------------------------
# Hyper-parameters
# --------------------------------------------------------------------------
learning_rate = 0.4e-4  # tried range: 0.65e-4 - 0.4e-4
mini_batch_size = 10
max_epochs = 200

# --------------------------------------------------------------------------
# Base checkpoint. Previously tried candidates are kept with their outcome.
# --------------------------------------------------------------------------
# model = os.getcwd() + "\\data\\final-model.pt"
# model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner"  # first model tested; reached roughly 70% in the latest training run
# model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
# model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
# model = "m3hrdadfi/albert-fa-base-v2-ner-arman"  # 111111111111111
# model = "Helsinki-NLP/opus-mt-tc-big-fa-itc"  # raises an error; not trained for NER
# model = "zedfum/arman-longformer-8k-finetuned-ensani"  # raises an error; not trained for NER
# model = "AliGhiasvand86/gisha_qa"  # raises an error; not trained for NER
model = "orgcatorg/xlm-v-base-ner"  # best Persian tokenizer ***********************
# model = "pourmand1376/NER_Farsi"
# model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # **** worked well
# model = "SeyedAli/Persian-Text-NER-Bert-V1"  # ***** worked very well
# model = "HooshvareLab/bert-base-parsbert-peymaner-uncased"  # not impressive
# model = "HooshvareLab/bert-base-parsbert-armanner-uncased"  # not impressive
# model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # not impressive

print(model)
print('#' * 50)
print('#' * 50)

# --------------------------------------------------------------------------
# Corpus
# --------------------------------------------------------------------------
# import matplotlib.pyplot as plt

# Column layout of the training file: token in column 0, NER tag in column 1.
columns = {0: 'text', 1: 'ner'}

# Directory where the data resides.
data_folder = './data/'

# Only a train file is given. Other inputs tried earlier:
#   train_file='peyma_train.txt', train_file='law_dataset.txt',
#   test_file='test_gold.txt', dev_file='dev split 2.txt',
#   max_sentence_length=500
corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='DATASET.txt',  # qavanin 36K tokens
)

# Tag to predict.
tag_type = 'ner'

# Make the tag dictionary from the corpus.
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

# --------------------------------------------------------------------------
# Embeddings
# --------------------------------------------------------------------------
# Earlier candidates: xlm-roberta-large,
# 'HooshvareLab/distilbert-fa-zwnj-base-ner'.
# NOTE(review): from_tf=True is kept from the original call; confirm the
# selected checkpoint actually needs TensorFlow weights.
embeddings = TransformerWordEmbeddings(
    model=model,
    layers="-1",
    subtoken_pooling="first",
    # pooling='mean',
    fine_tune=True,
    use_context=True,
    from_tf=True,
    allow_long_sentences=True,
    # model_max_length=512,
)

print('model read successfully !')
print('#' * 50)
print('#' * 50)

# --------------------------------------------------------------------------
# Tagger and trainer
# --------------------------------------------------------------------------
try:
    tagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )
except Exception as e:
    _abort(e)

try:
    trainer = ModelTrainer(tagger, corpus)
    # Earlier run, kept for reference (output dir resources/taggers/sota-ner-flert):
    # trainer.fine_tune('./taggers',
    #                   learning_rate=2.0e-6,
    #                   mini_batch_size=16,
    #                   # mini_batch_chunk_size=1,  # remove this parameter to speed up computation if you have a big GPU
    #                   max_epochs=20
    #                   )
except Exception as e:
    _abort(e)

try:
    result = trainer.fine_tune(
        './taggers',
        learning_rate=learning_rate,
        mini_batch_size=mini_batch_size,
        max_epochs=max_epochs,
    )
except Exception as e:
    _abort(e)

# --------------------------------------------------------------------------
# Persist the hyper-parameters next to the trained artefacts
# --------------------------------------------------------------------------
try:
    # Save the model's state dictionary (configuration + weights)
    # model_state_dict_path = Path('./trained/best-model.pt')  # Assuming best model is saved here
    # tagger.save(model_state_dict_path)

    # Optionally, save additional hyperparameters to a separate file (e.g., training.json)
    hyperparameters = {
        "learning_rate": learning_rate,
        "mini_batch_size": mini_batch_size,
        "max_epochs": max_epochs,
    }
    training_json = Path('./trained/training.json')
    # Nothing earlier in this script creates ./trained, so make sure it exists
    # rather than losing a finished training run to a missing directory.
    training_json.parent.mkdir(parents=True, exist_ok=True)
    with open(training_json, 'w', encoding='utf-8') as f:
        json.dump(hyperparameters, f, indent=4)
except Exception as e:
    # Previously this exited silently with status 0; report the cause instead.
    _abort(e)

# Report fragment for the training step. The text layout is reproduced
# exactly as it was (single spaces between the parts).
train_result = (
    "************************************************\n"
    " ##### TRAIN RESULT ##### "
    f"F1 Score: {result} "
    "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
)

# Run inference to evaluate the model (disabled):
# time = datetime.datetime.now()
# tagger.save('./trained/trained-model ' + str(time) + '.pt')

print('#' * 70)
print(' ********** fine-tune operation finished ********** ')
operation_time = datetime.datetime.now()
print(f' ********** {operation_time} ********** ')
print('#' * 70)

# ###################################################
# Evaluate the trained model
try:
    # Imported here so that a broken evaluate_model module is reported in the
    # log like any other evaluation failure instead of stopping the script.
    from evaluate_model import do_evaluate
    print(' Try to evaluating the trained model! ')
    evaluate_result = do_evaluate()
    print(' Evaluating finished! ')
except Exception as e:
    print('do_evaluate function failed')
    evaluate_result = (
        "do_evaluate function failed! "
        f"error massage: {_describe_error(e)}"
    )

final_result = (
    f"Model Name: {model} "
    f"Fine-Tune Parameters: {hyperparameters} "
    f"{train_result} "
    f"{evaluate_result}\n"
    f" Fine_Tune time: {operation_time} "
    "------------------------------------------------------------------------------------ "
    "------------------------------------------------------------------------------------\n "
)
save_to_file_by_address('./data/train_log.txt', final_result)
print(' Saving results finished! ')

# ###################################################
# Test the model on a specified input
print(' Try to test trained model! ')
from inference import inference_main
inference_main(model, '')
print(' Testing model finished! ')