# -*- coding: utf-8 -*-
"""Fine-tune a Flair NER sequence tagger on a column-formatted corpus.

Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1

The script runs top to bottom:

1. read the training corpus from ``./data/DATASET.txt``;
2. wrap the selected transformer in ``TransformerWordEmbeddings``;
3. build a ``SequenceTagger`` (no CRF, no RNN) and fine-tune it into
   ``./taggers``;
4. write the hyperparameters to ``./trained/training.json``;
5. plot the training log, evaluate the model, store a report through
   ``save_to_file_by_address`` and finally run ``inference_main``.

Any failure in steps 1-4 is reported on stdout and ends the process with a
non-zero exit status.
"""

import datetime
import json
import os
from pathlib import Path

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

from funcs import save_to_file_by_address

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
learning_rate = 0.65e-4  # values tried so far: 0.65e-4 - 0.4e-4
mini_batch_size = 8
max_epochs = 200

# ---------------------------------------------------------------------------
# Base model selection
# ---------------------------------------------------------------------------
# Notes on the candidates that were tried (kept as an experiment log):
#   os.getcwd() + "\\data\\final-model.pt"
#   os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner"
#       first model that was tested; worked well up to roughly 70 percent
#       in the last training run
#   os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
#   "HooshvareLab/bert-fa-base-uncased-ner-peyma"
#   "Helsinki-NLP/opus-mt-tc-big-fa-itc"            raises an error, not trained for NER
#   "zedfum/arman-longformer-8k-finetuned-ensani"   raises an error, not trained for NER
#   "AliGhiasvand86/gisha_qa"                       raises an error, not trained for NER
#   "orgcatorg/xlm-v-base-ner"
#       best Persian tokenizer; the latest model used in the Qanun-Yar
#       system was trained from this one
#   "pourmand1376/NER_Farsi"
#   "HooshvareLab/bert-base-parsbert-ner-uncased"   worked well in one run, unimpressive in another
#   "SeyedAli/Persian-Text-NER-Bert-V1"             worked very well
#   "HooshvareLab/bert-base-parsbert-peymaner-uncased"   was not impressive
#   "HooshvareLab/bert-base-parsbert-armanner-uncased"   was not impressive
model = "PooryaPiroozfar/Flair-Persian-NER"  # currently selected


def _error_text(error):
    """Return the exception's first argument as text.

    Falls back to ``repr(error)`` when the exception carries no arguments,
    so that building the message can never raise ``IndexError`` itself.
    """
    return str(error.args[0]) if error.args else repr(error)


def _abort(stage, error):
    """Report a fatal failure in *stage* and stop with exit status 1.

    Raises:
        SystemExit: always, with status 1, so wrapping shell scripts can
            tell a failed run from a successful one.
    """
    print(f'{stage} failed: {_error_text(error)}')
    raise SystemExit(1)


print(model)
print('#'*50)
print('#'*50)

# The original notebook installed flair with: pip install 'flair==0.10'

# ---------------------------------------------------------------------------
# Corpus
# ---------------------------------------------------------------------------
# Column layout of the training file: token in column 0, NER tag in column 1.
columns = {0: 'text', 1: 'ner'}
# Directory where the data resides.
data_folder = './data/'
# Set the dataset file name here. Other files used previously:
# 'peyma_train.txt', 'law_dataset.txt' (with test_file='test_gold.txt',
# dev_file='dev split 2.txt').
corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='DATASET.txt',  # qavanin 36K tokens
)

# Tag to predict, and the label dictionary built from the corpus.
tag_type = 'ner'
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

# ---------------------------------------------------------------------------
# Embeddings
# ---------------------------------------------------------------------------
embeddings = TransformerWordEmbeddings(
    model=model,
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=True,
    from_tf=True,
    allow_long_sentences=True,
)

print('model read successfully !')
print('#'*50)
print('#'*50)

# ---------------------------------------------------------------------------
# Tagger and trainer
# ---------------------------------------------------------------------------
try:
    tagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )
except Exception as e:
    _abort('building the sequence tagger', e)

try:
    trainer = ModelTrainer(tagger, corpus)
except Exception as e:
    _abort('creating the model trainer', e)

# ---------------------------------------------------------------------------
# Fine-tuning
# ---------------------------------------------------------------------------
try:
    result = trainer.fine_tune(
        './taggers',
        learning_rate=learning_rate,
        mini_batch_size=mini_batch_size,
        max_epochs=max_epochs,
    )
except Exception as e:
    _abort('fine-tuning', e)

# ---------------------------------------------------------------------------
# Persist the hyperparameters next to the trained artefacts
# ---------------------------------------------------------------------------
hyperparameters = {
    "learning_rate": learning_rate,
    "mini_batch_size": mini_batch_size,
    "max_epochs": max_epochs,
}

try:
    # The directory is not created anywhere else in this script; without this
    # call a missing folder would end the run right after a full training pass.
    os.makedirs('./trained', exist_ok=True)
    with open('./trained/training.json', 'w', encoding='utf-8') as f:
        json.dump(hyperparameters, f, indent=4)
except Exception as e:
    _abort('saving the hyperparameters', e)

# ---------------------------------------------------------------------------
# Training-log diagram (best effort: a failure here must not stop the run)
# ---------------------------------------------------------------------------
try:
    from train_log_plotter import plot_diagram
    plot_diagram()
except Exception as e:
    print('log diagram failed due to error!')
    print(_error_text(e))

train_result = f'''************************************************\n
##### TRAIN RESULT #####
F1 Score: {result}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n'''

print('#'*70)
print( ' ********** fine-tune operation finished ********** ')
operation_time = datetime.datetime.now()
print(f' ********** {operation_time} ********** ')
print('#'*70)

# ---------------------------------------------------------------------------
# Evaluate the trained model (best effort: the failure text goes into the report)
# ---------------------------------------------------------------------------
try:
    from evaluate_model import do_evaluate
    print(' Try to evaluating the trained model! ')
    evaluate_result = do_evaluate()
    print(' Evaluating finished! ')
except Exception as e:
    print('do_evaluate function failed')
    evaluate_result = f"do_evaluate function failed!\nerror massage:\n{_error_text(e)}"

final_result = f"""Model Name: {model}
Fine-Tune Parameters: {hyperparameters}
{train_result}
{evaluate_result}\n
Fine_Tune time: {operation_time}
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------\n
"""
save_to_file_by_address('./data/train_log.txt', final_result)
print(' Saving results finished! ')

# ---------------------------------------------------------------------------
# Test the model on a specified value
# ---------------------------------------------------------------------------
print(' Try to test trained model! ')
# Imported late, as in the original, so that nothing from the inference module
# is loaded before training has finished.
from inference import inference_main
# NOTE(review): this passes the base model identifier, not a path to the
# fine-tuned tagger - confirm that this is what inference_main expects.
inference_main(model, '')
print(' Testing model finished! ')