diff --git a/Flair_NER/train.py b/Flair_NER/train.py
deleted file mode 100644
index 721437f..0000000
--- a/Flair_NER/train.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# -*- coding: utf-8 -*-
-" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"
-
-learning_rate = 0.65e-4  # 0.65e-4 - 0.4e-4
-mini_batch_size = 8
-max_epochs = 200
-
-from funcs import save_to_file_by_address
-import json
-import os
-from pathlib import Path
-from flair.data import Corpus
-from flair.datasets import ColumnCorpus
-from flair.embeddings import TransformerWordEmbeddings
-from flair.models import SequenceTagger
-from flair.trainers import ModelTrainer
-from flair.models import SequenceTagger
-from flair.embeddings import TransformerDocumentEmbeddings
-
-# from funcs import remove_signs
-# from inference import main
-#model = os.getcwd() + "\\data\\final-model.pt"
-#model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner"  # the first model tested; reached about 70% in the last training run
-#model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
-# model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
-model = "PooryaPiroozfar/Flair-Persian-NER"  # 111111111111111
-#model = "Helsinki-NLP/opus-mt-tc-big-fa-itc"  # throws an error; not trained for NER
-#model = "zedfum/arman-longformer-8k-finetuned-ensani"  # throws an error; not trained for NER
-#model = "AliGhiasvand86/gisha_qa"  # throws an error; not trained for NER
-
-## ---------------------------------------------------------
-## --- the model most recently used in the Qanun-Yar system was trained from this
-#model = "orgcatorg/xlm-v-base-ner"  # best Persian tokenizer ***********************
-## ---------------------------------------------------------
-
-
-# from transformers import AutoModel
-# print('1sdfsdf')
-# model = AutoModel.from_pretrained("/home/gpu/HFHOME/hub/models--orgcatorg--xlm-v-base-ner")
-
-#model = "pourmand1376/NER_Farsi"
-#model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # **** performed well
-#model = "SeyedAli/Persian-Text-NER-Bert-V1"  # ***** performed very well
-#model = "HooshvareLab/bert-base-parsbert-peymaner-uncased"  # not impressive!
-#model = "HooshvareLab/bert-base-parsbert-armanner-uncased"  # not impressive!
-#model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # not impressive!
-print(model)
-print('#'*50)
-print('#'*50)
-
-#!pip install 'flair==0.10'
-
-# define columns
-columns = {0: 'text', 1: 'ner'}
-# directory where the data resides
-data_folder = './data/'
-# initializing the corpus
-#corpus = ColumnCorpus(data_folder, columns, train_file='peyma_train.txt', sequence_length=512)
-# set the dataset name here
-corpus = ColumnCorpus(data_folder, columns,
-                      #train_file = 'peyma_train.txt')
-                      train_file = 'DATASET.txt')  # qavanin 36K tokens
-                      #train_file = 'law_dataset.txt',
-                      #test_file = 'test_gold.txt',
-                      #dev_file = 'dev split 2.txt'
-                      #max_sentence_length=500
-                      #)
-
-# tag to predict
-tag_type = 'ner'
-# make tag dictionary from the corpus
-tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
-
-#xlm-roberta-large
-# embeddings = TransformerWordEmbeddings(model='HooshvareLab/distilbert-fa-zwnj-base-ner',
-embeddings = TransformerWordEmbeddings(model=model,
-                                       layers="-1",
-                                       subtoken_pooling="first",
-                                       # pooling='mean',
-                                       fine_tune=True,
-                                       use_context=True,
-                                       from_tf=True,
-                                       allow_long_sentences=True
-                                       # model_max_length=512,
-                                       )
-
-print('model read successfully!')
-print('#'*50)
-print('#'*50)
-try:
-    tagger = SequenceTagger(hidden_size=256,
-                            embeddings=embeddings,
-                            tag_dictionary=tag_dictionary,
-                            tag_type='ner',
-                            use_crf=False,
-                            use_rnn=False,
-                            reproject_embeddings=False
-                            )
-except Exception as e:
-    print(str(e.args[0]))
-    exit()
-
-
-from flair.trainers import ModelTrainer
-try:
-    trainer = ModelTrainer(tagger, corpus)
-    #resources/taggers/sota-ner-flert
-    # trainer.fine_tune('./taggers',
-    #                   learning_rate=2.0e-6,
-    #                   mini_batch_size=16,
-    #                   # mini_batch_chunk_size=1,  # remove this parameter to speed up computation if you have a big GPU
-    #                   max_epochs=20
-    #                   )
-except Exception as e:
-    print(str(e.args[0]))
-    exit()
-
-try:
-    result = trainer.fine_tune('./taggers',
-                               learning_rate=learning_rate,
-                               mini_batch_size=mini_batch_size,
-                               max_epochs=max_epochs
-                               )
-except Exception as e:
-    print(str(e.args[0]))
-    exit()
-try:
-    # Save the model's state dictionary (configuration + weights)
-    #model_state_dict_path = Path('./trained/best-model.pt')  # assuming the best model is saved here
-    #tagger.save(model_state_dict_path)
-
-    # Optionally, save additional hyperparameters to a separate file (e.g., training.json)
-    hyperparameters = {
-        "learning_rate": learning_rate,
-        "mini_batch_size": mini_batch_size,
-        "max_epochs": max_epochs,
-    }
-
-    with open('./trained/training.json', 'w') as f:
-        json.dump(hyperparameters, f, indent=4)
-except Exception as e:
-    exit()
-
-try:
-    from train_log_plotter import plot_diagram
-    plot_diagram()
-except:
-    print('log diagram failed due to error!')
-
-train_result = f'''************************************************\n
-##### TRAIN RESULT #####
-F1 Score: {result}
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n'''
-
-# run inference to evaluate the model
-# time = datetime.datetime.now()
-# tagger.save('./trained/trained-model ' + str(time) + '.pt')
-print('#'*70)
-print(' ********** fine-tune operation finished ********** ')
-import datetime
-operation_time = datetime.datetime.now()
-print(f' ********** {operation_time} ********** ')
-print('#'*70)
-
-# ###################################################
-# evaluate the trained model
-try:
-    from evaluate_model import do_evaluate
-    print(' Trying to evaluate the trained model! ')
-    evaluate_result = do_evaluate()
-    print(' Evaluating finished! ')
-except Exception as e:
-    print('do_evaluate function failed')
-    evaluate_result = f"do_evaluate function failed!\nerror message:\n{str(e.args[0])}"
-
-final_result = f"""Model Name: {model}
-Fine-Tune Parameters: {hyperparameters}
-{train_result}
-{evaluate_result}\n
-Fine_Tune time: {operation_time}
-------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------\n
-"""
-save_to_file_by_address('./data/train_log.txt', final_result)
-print(' Saving results finished! ')
-# ###################################################
-# test the model on a specified input
-print(' Trying to test the trained model! ')
-from inference import inference_main
-inference_main(model, '')
-print(' Testing model finished! ')
-
-
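For reference, the deleted script wrote its fine-tuned model under `./taggers`. A minimal usage sketch for loading such a checkpoint and tagging a sentence; the path is an assumption based on Flair's convention of writing `final-model.pt` into the base path passed to `fine_tune()`, not a file present in this repo:

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# Assumed path: Flair's trainer saves final-model.pt in the base_path
# given to fine_tune(), './taggers' in the deleted script.
tagger = SequenceTagger.load('./taggers/final-model.pt')

# Tag a sentence and print the recognized entity spans.
sentence = Sentence('example input text')
tagger.predict(sentence)
for span in sentence.get_spans('ner'):
    print(span.text, span.get_label('ner').value)
```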
diff --git a/temp.py b/temp.py
new file mode 100644
index 0000000..a39b4e9
--- /dev/null
+++ b/temp.py
@@ -0,0 +1,67 @@
+from flair.data import Corpus
+from flair.datasets import ColumnCorpus
+from flair.embeddings import FlairEmbeddings, WordEmbeddings, StackedEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+from flair.data import Dictionary
+from flair.models import SequenceTagger
+
+# Path to your new dataset
+data_folder = './data'  # folder containing your dataset
+train_file = 'DATASET140402_no_aref.txt'  # qavanin 36K tokens
+test_file = 'test_ds_new.txt'  # test 110 sections - 6.7K
+
+
+# Column format for your dataset (adjust as necessary)
+# For example: 0 = text, 1 = NER tags
+columns = {0: 'text', 1: 'ner'}
+
+# Load the corpus
+corpus = ColumnCorpus(
+    data_folder=data_folder,
+    column_format=columns,
+    train_file=train_file,
+    test_file=test_file
+)
+
+print(corpus)
+
+# Define the tag dictionary (new NER tags)
+tag_type = 'ner'
+tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
+
+# Load the existing model (mdl.pt)
+tagger = SequenceTagger.load("/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt")
+
+# Define embeddings (you can modify this as needed)
+embedding_types = [
+    WordEmbeddings('glove'),          # pre-trained GloVe embeddings
+    FlairEmbeddings('news-forward'),  # forward Flair embeddings
+    FlairEmbeddings('news-backward')  # backward Flair embeddings
+]
+
+embeddings = StackedEmbeddings(embeddings=embedding_types)
+
+# Create a new tagger using the updated tag dictionary
+new_tagger = SequenceTagger(
+    hidden_size=256,  # size of the hidden layer, adjust as needed
+    embeddings=embeddings,
+    tag_dictionary=tag_dictionary,
+    tag_type=tag_type,
+    use_crf=True,
+)
+
+# Transfer the weights from the old model to the new tagger
+new_tagger.load_state_dict(tagger.state_dict(), strict=False)
+
+# Train the model with the new dataset
+trainer = ModelTrainer(new_tagger, corpus)
+
+# Start training
+trainer.train('./output',  # output folder for the model
+              learning_rate=0.1,
+              mini_batch_size=32,
+              max_epochs=10)  # adjust parameters as needed
+
+# Save the fine-tuned model
+new_tagger.save('./trained/fine_tuned_mdl.pt')
\ No newline at end of file
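The pivotal step in `temp.py` is `load_state_dict(..., strict=False)`: weights are copied only where the old and new state dicts share a parameter name, and names present in just one model are skipped instead of raising an error (PyTorch does still raise if a shared name has a different tensor shape, e.g. an output layer re-sized for a new tag dictionary). A quick check of what actually transferred, assuming the `tagger` and `new_tagger` objects from the script above:

```python
# load_state_dict returns IncompatibleKeys(missing_keys, unexpected_keys).
# missing_keys keep their fresh initialization in new_tagger;
# unexpected_keys exist only in the old checkpoint and are ignored.
result = new_tagger.load_state_dict(tagger.state_dict(), strict=False)
print('left freshly initialized:', result.missing_keys)
print('ignored from old checkpoint:', result.unexpected_keys)
```

If the old checkpoint used transformer embeddings (as in the deleted `Flair_NER/train.py`) while `new_tagger` stacks GloVe and Flair embeddings, most embedding weights will land in these two lists rather than transfer.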
diff --git a/train.py b/train.py
index 721437f..7c5dda9 100644
--- a/train.py
+++ b/train.py
@@ -1,9 +1,6 @@
-# -*- coding: utf-8 -*-
-" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"
-
 learning_rate = 0.65e-4  # 0.65e-4 - 0.4e-4
 mini_batch_size = 8
-max_epochs = 200
+max_epochs = 100
 
 from funcs import save_to_file_by_address
 import json
@@ -23,7 +20,9 @@ from flair.embeddings import TransformerDocumentEmbeddings
 #model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner"  # the first model tested; reached about 70% in the last training run
 #model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
 # model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
-model = "PooryaPiroozfar/Flair-Persian-NER"  # 111111111111111
+# model = "PooryaPiroozfar/Flair-Persian-NER"  # 111111111111111
+model_ad = "/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt"
+model = SequenceTagger.load(model_ad)
 #model = "Helsinki-NLP/opus-mt-tc-big-fa-itc"  # throws an error; not trained for NER
 #model = "zedfum/arman-longformer-8k-finetuned-ensani"  # throws an error; not trained for NER
 #model = "AliGhiasvand86/gisha_qa"  # throws an error; not trained for NER
@@ -58,9 +57,8 @@ data_folder = './data/'
 # set the dataset name here
 corpus = ColumnCorpus(data_folder, columns,
                       #train_file = 'peyma_train.txt')
-                      train_file = 'DATASET.txt')  # qavanin 36K tokens
-                      #train_file = 'law_dataset.txt',
-                      #test_file = 'test_gold.txt',
+                      train_file = 'DATASET140402_no_aref.txt',  # qavanin 36K tokens
+                      test_file = 'test_ds_new.txt',)  # test 110 sections - 6.7K
                       #dev_file = 'dev split 2.txt'
                       #max_sentence_length=500
                       #)
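One caveat with this change: `model` is now a loaded `SequenceTagger` object, while the unchanged part of `train.py` (outside these hunks) passes `model` into `TransformerWordEmbeddings(model=model, ...)`, whose `model` argument expects a Hugging Face model id or a local path string. If the intent is to keep fine-tuning the existing tagger on the new corpus with an unchanged tag set, a more direct route is to train the loaded tagger itself; a sketch reusing the names defined in this diff:

```python
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# model_ad, corpus, learning_rate, mini_batch_size, max_epochs
# are assumed to be the values defined in train.py.
tagger = SequenceTagger.load(model_ad)
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune('./taggers',
                  learning_rate=learning_rate,
                  mini_batch_size=mini_batch_size,
                  max_epochs=max_epochs)
```

This skips the `TransformerWordEmbeddings` / `SequenceTagger` re-construction entirely, since the loaded checkpoint already carries its embeddings and tag dictionary.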