try to train old orgcatorg model
This commit is contained in:
parent
9c721f19a6
commit
cb9b414cd1

(deleted file, 194 lines)
@@ -1,194 +0,0 @@
# -*- coding: utf-8 -*-

" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"

learning_rate = 0.65e-4  # 0.65e-4 - 0.4e-4
mini_batch_size = 8
max_epochs = 200

from funcs import save_to_file_by_address
import json
import os
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings, TransformerDocumentEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# from funcs import remove_signs
# from inference import main
#model = os.getcwd() + "\\data\\final-model.pt"
#model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner"  # first model tested; reached about 70% in its last training run
#model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
# model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
model = "PooryaPiroozfar/Flair-Persian-NER"  # 111111111111111
#model = "Helsinki-NLP/opus-mt-tc-big-fa-itc"  # raises an error; not trained for NER
#model = "zedfum/arman-longformer-8k-finetuned-ensani"  # raises an error; not trained for NER
#model = "AliGhiasvand86/gisha_qa"  # raises an error; not trained for NER

## ---------------------------------------------------------
## --- the most recent model used in the Qanun-Yar system was trained from this one
#model = "orgcatorg/xlm-v-base-ner"  # best Persian tokenizer ***********************
## ---------------------------------------------------------

# from transformers import AutoModel
# print('1sdfsdf')
# model = AutoModel.from_pretrained("/home/gpu/HFHOME/hub/models--orgcatorg--xlm-v-base-ner")

#model = "pourmand1376/NER_Farsi" #
#model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # **** performed well
#model = "SeyedAli/Persian-Text-NER-Bert-V1"  # ***** performed very well
#model = "HooshvareLab/bert-base-parsbert-peymaner-uncased"  # not impressive!
#model = "HooshvareLab/bert-base-parsbert-armanner-uncased"  # not impressive!
#model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # not impressive!

print(model)
print('#'*50)
print('#'*50)

#!pip install 'flair==0.10'

# define columns
columns = {0: 'text', 1: 'ner'}
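# A minimal sketch of the expected two-column CoNLL-style input (hypothetical
# sample, not from the repo): one token per line, token and BIO tag separated
# by whitespace, with a blank line between sentences, e.g.
#   tehran   B-LOC
#   capital  O
#   iran     B-LOC
#   is       O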
# directory where the data resides
data_folder = './data/'

# initializing the corpus
# corpus = ColumnCorpus(data_folder, columns, train_file='peyma_train.txt', sequence_length=512)
# set the dataset file name here
corpus = ColumnCorpus(data_folder, columns,
                      #train_file = 'peyma_train.txt')
                      train_file='DATASET.txt')  # qavanin 36K tokens
                      #train_file = 'law_dataset.txt',
                      #test_file = 'test_gold.txt',
                      #dev_file = 'dev split 2.txt'
                      #max_sentence_length=500
                      #)

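# Note: as far as I recall Flair's behavior, any split not given here (dev/test)
# is sampled automatically out of the training file by the Corpus constructor.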
# tag to predict
tag_type = 'ner'
# make tag dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

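# The label dictionary enumerates every NER tag found in the corpus (plus the
# special symbols Flair adds); its size determines the tagger's output layer.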
#xlm-roberta-large
# embeddings = TransformerWordEmbeddings(model='HooshvareLab/distilbert-fa-zwnj-base-ner',
embeddings = TransformerWordEmbeddings(model=model,
                                       layers="-1",
                                       subtoken_pooling="first",
                                       # pooling='mean',
                                       fine_tune=True,
                                       use_context=True,
                                       from_tf=True,
                                       allow_long_sentences=True
                                       # model_max_length=512,
                                       )

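# use_context=True enables FLERT-style context: neighboring sentences in the
# document are fed to the transformer as extra context around each sentence.
# fine_tune=True keeps the transformer weights trainable rather than frozen.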
print('model read successfully!')
print('#'*50)
print('#'*50)
try:
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False,
                            use_rnn=False,
                            reproject_embeddings=False
                            )
except Exception as e:
    print(str(e.args[0]))
    exit()

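# With use_rnn=False, use_crf=False and reproject_embeddings=False, the tagger
# above reduces to a linear classification head on the transformer output, the
# setup commonly paired with fine_tune=True in Flair.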
try:
    trainer = ModelTrainer(tagger, corpus)
    #resources/taggers/sota-ner-flert
    # trainer.fine_tune('./taggers',
    #                   learning_rate=2.0e-6,
    #                   mini_batch_size=16,
    #                   # mini_batch_chunk_size=1, # remove this parameter to speed up computation if you have a big GPU
    #                   max_epochs=20
    #                   )
except Exception as e:
    print(str(e.args[0]))
    exit()

try:
    result = trainer.fine_tune('./taggers',
                               learning_rate=learning_rate,
                               mini_batch_size=mini_batch_size,
                               max_epochs=max_epochs
                               )
except Exception as e:
    print(str(e.args[0]))
    exit()
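# If I read Flair correctly, fine_tune() returns a result dict (final test
# score plus score history) rather than a bare F1 number, so the "F1 Score"
# line further below prints that whole dict.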
try:
    # Save the model's state dictionary (configuration + weights)
    #model_state_dict_path = Path('./trained/best-model.pt') # Assuming best model is saved here
    #tagger.save(model_state_dict_path)

    # Optionally, save additional hyperparameters to a separate file (e.g., training.json)
    hyperparameters = {
        "learning_rate": learning_rate,
        "mini_batch_size": mini_batch_size,
        "max_epochs": max_epochs,
    }

    with open('./trained/training.json', 'w') as f:
        json.dump(hyperparameters, f, indent=4)
except Exception as e:
    print(str(e.args[0]))
    exit()

try:
    from train_log_plotter import plot_diagram
    plot_diagram()
except:
    print('log diagram failed due to error!')

train_result = f'''************************************************\n
##### TRAIN RESULT #####
F1 Score: {result}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n'''

# run inference to evaluate the model
# time = datetime.datetime.now()
# tagger.save('./trained/trained-model ' + str(time) + '.pt')
print('#'*70)
print(' ********** fine-tune operation finished ********** ')
import datetime
operation_time = datetime.datetime.now()
print(f' ********** {operation_time} ********** ')
print('#'*70)

# ###################################################
# evaluate the trained model
try:
    from evaluate_model import do_evaluate
    print(' Trying to evaluate the trained model! ')
    evaluate_result = do_evaluate()
    print(' Evaluating finished! ')
except Exception as e:
    print('do_evaluate function failed')
    evaluate_result = f"do_evaluate function failed!\nerror message:\n{str(e.args[0])}"

final_result = f"""Model Name: {model}
Fine-Tune Parameters: {hyperparameters}
{train_result}
{evaluate_result}\n
Fine-Tune time: {operation_time}
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------\n
"""
save_to_file_by_address('./data/train_log.txt', final_result)
print(' Saving results finished! ')

# ###################################################
# test the model on a specified input
print(' Trying to test the trained model! ')
from inference import inference_main
inference_main(model, '')
print(' Testing model finished! ')

temp.py (new file, 67 lines)
@@ -0,0 +1,67 @@
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Dictionary

# Path to your new dataset
data_folder = './data'  # Folder containing your dataset
train_file = 'DATASET140402_no_aref.txt'  # qavanin 36K tokens
test_file = 'test_ds_new.txt'  # test 110 sections - 6.7K

# Column format for your dataset (adjust as necessary)
# For example: 0 = text, 1 = NER tags
columns = {0: 'text', 1: 'ner'}

# Load the corpus
corpus = ColumnCorpus(
    data_folder=data_folder,
    column_format=columns,
    train_file=train_file,
    test_file=test_file
)

print(corpus)

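# print(corpus) reports the train/dev/test sentence counts; since no dev_file
# is passed, Flair (to my understanding) samples a dev split from the train set.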
# Define the tag dictionary (new NER tags)
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

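# Note: newer Flair releases prefer corpus.make_label_dictionary(label_type='ner');
# make_tag_dictionary() is the older API this script appears to target.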
# Load the existing model (mdl.pt)
tagger = SequenceTagger.load("/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt")

# Define embeddings (you can modify this as needed)
embedding_types = [
    WordEmbeddings('glove'),          # Pre-trained GloVe embeddings
    FlairEmbeddings('news-forward'),  # Forward Flair embeddings
    FlairEmbeddings('news-backward')  # Backward Flair embeddings
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

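# Caveat: 'glove' and 'news-forward/backward' are English embeddings; if the
# loaded Persian model was trained with different embeddings, the dimensions
# will differ and little of the old weights can map onto this stack.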
# Create a new tagger using the updated tag dictionary
new_tagger = SequenceTagger(
    hidden_size=256,  # Size of the hidden layer, adjust as needed
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# Transfer the weights from the old model to the new tagger
new_tagger.load_state_dict(tagger.state_dict(), strict=False)

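# strict=False makes PyTorch copy only parameters whose names and shapes match
# between the two state dicts; everything else (e.g. the output layer resized
# for the new tag dictionary) keeps its fresh initialization.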
# Train the model with the new dataset
trainer = ModelTrainer(new_tagger, corpus)

# Start training
trainer.train('./output',        # Output folder for the model
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)     # Adjust parameters as needed

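# train() defaults to SGD with learning-rate annealing (hence the large 0.1 LR);
# for transformer fine-tuning one would instead use trainer.fine_tune() with a
# much smaller learning rate, as in train.py.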
# Save the fine-tuned model
new_tagger.save('./trained/fine_tuned_mdl.pt')

train.py (14 lines changed)
@@ -1,9 +1,6 @@
-# -*- coding: utf-8 -*-
-
-" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"
 learning_rate = 0.65e-4 # 0.65e-4 - 0.4e-4
 mini_batch_size = 8
-max_epochs = 200
+max_epochs = 100
 
 from funcs import save_to_file_by_address
 import json

@@ -23,7 +20,9 @@ from flair.embeddings import TransformerDocumentEmbeddings
 #model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner" # first model tested; reached about 70% in its last training run
 #model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
 # model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
-model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
+# model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
+model_ad = "/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt"
+model = SequenceTagger.load(model_ad)
 #model = "Helsinki-NLP/opus-mt-tc-big-fa-itc" # raises an error; not trained for NER
 #model = "zedfum/arman-longformer-8k-finetuned-ensani" # raises an error; not trained for NER
 #model = "AliGhiasvand86/gisha_qa" # raises an error; not trained for NER

@@ -58,9 +57,8 @@ data_folder = './data/'
 # set the dataset file name here
 corpus = ColumnCorpus(data_folder, columns,
 #train_file = 'peyma_train.txt')
-train_file = 'DATASET.txt') # qavanin 36K tokens
-#train_file = 'law_dataset.txt',
-#test_file = 'test_gold.txt',
+train_file = 'DATASET140402_no_aref.txt', # qavanin 36K tokens
+test_file = 'test_ds_new.txt',) # test 110 sections - 6.7K
 #dev_file = 'dev split 2.txt'
 #max_sentence_length=500
 #)