try to train old orgcatorg model

2025-07-17 16:44:16 +03:30 · 2025-07-17 16:44:16 +03:30 · cb9b414cd1
commit cb9b414cd1
parent 9c721f19a6
3 changed files with 73 additions and 202 deletions
--- a/Flair_NER/train.py
+++ b/Flair_NER/train.py
@ -1,194 +0,0 @@
-# -*- coding: utf-8 -*-
-" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"
-
-learning_rate   = 0.65e-4 # 0.65e-4 - 0.4e-4
-mini_batch_size = 8
-max_epochs      = 200
-
-from funcs import save_to_file_by_address
-import json
-import os
-from pathlib import Path
-from flair.data import Corpus
-from flair.datasets import ColumnCorpus
-from flair.embeddings import TransformerWordEmbeddings
-from flair.models import SequenceTagger
-from flair.trainers import ModelTrainer
-from flair.models import SequenceTagger
-from flair.embeddings import TransformerDocumentEmbeddings
-
-# from funcs import remove_signs
-# from inference import main
-#model = os.getcwd() + "\\data\\final-model.pt"
-#model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner" # مدل اولیه که تست شد و تا حدود 70 درصد در آخرین آموزش خوب جواب می داد
-#model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
-# model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
-model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
-#model = "Helsinki-NLP/opus-mt-tc-big-fa-itc" # خطا میدهد، برای نر آموزش ندیده
-#model = "zedfum/arman-longformer-8k-finetuned-ensani" # خطا میدهد، برای نر آموزش ندیده
-#model = "AliGhiasvand86/gisha_qa" # خطا میدهد، برای نر آموزش ندیده
-
-## ---------------------------------------------------------
-## --- آخرین کار مورد استفاده در سامانه قانون یار از این آموزش دیده است
-#model = "orgcatorg/xlm-v-base-ner" # بهترین توکنایزر فارسی ***********************
-## ---------------------------------------------------------
-
-
-# from transformers import AutoModel
-# print('1sdfsdf')
-# model = AutoModel.from_pretrained("/home/gpu/HFHOME/hub/models--orgcatorg--xlm-v-base-ner")
-
-#model = "pourmand1376/NER_Farsi" # 
-#model = "HooshvareLab/bert-base-parsbert-ner-uncased"   # ****  خوب جواب داد
-#model = "SeyedAli/Persian-Text-NER-Bert-V1"             # ***** خیلی خوب جواب داد
-#model = "HooshvareLab/bert-base-parsbert-peymaner-uncased" # جالب نبود!
-#model = "HooshvareLab/bert-base-parsbert-armanner-uncased" # جالب نبود!
-#model = "HooshvareLab/bert-base-parsbert-ner-uncased" # جالب نبود!
-print(model)
-print('#'*50)
-print('#'*50)
-
-#!pip install 'flair==0.10'
-
-# define columns
-columns = {0 : 'text', 1 : 'ner'}
-# directory where the data resides
-data_folder = './data/'
-# initializing the corpuscorpus = ColumnCorpus(data_folder, columns, train_file='peyma_train.txt', sequence_length=512)
-#اسم دیتاست اینجا تنظیم شود 
-corpus = ColumnCorpus(data_folder, columns,
-                            #train_file = 'peyma_train.txt')
-                              train_file = 'DATASET.txt') # qavanin 36K tokens
-                              #train_file = 'law_dataset.txt',
-                              #test_file = 'test_gold.txt',
-                              #dev_file = 'dev split 2.txt'
-                              #max_sentence_length=500
-                              #)
-
-# tag to predict
-tag_type = 'ner'
-# make tag dictionary from the corpus
-tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
-
-#xlm-roberta-large
-# embeddings = TransformerWordEmbeddings(model='HooshvareLab/distilbert-fa-zwnj-base-ner',
-embeddings = TransformerWordEmbeddings(model= model,
-                                       layers="-1",
-                                       subtoken_pooling="first",
-                                    #    pooling='mean',
-                                       fine_tune=True,
-                                       use_context=True,
-                                       from_tf=True,
-                                       allow_long_sentences=True
-                                    #    model_max_length=512,
-                                       )
-
-print('model read successfully !')
-print('#'*50)
-print('#'*50)
-try:
-    tagger = SequenceTagger(hidden_size=256,
-                            embeddings=embeddings,
-                            tag_dictionary= tag_dictionary,
-                            tag_type='ner',
-                            use_crf=False,
-                            use_rnn=False,
-                            reproject_embeddings=False
-                            )
-except Exception as e:
-    print(str(e.args[0]))
-    exit()
-    
-
-from flair.trainers import ModelTrainer
-try:
-
-    trainer = ModelTrainer(tagger, corpus)
-    #resources/taggers/sota-ner-flert
-    # trainer.fine_tune('./taggers',
-    #                   learning_rate=2.0e-6,
-    #                   mini_batch_size=16,
-    #                 #   mini_batch_chunk_size=1,  # remove this parameter to speed up computation if you have a big GPU
-    #                   max_epochs=20
-    #                   )
-except Exception as e:
-    print(str(e.args[0]))
-    exit()
-    
-try:
-    result = trainer.fine_tune('./taggers',
-                               learning_rate= learning_rate,
-                               mini_batch_size= mini_batch_size,
-                               max_epochs= max_epochs
-                               )
-except Exception as e:
-    print(str(e.args[0]))
-    exit()
-try:
-    # Save the model's state dictionary (configuration + weights)
-    #model_state_dict_path = Path('./trained/best-model.pt')  # Assuming best model is saved here
-    #tagger.save(model_state_dict_path)
-
-    # Optionally, save additional hyperparameters to a separate file (e.g., training.json)
-    hyperparameters = {
-        "learning_rate": learning_rate,
-        "mini_batch_size": mini_batch_size,
-        "max_epochs": max_epochs,
-    }
-
-    with open('./trained/training.json', 'w') as f:
-        json.dump(hyperparameters, f, indent=4)
-except Exception as e:
-
-    exit()
-
-try:
-    from train_log_plotter import plot_diagram
-    plot_diagram()
-except:
-    print('log diagram failed due to error!')
-
-train_result = f'''************************************************\n
-##### TRAIN RESULT #####
-F1 Score: {result}
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n'''
-
-# اجرای اینفرنس جهت ارزیابی مدل
-# time = datetime.datetime.now()
-# tagger.save('./trained/trained-model ' + str(time) + '.pt')
-print('#'*70)
-print( '           ********** fine-tune operation finished **********           ')
-import datetime
-operation_time = datetime.datetime.now()
-print(f'           **********  {operation_time}  **********           ')
-print('#'*70)
-
-# ###################################################
-# ارزیابی مدل آموزش دیده
-try:
-    from evaluate_model import do_evaluate
-    print(' Try to evaluating the trained model! ')
-    evaluate_result = do_evaluate()
-    print(' Evaluating finished! ')
-except Exception as e:
-    print('do_evaluate function failed')
-    evaluate_result = f"do_evaluate function failed!\nerror massage:\n{str(e.args[0])}"
-    
-final_result = f"""Model Name: {model}
-Fine-Tune Parameters: {hyperparameters}
-{train_result}
-{evaluate_result}\n
-Fine_Tune time: {operation_time}
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------\n
-"""
-save_to_file_by_address('./data/train_log.txt', final_result)
-print(' Saving results finished! ')
-# ###################################################
-# تست مدل بر یک مقدار مشخص شده
-print(' Try to test trained model! ')
-from inference import inference_main
-inference_main(model,'')
-print(' Testing model finished! ')
-
-
--- a/temp.py
+++ b/temp.py
@ -0,0 +1,67 @@
+from flair.data import Corpus
+from flair.datasets import ColumnCorpus
+from flair.embeddings import FlairEmbeddings, WordEmbeddings, StackedEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+from flair.data import Dictionary
+from flair.models import SequenceTagger
+
+# Continue fine-tuning a previously trained tagger on a new dataset.
+data_folder = './data'  # Folder containing the dataset files
+train_file = 'DATASET140402_no_aref.txt' # qavanin 36K tokens
+test_file = 'test_ds_new.txt' # test 110 sections - 6.7K
+
+# Column format of the dataset files: 0 = text, 1 = NER tags
+columns = {0: 'text', 1: 'ner'}
+
+# Load the corpus
+corpus = ColumnCorpus(
+    data_folder=data_folder,
+    column_format=columns,
+    train_file=train_file,
+    test_file=test_file
+)
+print(corpus)
+
+# Build the label dictionary with make_label_dictionary, the replacement for
+# the deprecated make_tag_dictionary and the call train.py already uses.
+tag_type = 'ner'
+tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
+
+# Load the existing model whose weights are the starting point
+tagger = SequenceTagger.load("/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt")
+
+# Create a new tagger for the updated tag dictionary. It reuses the loaded
+# model's own embeddings and architecture flags: a fresh GloVe/Flair stack
+# would share no parameters with the old model, so nothing could transfer.
+new_tagger = SequenceTagger(
+    hidden_size=256,  # Size of the hidden layer, adjust as needed
+    embeddings=tagger.embeddings,
+    tag_dictionary=tag_dictionary,
+    tag_type=tag_type,
+    use_crf=tagger.use_crf,
+    use_rnn=tagger.use_rnn,
+    reproject_embeddings=tagger.reproject_embeddings,
+)
+
+# Transfer only tensors whose name and shape match. strict=False tolerates
+# missing/unexpected keys but still raises on a size mismatch (e.g. the
+# output layer when the tag set changed), so those are filtered out first.
+new_state = new_tagger.state_dict()
+compatible = {name: tensor for name, tensor in tagger.state_dict().items()
+              if name in new_state and new_state[name].shape == tensor.shape}
+new_tagger.load_state_dict(compatible, strict=False)
+
+# Train with fine_tune and train.py's small learning rate; learning_rate=0.1
+# would wipe out the pretrained weights (assuming a transformer-based model,
+# as produced by train.py -- TODO confirm for this checkpoint).
+trainer = ModelTrainer(new_tagger, corpus)
+trainer.fine_tune('./output',  # Output folder for the model
+                  learning_rate=0.65e-4,
+                  mini_batch_size=32,
+                  max_epochs=10)  # Adjust parameters as needed
+
+# Save the fine-tuned model, making sure the target folder exists first
+import os
+os.makedirs('./trained', exist_ok=True)
+new_tagger.save('./trained/fine_tuned_mdl.pt')
--- a/train.py
+++ b/train.py
@ -1,9 +1,6 @@
-# -*- coding: utf-8 -*-
-" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"
-
 learning_rate   = 0.65e-4 # 0.65e-4 - 0.4e-4
 mini_batch_size = 8
-max_epochs      = 200
+max_epochs      = 100

 from funcs import save_to_file_by_address
 import json
@ -23,7 +20,9 @@ from flair.embeddings import TransformerDocumentEmbeddings
 #model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner" # مدل اولیه که تست شد و تا حدود 70 درصد در آخرین آموزش خوب جواب می داد
 #model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
 # model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
-model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
+# model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
+model_ad = "/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt"  # path of the previously trained tagger file
+model = SequenceTagger.load(model_ad)  # NOTE(review): `model` now holds a SequenceTagger object instead of a model-name string -- confirm later uses of `model` (e.g. TransformerWordEmbeddings(model=model), inference_main(model, '')) accept that
 #model = "Helsinki-NLP/opus-mt-tc-big-fa-itc" # خطا میدهد، برای نر آموزش ندیده
 #model = "zedfum/arman-longformer-8k-finetuned-ensani" # خطا میدهد، برای نر آموزش ندیده
 #model = "AliGhiasvand86/gisha_qa" # خطا میدهد، برای نر آموزش ندیده
@ -58,9 +57,8 @@ data_folder = './data/'
 #اسم دیتاست اینجا تنظیم شود 
 corpus = ColumnCorpus(data_folder, columns,
                            #train_file = 'peyma_train.txt')
-                              train_file = 'DATASET.txt') # qavanin 36K tokens
-                              #train_file = 'law_dataset.txt',
-                              #test_file = 'test_gold.txt',
+                              train_file = 'DATASET140402_no_aref.txt', # qavanin 36K tokens
+                              test_file = 'test_ds_new.txt',) # test 110 sections - 6.7K
                              #dev_file = 'dev split 2.txt'
                              #max_sentence_length=500
                              #)