try to train old orgcatorg model
This commit is contained in:
parent
9c721f19a6
commit
cb9b414cd1

(deleted file, 194 lines)
@@ -1,194 +0,0 @@
# -*- coding: utf-8 -*-

" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"

learning_rate = 0.65e-4  # 0.65e-4 - 0.4e-4
mini_batch_size = 8
max_epochs = 200

from funcs import save_to_file_by_address
import json
import os
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings, TransformerDocumentEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# from funcs import remove_signs
# from inference import main
#model = os.getcwd() + "\\data\\final-model.pt"
#model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner"  # first model tested; reached about 70% in its last training run
#model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
# model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
model = "PooryaPiroozfar/Flair-Persian-NER"  # 111111111111111
#model = "Helsinki-NLP/opus-mt-tc-big-fa-itc"  # raises an error; not trained for NER
#model = "zedfum/arman-longformer-8k-finetuned-ensani"  # raises an error; not trained for NER
#model = "AliGhiasvand86/gisha_qa"  # raises an error; not trained for NER

## ---------------------------------------------------------
## --- the most recent model used in the Qanun-Yar system was trained from this one
#model = "orgcatorg/xlm-v-base-ner"  # best Persian tokenizer ***********************
## ---------------------------------------------------------

# from transformers import AutoModel
# print('1sdfsdf')
# model = AutoModel.from_pretrained("/home/gpu/HFHOME/hub/models--orgcatorg--xlm-v-base-ner")

#model = "pourmand1376/NER_Farsi" #
#model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # **** performed well
#model = "SeyedAli/Persian-Text-NER-Bert-V1"  # ***** performed very well
#model = "HooshvareLab/bert-base-parsbert-peymaner-uncased"  # not impressive!
#model = "HooshvareLab/bert-base-parsbert-armanner-uncased"  # not impressive!
#model = "HooshvareLab/bert-base-parsbert-ner-uncased"  # not impressive!

print(model)
print('#'*50)
print('#'*50)

#!pip install 'flair==0.10'

# define columns
columns = {0: 'text', 1: 'ner'}
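# A minimal sketch of the expected two-column CoNLL-style input (hypothetical
# sample, not from the repo): one token per line, token and BIO tag separated
# by whitespace, with a blank line between sentences, e.g.
#   tehran   B-LOC
#   capital  O
#   iran     B-LOC
#   is       O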
# directory where the data resides
data_folder = './data/'

# initializing the corpus
# corpus = ColumnCorpus(data_folder, columns, train_file='peyma_train.txt', sequence_length=512)
# set the dataset file name here
corpus = ColumnCorpus(data_folder, columns,
                      #train_file = 'peyma_train.txt')
                      train_file='DATASET.txt')  # qavanin 36K tokens
                      #train_file = 'law_dataset.txt',
                      #test_file = 'test_gold.txt',
                      #dev_file = 'dev split 2.txt'
                      #max_sentence_length=500
                      #)

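# Note: as far as I recall Flair's behavior, any split not given here (dev/test)
# is sampled automatically out of the training file by the Corpus constructor.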
# tag to predict
tag_type = 'ner'
# make tag dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

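# The label dictionary enumerates every NER tag found in the corpus (plus the
# special symbols Flair adds); its size determines the tagger's output layer.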
#xlm-roberta-large
# embeddings = TransformerWordEmbeddings(model='HooshvareLab/distilbert-fa-zwnj-base-ner',
embeddings = TransformerWordEmbeddings(model=model,
                                       layers="-1",
                                       subtoken_pooling="first",
                                       # pooling='mean',
                                       fine_tune=True,
                                       use_context=True,
                                       from_tf=True,
                                       allow_long_sentences=True
                                       # model_max_length=512,
                                       )

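# use_context=True enables FLERT-style context: neighboring sentences in the
# document are fed to the transformer as extra context around each sentence.
# fine_tune=True keeps the transformer weights trainable rather than frozen.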
print('model read successfully!')
print('#'*50)
print('#'*50)
try:
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
                            tag_dictionary=tag_dictionary,
                            tag_type='ner',
                            use_crf=False,
                            use_rnn=False,
                            reproject_embeddings=False
                            )
except Exception as e:
    print(str(e.args[0]))
    exit()

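# With use_rnn=False, use_crf=False and reproject_embeddings=False, the tagger
# above reduces to a linear classification head on the transformer output, the
# setup commonly paired with fine_tune=True in Flair.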
try:
    trainer = ModelTrainer(tagger, corpus)
    #resources/taggers/sota-ner-flert
    # trainer.fine_tune('./taggers',
    #                   learning_rate=2.0e-6,
    #                   mini_batch_size=16,
    #                   # mini_batch_chunk_size=1, # remove this parameter to speed up computation if you have a big GPU
    #                   max_epochs=20
    #                   )
except Exception as e:
    print(str(e.args[0]))
    exit()

try:
    result = trainer.fine_tune('./taggers',
                               learning_rate=learning_rate,
                               mini_batch_size=mini_batch_size,
                               max_epochs=max_epochs
                               )
except Exception as e:
    print(str(e.args[0]))
    exit()
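# If I read Flair correctly, fine_tune() returns a result dict (final test
# score plus score history) rather than a bare F1 number, so the "F1 Score"
# line further below prints that whole dict.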
try:
    # Save the model's state dictionary (configuration + weights)
    #model_state_dict_path = Path('./trained/best-model.pt') # Assuming best model is saved here
    #tagger.save(model_state_dict_path)

    # Optionally, save additional hyperparameters to a separate file (e.g., training.json)
    hyperparameters = {
        "learning_rate": learning_rate,
        "mini_batch_size": mini_batch_size,
        "max_epochs": max_epochs,
    }

    with open('./trained/training.json', 'w') as f:
        json.dump(hyperparameters, f, indent=4)
except Exception as e:
    print(str(e.args[0]))
    exit()

try:
    from train_log_plotter import plot_diagram
    plot_diagram()
except:
    print('log diagram failed due to error!')

train_result = f'''************************************************\n
##### TRAIN RESULT #####
F1 Score: {result}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n'''

# run inference to evaluate the model
# time = datetime.datetime.now()
# tagger.save('./trained/trained-model ' + str(time) + '.pt')
print('#'*70)
print(' ********** fine-tune operation finished ********** ')
import datetime
operation_time = datetime.datetime.now()
print(f' ********** {operation_time} ********** ')
print('#'*70)

# ###################################################
# evaluate the trained model
try:
    from evaluate_model import do_evaluate
    print(' Trying to evaluate the trained model! ')
    evaluate_result = do_evaluate()
    print(' Evaluating finished! ')
except Exception as e:
    print('do_evaluate function failed')
    evaluate_result = f"do_evaluate function failed!\nerror message:\n{str(e.args[0])}"

final_result = f"""Model Name: {model}
Fine-Tune Parameters: {hyperparameters}
{train_result}
{evaluate_result}\n
Fine-Tune time: {operation_time}
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------\n
"""
save_to_file_by_address('./data/train_log.txt', final_result)
print(' Saving results finished! ')

# ###################################################
# test the model on a specified input
print(' Trying to test the trained model! ')
from inference import inference_main
inference_main(model, '')
print(' Testing model finished! ')

temp.py (new file, 67 lines)
@@ -0,0 +1,67 @@
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Dictionary

# Path to your new dataset
data_folder = './data'  # Folder containing your dataset
train_file = 'DATASET140402_no_aref.txt'  # qavanin 36K tokens
test_file = 'test_ds_new.txt'  # test 110 sections - 6.7K

# Column format for your dataset (adjust as necessary)
# For example: 0 = text, 1 = NER tags
columns = {0: 'text', 1: 'ner'}

# Load the corpus
corpus = ColumnCorpus(
    data_folder=data_folder,
    column_format=columns,
    train_file=train_file,
    test_file=test_file
)

print(corpus)

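# print(corpus) reports the train/dev/test sentence counts; since no dev_file
# is passed, Flair (to my understanding) samples a dev split from the train set.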
# Define the tag dictionary (new NER tags)
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

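# Note: newer Flair releases prefer corpus.make_label_dictionary(label_type='ner');
# make_tag_dictionary() is the older API this script appears to target.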
# Load the existing model (mdl.pt)
tagger = SequenceTagger.load("/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt")

# Define embeddings (you can modify this as needed)
embedding_types = [
    WordEmbeddings('glove'),          # Pre-trained GloVe embeddings
    FlairEmbeddings('news-forward'),  # Forward Flair embeddings
    FlairEmbeddings('news-backward')  # Backward Flair embeddings
]

embeddings = StackedEmbeddings(embeddings=embedding_types)

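# Caveat: 'glove' and 'news-forward/backward' are English embeddings; if the
# loaded Persian model was trained with different embeddings, the dimensions
# will differ and little of the old weights can map onto this stack.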
# Create a new tagger using the updated tag dictionary
new_tagger = SequenceTagger(
    hidden_size=256,  # Size of the hidden layer, adjust as needed
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# Transfer the weights from the old model to the new tagger
new_tagger.load_state_dict(tagger.state_dict(), strict=False)

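# strict=False makes PyTorch copy only parameters whose names and shapes match
# between the two state dicts; everything else (e.g. the output layer resized
# for the new tag dictionary) keeps its fresh initialization.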
# Train the model with the new dataset
trainer = ModelTrainer(new_tagger, corpus)

# Start training
trainer.train('./output',        # Output folder for the model
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)     # Adjust parameters as needed

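# train() defaults to SGD with learning-rate annealing (hence the large 0.1 LR);
# for transformer fine-tuning one would instead use trainer.fine_tune() with a
# much smaller learning rate, as in train.py.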
# Save the fine-tuned model
new_tagger.save('./trained/fine_tuned_mdl.pt')

train.py (14 lines changed)
@@ -1,9 +1,6 @@
-# -*- coding: utf-8 -*-
-
-" Original file is located at https://colab.research.google.com/drive/1Yb_fU_WBIs3a_L5G3_A_nxChrnR4Nzb1"
 learning_rate = 0.65e-4 # 0.65e-4 - 0.4e-4
 mini_batch_size = 8
-max_epochs = 200
+max_epochs = 100
 
 from funcs import save_to_file_by_address
 import json

@@ -23,7 +20,9 @@ from flair.embeddings import TransformerDocumentEmbeddings
 #model = os.getcwd() + "/data/HooshvareLab--distilbert-fa-zwnj-base-ner" # first model tested; reached about 70% in its last training run
 #model = os.getcwd() + "/data/distilbert-base-multilingual-cased-tavasi"
 # model = "HooshvareLab/bert-fa-base-uncased-ner-peyma"
-model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
+# model = "PooryaPiroozfar/Flair-Persian-NER" # 111111111111111
+model_ad = "/home/gpu/tnlp/jokar/Models/catorg/14030906_before_aitools_ds_finetune/final-model.pt"
+model = SequenceTagger.load(model_ad)
 #model = "Helsinki-NLP/opus-mt-tc-big-fa-itc" # raises an error; not trained for NER
 #model = "zedfum/arman-longformer-8k-finetuned-ensani" # raises an error; not trained for NER
 #model = "AliGhiasvand86/gisha_qa" # raises an error; not trained for NER

@@ -58,9 +57,8 @@ data_folder = './data/'
 # set the dataset file name here
 corpus = ColumnCorpus(data_folder, columns,
 #train_file = 'peyma_train.txt')
-train_file = 'DATASET.txt') # qavanin 36K tokens
-#train_file = 'law_dataset.txt',
-#test_file = 'test_gold.txt',
+train_file = 'DATASET140402_no_aref.txt', # qavanin 36K tokens
+test_file = 'test_ds_new.txt',) # test 110 sections - 6.7K
 #dev_file = 'dev split 2.txt'
 #max_sentence_length=500
 #)