"""Train a Flair SequenceTagger for Persian NER on a two-column (text, ner) dataset.

Fixes over the original paste:
- ``TransformerWordEmbeddings`` replaces ``TransformerDocumentEmbeddings``:
  a SequenceTagger needs one embedding per *token*, not one per sentence.
- The label dictionary is built once with ``make_label_dictionary`` and
  reused (the deprecated ``make_tag_dictionary`` call is gone).
- Dead commented-out code and the unused ``dataset`` variable are removed.
"""

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Hugging Face model id used as the embedding backbone.
MODEL_NAME = 'zedfum/arman-longformer-8k-finetuned-ensani'

# CoNLL-style column layout of the training file: token <space> ner-tag.
COLUMNS = {0: 'text', 1: 'ner'}

# Directory containing the training file.
DATA_FOLDER = './data/'

SEPARATOR = '#' * 50


def main() -> None:
    """Build the corpus, the tagger, and run training."""
    # Load the column-formatted corpus (only a train file is provided;
    # Flair will carve out dev/test splits automatically).
    corpus: Corpus = ColumnCorpus(DATA_FOLDER, COLUMNS, train_file='DATASET_3.txt')

    # Token-level transformer embeddings (last layer, mean over sub-tokens).
    # NOTE(review): the original used document-level embeddings, which cannot
    # drive a per-token tagger.
    embeddings = TransformerWordEmbeddings(
        MODEL_NAME,
        layers='-1',
        subtoken_pooling='mean',
    )

    tag_type = 'ner'

    # Build the label dictionary once and reuse it below.
    tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
    print(tag_dictionary)
    print(SEPARATOR)

    # BiLSTM-CRF tagger on top of the transformer embeddings.
    tagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )
    print('SequenceTagger')
    print(SEPARATOR)

    trainer = ModelTrainer(tagger, corpus)
    print('ModelTrainer')
    print(SEPARATOR)

    # Train; checkpoints and logs go to the output directory.
    trainer.train(
        './jokar/Flair_NER/trained',
        learning_rate=0.65e-4,
        mini_batch_size=8,
        max_epochs=10,
    )
    print('trained!')


if __name__ == '__main__':
    main()