Flair_NER/data_helper.py


import os
import pickle
import re
import string


class DataHelper():

    def __init__(self):
        pass
    def clean_text(self, text_doc, new_line_elimination):
        """Normalize raw text: pad punctuation with spaces, drop characters
        outside the allowed Persian/Latin set, and collapse whitespace."""
        punctuations = r')(}{:؟!،؛»«.' + r"/<>?.,:;"
        punctuations = '[' + punctuations + re.escape(string.punctuation) + ']'
        punctuations = punctuations.replace("@", "")
        text_doc = text_doc.strip()  # str.strip() returns a new string, so it must be reassigned

        # pattern = r'\s*@[a-zA-Z0-9]*\s*'
        # newstring = re.sub(pattern, self.eliminate_pattern, text_doc)

        # Protect numbers: swap each one for a placeholder so the punctuation
        # handling below cannot split them apart.
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        # Surround every run of punctuation marks with single spaces.
        pattern = r'\s*' + punctuations + r'+\s*'
        newstring = re.sub(pattern, self.add_space, newstring)

        # pattern = r'([a-zA-Z0-9]+)(\s*)(' + punctuations + r')(\s*)([a-zA-Z0-9]+)'
        # newstring = re.sub(pattern, r'\1\3\5', newstring)

        # Collapse newlines into spaces if requested; otherwise leave them as-is.
        pattern = r'[\n]+'
        if new_line_elimination:
            newstring = re.sub(pattern, " ", newstring)

        # Remove every character that is not whitelisted punctuation, a Latin
        # letter or digit, a Persian letter, a digit, or whitespace. The hyphen
        # is escaped so it is a literal character, not a character-class range.
        punctuations = r")(}{:؟!\-،؛»«.@$&%" + r"/<>?.,:;"
        latin_letters_digits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latin_letters_digits + 'آ-ی' + r'\d\s:]'
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        # Collapse runs of spaces into a single space.
        newstring = re.sub(r'[ ]+', ' ', newstring)

        # Restore the original numbers, one placeholder at a time.
        for number in nums_list:
            newstring = re.sub('floatingpointnumber', number, newstring, 1)
        return newstring
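    # Illustrative sketch (not in the original source): for a hypothetical call
    #   DataHelper().clean_text("قیمت: 3.5 دلار!", new_line_elimination=True)
    # the "3.5" survives via the floatingpointnumber placeholder, while ":" and
    # "!" come out padded with single spaces.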
    def add_space(self, mystring):
        mystring = mystring.group()  # the match object passed by re.sub's callback
        mystring = mystring.strip(' ')  # omit the whitespace around the punctuation
        mystring = " " + mystring + " "  # add a space before and after the punctuation
        return mystring

    def replace_newline_with_dot(self, mystring):
        return ' . '

    def eliminate_pattern(self, mystring):
        return ""
    def load_var(self, load_path):
        # Unpickle and return a previously saved variable.
        with open(load_path, 'rb') as file:
            variable = pickle.load(file)
        return variable

    def save_var(self, save_path, variable):
        # Pickle a variable to disk.
        print("saving vars ...")
        with open(save_path, 'wb') as file:
            pickle.dump(variable, file)
        print("variable saved.")
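    # Illustrative round-trip (assumed usage, not in the original file):
    #   dh = DataHelper()
    #   dh.save_var("stems.pkl", {"رفت": "رو"})   # "stems.pkl" is hypothetical
    #   assert dh.load_var("stems.pkl") == {"رفت": "رو"}
    # save_var/load_var are a plain pickle round-trip for any picklable object.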
    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        """Collect noun/verb stems and past<->present verb-stem maps from the
        Persian Dependency Treebank plus two supplementary lexicon files."""
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        verb_p2f_map = {}  # past stem -> future (present) stem
        verb_f2p_map = {}  # future (present) stem -> past stem
        for file_name in os.listdir(path_dir):
            file_path = path_dir + "/" + file_name
            with open(file_path, "r", encoding="utf-8") as infile:
                input_content = infile.readlines()
            for el in input_content:
                el = normalizer.sub_alphabets(el)
                el = el.split("\t")
                # el[3] holds the coarse POS tag; the guard is > 3 (not > 2)
                # so that el[3] can never raise an IndexError.
                if len(el) > 3:
                    if el[3] == 'V':
                        tmp_pos = "V"
                    else:
                        tmp_pos = "N"
                    # el[2] holds the lemma; compound verb stems are joined with '#'.
                    stem_word = el[2].split("#")
                    stem_word = [x.strip('\u200c') for x in stem_word]
                    if tmp_pos == "V" and len(stem_word) == 2:
                        # Two parts: past stem # present stem.
                        if len(stem_word[0]) != 0 and len(stem_word[1]) != 0:
                            verb_p2f_map[stem_word[0]] = stem_word[1]
                            verb_f2p_map[stem_word[1]] = stem_word[0]
                            verb_stem.add(stem_word[0])
                            verb_stem.add(stem_word[1])
                    if tmp_pos == 'V' and len(stem_word) == 3:
                        # Three parts: prefix # past stem # present stem.
                        if len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0:
                            verb_p2f_map[stem_word[1]] = stem_word[2]
                            verb_f2p_map[stem_word[2]] = stem_word[1]
                            verb_stem.add(stem_word[1])
                            verb_stem.add(stem_word[2])
                    for t in stem_word:
                        if len(t) > 1 and tmp_pos == 'N':
                            lexicon_stem.add(t)
        # Supplementary verb-stem file: each line is "past_stem present_stem".
        with open(verb_tense_path, "r", encoding="utf-8") as bon_file:
            bon_file_content = bon_file.readlines()
        for el in bon_file_content:
            el = normalizer.sub_alphabets(el.strip())
            el = [x.strip('\u200c') for x in el.split()]
            verb_p2f_map[el[0]] = el[1]
            verb_f2p_map[el[1]] = el[0]
            verb_stem.add(el[0])
            verb_stem.add(el[1])
        # Irregular (mokasar, "broken plural") nouns: two tab-separated forms per line.
        irregular_noun = {}
        with open(mokasar_noun_path, "r", encoding="utf-8") as infile:
            input_content = infile.readlines()
        for el in input_content:
            el = normalizer.sub_alphabets(el)
            el = el.replace("\t\t", "\t")
            el = [x.strip('\u200c') for x in el.strip().split("\t")]
            irregular_noun[el[0]] = el[1]
            lexicon_stem.add(el[0])
        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
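

# Illustrative usage sketch (assumptions, not part of the original file): the
# normalizer module and the two resource paths below are hypothetical names,
# and build_stem_dictionary additionally needs the treebank directory on disk.
if __name__ == "__main__":
    dh = DataHelper()
    cleaned = dh.clean_text("این متن، تستی است!\nقیمت: 3.5 دلار", new_line_elimination=True)
    print(cleaned)
    # from normalizer import Normalizer  # hypothetical module exposing sub_alphabets()
    # lexicon_stem, verb_stem, verb_tense_map, irregular_noun = \
    #     dh.build_stem_dictionary(Normalizer(),
    #                              "resource/VerbList.txt",    # hypothetical path
    #                              "resource/Mokassar.txt")    # hypothetical path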