import os
import pickle
import re


class DataHelper:
    """Utility helpers for cleaning Persian text and building stem lexicons."""

    def clean_text(self, text_doc, new_line_elimination):
        text_doc = text_doc.strip()

        # Protect numbers: swap each one for a placeholder so the
        # character filtering below cannot mangle them.
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        # Optionally collapse runs of newlines to a single space.
        if new_line_elimination:
            newstring = re.sub(r'\n+', " ", newstring)

        # Whitelist filter: keep Persian letters (آ-ی), the zero-width
        # non-joiner, Latin letters and digits, whitespace, and the
        # punctuation listed below; drop everything else.
        punctuations = r")(}{:؟!-،؛»«.@$&%" + r"/<>?.,:;"
        latin_letters_digits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latin_letters_digits + r'آ-ی\u200c\d\s:]'
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        # Collapse runs of spaces.
        newstring = re.sub(r' +', ' ', newstring)

        # Restore the original numbers, one placeholder at a time.
        for number in nums_list:
            newstring = re.sub('floatingpointnumber', number, newstring, count=1)

        return newstring

    def add_space(self, mystring):
        """re.sub callback: pad the matched punctuation with single spaces."""
        mystring = mystring.group()     # the string matched by the regex
        mystring = mystring.strip(' ')  # drop the whitespace around the punctuation
        return " " + mystring + " "     # add one space before and after it

    def replace_newline_with_dot(self, mystring):
        """re.sub callback: turn a newline match into a dot token."""
        return ' . '

    def eliminate_pattern(self, mystring):
        """re.sub callback: delete the match."""
        return ""

    def load_var(self, load_path):
        """Load a pickled variable from disk."""
        with open(load_path, 'rb') as infile:
            return pickle.load(infile)

    def save_var(self, save_path, variable):
        """Pickle a variable to disk."""
        print("saving vars ...")
        with open(save_path, 'wb') as outfile:
            pickle.dump(variable, outfile)
        print("variable saved.")

    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        """Build stem lexicons from the Persian Dependency Treebank plus a
        verb-tense list and a list of mokasar (broken-plural) nouns.

        Returns (lexicon_stem, verb_stem, verb_tense_map, irregular_noun),
        where verb_tense_map is [past-to-present map, present-to-past map].
        """
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        verb_p2f_map = {}  # past stem -> present/future stem
        verb_f2p_map = {}  # present/future stem -> past stem

        for file_name in os.listdir(path_dir):
            file_path = os.path.join(path_dir, file_name)
            with open(file_path, "r", encoding="utf-8") as treebank_file:
                input_content = treebank_file.readlines()
            for el in input_content:
                el = normalizer.sub_alphabets(el)
                el = el.split("\t")
                if len(el) > 3:  # need at least the POS column (index 3)
                    tmp_pos = "V" if el[3] == 'V' else "N"
                    # Lemmas are stored as '#'-separated stems.
                    stem_word = el[2].split("#")
                    stem_word = [x.strip('\u200c') for x in stem_word]
                    if tmp_pos == "V" and len(stem_word) == 2:
                        # Two parts: past stem and present stem.
                        if stem_word[0] and stem_word[1]:
                            verb_p2f_map[stem_word[0]] = stem_word[1]
                            verb_f2p_map[stem_word[1]] = stem_word[0]
                            verb_stem.add(stem_word[0])
                            verb_stem.add(stem_word[1])
                    if tmp_pos == 'V' and len(stem_word) == 3:
                        # Three parts: verb prefix, past stem, present stem.
                        if stem_word[0] and stem_word[1] and stem_word[2]:
                            verb_p2f_map[stem_word[1]] = stem_word[2]
                            verb_f2p_map[stem_word[2]] = stem_word[1]
                            verb_stem.add(stem_word[1])
                            verb_stem.add(stem_word[2])
                    for t in stem_word:
                        if len(t) > 1 and tmp_pos == 'N':
                            lexicon_stem.add(t)

        # Extra verb stems: each line holds a past stem and a present stem.
        with open(verb_tense_path, "r", encoding="utf-8") as bon_file:
            bon_file_content = bon_file.readlines()
        for el in bon_file_content:
            el = normalizer.sub_alphabets(el.strip()).split()
            el = [x.strip('\u200c') for x in el]
            verb_p2f_map[el[0]] = el[1]
            verb_f2p_map[el[1]] = el[0]
            verb_stem.add(el[0])
            verb_stem.add(el[1])

        # Irregular (broken-plural) nouns: two tab-separated fields per line.
        irregular_noun = {}
        with open(mokasar_noun_path, "r", encoding="utf-8") as noun_file:
            input_content = noun_file.readlines()
        for el in input_content:
            el = normalizer.sub_alphabets(el)
            el = el.replace("\t\t", "\t")
            el = el.strip().split("\t")
            el = [x.strip('\u200c') for x in el]
            irregular_noun[el[0]] = el[1]
            lexicon_stem.add(el[0])

        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
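

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal demonstration of
# clean_text, which needs no external resources. The Persian sample string
# and the expected output are illustrative assumptions. build_stem_dictionary
# is not exercised here because it requires a normalizer object exposing
# sub_alphabets() plus local resource files (the treebank directory, a
# verb-tense list, and a broken-plural noun list).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    helper = DataHelper()
    raw = "قیمت 12.5 دلار\n\nاست!!"
    cleaned = helper.clean_text(raw, new_line_elimination=True)
    # Newlines are collapsed to spaces, disallowed characters are dropped,
    # and the protected number is restored: "قیمت 12.5 دلار است!!"
    print(cleaned)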