diff --git a/main_qa_data/data_helper.py b/main_qa_data/data_helper.py
new file mode 100644
index 0000000..fe3a854
--- /dev/null
+++ b/main_qa_data/data_helper.py
@@ -0,0 +1,123 @@
+import os
+import pickle
+import re
+
+
+class DataHelper:
+
+    def clean_text(self, text_doc, new_line_elimination):
+        text_doc = text_doc.strip()
+
+        # protect numbers (integers and decimals) behind a placeholder so
+        # the character cleanup below cannot break them apart
+        num_pattern = r"[-+]?\d*\.\d+|\d+"
+        nums_list = re.findall(num_pattern, text_doc)
+        newstring = re.sub(num_pattern, 'floatingpointnumber', text_doc)
+
+        # optionally collapse runs of newlines into a single space
+        if new_line_elimination:
+            newstring = re.sub(r'\n+', " ", newstring)
+
+        # drop every character that is not whitelisted punctuation, a Latin
+        # letter or digit, a Persian letter (آ-ی), a ZWNJ (U+200C), a digit,
+        # or whitespace; the hyphen is escaped so it cannot act as a range
+        punctuations = r")(}{:؟!\-،؛»«.@$&%" + r"/<>?.,:;"
+        latin_letters_digits = r"a-zA-Z0-9"
+        pattern = r'[^' + punctuations + latin_letters_digits + 'آ-ی' + '\u200c' + r'\d\s:]'
+        newstring = re.sub(pattern, self.eliminate_pattern, newstring)
+
+        # squeeze runs of spaces into one
+        newstring = re.sub(r'[ ]+', ' ', newstring)
+
+        # put the protected numbers back, in their original order
+        for number in nums_list:
+            newstring = re.sub('floatingpointnumber', number, newstring, 1)
+
+        return newstring
+
+    def add_space(self, mystring):
+        # re.sub passes a match object; pad the matched punctuation with
+        # a single space on each side
+        mystring = mystring.group()
+        mystring = mystring.strip(' ')
+        return " " + mystring + " "
+
+    def replace_newline_with_dot(self, mystring):
+        return ' . \n'
+
+    def eliminate_pattern(self, mystring):
+        return ""
+
+    def load_var(self, load_path):
+        with open(load_path, 'rb') as var_file:
+            return pickle.load(var_file)
+
+    def save_var(self, save_path, variable):
+        print("saving vars ...")
+        with open(save_path, 'wb') as var_file:
+            pickle.dump(variable, var_file)
+        print("variable saved.")
+
+    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
+        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
+        lexicon_stem = set()
+        verb_stem = set()
+        verb_p2f_map = {}  # past stem -> present stem
+        verb_f2p_map = {}  # present stem -> past stem
+        for file_name in os.listdir(path_dir):
+            file_path = os.path.join(path_dir, file_name)
+            with open(file_path, "r", encoding="utf-8") as treebank_file:
+                input_content = treebank_file.readlines()
+            for el in input_content:
+                el = normalizer.sub_alphabets(el)
+                el = el.split("\t")
+                if len(el) <= 3:
+                    continue
+                tmp_pos = "V" if el[3] == 'V' else "N"
+                stem_word = el[2].split("#")
+                stem_word = [x.strip('\u200c') for x in stem_word]
+                if tmp_pos == "V" and len(stem_word) == 2:
+                    if len(stem_word[0]) != 0 and len(stem_word[1]) != 0:
+                        verb_p2f_map[stem_word[0]] = stem_word[1]
+                        verb_f2p_map[stem_word[1]] = stem_word[0]
+                        verb_stem.add(stem_word[0])
+                        verb_stem.add(stem_word[1])
+                if tmp_pos == 'V' and len(stem_word) == 3:
+                    # prefixed verb: element 0 is the prefix, 1 and 2 the stems
+                    if len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0:
+                        verb_p2f_map[stem_word[1]] = stem_word[2]
+                        verb_f2p_map[stem_word[2]] = stem_word[1]
+                        verb_stem.add(stem_word[1])
+                        verb_stem.add(stem_word[2])
+                for t in stem_word:
+                    if len(t) > 1 and tmp_pos == 'N':
+                        lexicon_stem.add(t)
+
+        with open(verb_tense_path, "r", encoding="utf-8") as bon_file:
+            bon_file_content = bon_file.readlines()
+        for el in bon_file_content:
+            el = normalizer.sub_alphabets(el.strip())
+            el = el.split()
+            el = [x.strip('\u200c') for x in el]
+            if len(el) < 2:
+                continue
+            verb_p2f_map[el[0]] = el[1]
+            verb_f2p_map[el[1]] = el[0]
+            verb_stem.add(el[0])
+            verb_stem.add(el[1])
+
+        irregular_noun = {}
+        with open(mokasar_noun_path, "r", encoding="utf-8") as noun_file:
+            input_content = noun_file.readlines()
+        for el in input_content:
+            el = normalizer.sub_alphabets(el)
+            el = el.replace("\t\t", "\t")
+            el = el.strip().split("\t")
+            el = [x.strip('\u200c') for x in el]
+            if len(el) < 2:
+                continue
+            irregular_noun[el[0]] = el[1]  # broken-plural noun -> its stem
+            lexicon_stem.add(el[0])
+
+        verb_tense_map = [verb_p2f_map, verb_f2p_map]
+        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
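Usage note (not part of the diff): a minimal sketch of how the class above can be driven. It assumes the treebank directory referenced in build_stem_dictionary exists on disk, and that the normalizer argument is any object exposing a sub_alphabets(str) -> str method; the PlainNormalizer below is a hypothetical stand-in, and the two file paths are placeholders.

    from main_qa_data.data_helper import DataHelper

    class PlainNormalizer:
        # hypothetical no-op normalizer; a real one would unify
        # Arabic/Persian letter variants before stems are compared
        def sub_alphabets(self, text):
            return text

    helper = DataHelper()

    # collapses newlines, strips non-whitelisted characters, keeps 3.14 intact
    print(helper.clean_text("سلام!!   دنیا 3.14\n\nخط دوم", new_line_elimination=True))

    lexicon_stem, verb_stem, verb_tense_map, irregular_noun = helper.build_stem_dictionary(
        PlainNormalizer(),
        verb_tense_path="path/to/verb_tense_file",      # placeholder path
        mokasar_noun_path="path/to/mokasar_noun_file",  # placeholder path
    )
    verb_p2f_map, verb_f2p_map = verb_tense_map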