Upload files to "main_qa_data"
This commit is contained in:
parent
617a83b36d
commit
d8d863eebe
main_qa_data/data_helper.py (new file, 152 lines)
@@ -0,0 +1,152 @@
import pickle
import re
import string
import os


class DataHelper:
    def __init__(self):
        pass

    def clean_text(self, text_doc, new_line_elimination):
        # Build a punctuation character class (Persian + ASCII), keeping "@"
        # out of it; this class is only used by the disabled steps below.
        punctuations = r')(}{:؟!،؛»«.' + r"/<>?.,:;"
        punctuations = '[' + punctuations + string.punctuation + ']'
        punctuations = punctuations.replace("@", "")

        text_doc = text_doc.strip()

        # Disabled preprocessing steps: strip @mentions / pad punctuation.
        # pattern = r'\s*@[a-zA-Z0-9]*\s*'
        # newstring = re.sub(pattern, self.eliminate_pattern, text_doc)
        # pattern = r'\s*' + punctuations + r'+\s*'
        # newstring = re.sub(pattern, self.add_space, newstring)

        # Mask every number with a placeholder so the character filtering
        # below cannot mangle decimal points; the numbers are restored last.
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        # Optionally collapse runs of newlines into a single space.
        if new_line_elimination:
            newstring = re.sub(r'[\n]+', " ", newstring)

        # Drop every character that is not whitelisted punctuation, a Latin
        # letter or digit, a Persian letter, a digit, or whitespace.  The
        # hyphen sits at the end of the class so it is taken literally
        # instead of forming a character range.
        punctuations = r")(}{:؟!،؛»«.@$&%" + r"/<>?.,:;-"
        latinLettersDigits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latinLettersDigits + 'آ-ی' + r'\d\s:]'
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        # Squeeze repeated spaces into one.
        newstring = re.sub(r'[ ]+', ' ', newstring)

        # Restore the masked numbers in their original order.
        for number in nums_list:
            newstring = re.sub('floatingpointnumber', number, newstring, 1)

        return newstring
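
    # A minimal sketch of how clean_text behaves, assuming the whitelist
    # above; the sample text is hypothetical, not taken from this repository:
    #   helper = DataHelper()
    #   helper.clean_text("قیمت 12.5 بود!\n\nok", new_line_elimination=True)
    #   # -> the number comes back intact because it was masked before the
    #   #    character filtering, and the newline run becomes a single space.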

    def add_space(self, mystring):
        # re.sub callback: pad the matched punctuation with single spaces.
        mystring = mystring.group()      # the string matched by re
        mystring = mystring.strip(' ')   # drop whitespace around the punctuation
        mystring = " " + mystring + " "  # add a space before and after it
        return mystring

    def replace_newline_with_dot(self, mystring):
        # re.sub callback: replace a newline run with a padded dot.
        return ' . '

    def eliminate_pattern(self, mystring):
        # re.sub callback: delete the matched text.
        return ""

    def load_var(self, load_path):
        # Unpickle and return a previously saved variable.
        with open(load_path, 'rb') as file:
            return pickle.load(file)

    def save_var(self, save_path, variable):
        print("saving vars ...")
        # The context manager closes the file even if pickling fails.
        with open(save_path, 'wb') as file:
            pickle.dump(variable, file)
        print("variable saved.")
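
    # Hypothetical round trip for the two pickle helpers above (the file
    # name is an assumption for illustration):
    #   helper = DataHelper()
    #   helper.save_var("vocab.pkl", {"سلام": 1})
    #   assert helper.load_var("vocab.pkl") == {"سلام": 1}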

    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        verb_p2f_map = {}  # past stem -> present stem
        verb_f2p_map = {}  # present stem -> past stem

        # Harvest noun and verb stems from the dependency-treebank files.
        for fileName in os.listdir(path_dir):
            file_path = path_dir + "/" + fileName
            with open(file_path, "r") as infile:
                input_content = infile.readlines()
            for el in input_content:
                el = normalizer.sub_alphabets(el)
                el = el.split("\t")
                # el[3] is the POS column, so require at least four fields
                # (the original len(el) > 2 guard could raise an IndexError).
                if len(el) > 3:
                    tmp_pos = "V" if el[3] == 'V' else "N"
                    # The lemma column may hold "#"-separated verb stems:
                    # past#present, or prefix#past#present.
                    stem_word = [x.strip('\u200c') for x in el[2].split("#")]
                    if tmp_pos == "V" and len(stem_word) == 2:
                        if len(stem_word[0]) != 0 and len(stem_word[1]) != 0:
                            verb_p2f_map[stem_word[0]] = stem_word[1]
                            verb_f2p_map[stem_word[1]] = stem_word[0]
                            verb_stem.add(stem_word[0])
                            verb_stem.add(stem_word[1])
                    if tmp_pos == 'V' and len(stem_word) == 3:
                        if len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0:
                            verb_p2f_map[stem_word[1]] = stem_word[2]
                            verb_f2p_map[stem_word[2]] = stem_word[1]
                            verb_stem.add(stem_word[1])
                            verb_stem.add(stem_word[2])
                    for t in stem_word:
                        if len(t) > 1 and tmp_pos == 'N':
                            lexicon_stem.add(t)

        # Merge in the explicit past/present verb-stem list.
        with open(verb_tense_path, "r") as bon_file:
            bon_file_content = bon_file.readlines()
        for el in bon_file_content:
            el = normalizer.sub_alphabets(el.strip())
            el = [x.strip('\u200c') for x in el.split()]
            if len(el) < 2:  # skip blank or malformed lines
                continue
            verb_p2f_map[el[0]] = el[1]
            verb_f2p_map[el[1]] = el[0]
            verb_stem.add(el[0])
            verb_stem.add(el[1])

        # Irregular ("mokasar") nouns: two tab-separated forms per line.
        irregular_noun = {}
        with open(mokasar_noun_path, "r") as infile:
            input_content = infile.readlines()
        for el in input_content:
            el = normalizer.sub_alphabets(el)
            el = el.replace("\t\t", "\t")
            el = [x.strip('\u200c') for x in el.strip().split("\t")]
            if len(el) < 2:  # skip blank or malformed lines
                continue
            irregular_noun[el[0]] = el[1]
            lexicon_stem.add(el[0])

        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
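

# A minimal, hypothetical driver wiring the class together.  The normalizer
# object (anything exposing sub_alphabets()) and the resource paths are
# assumptions based on the signatures above, not files known to exist here.
if __name__ == "__main__":
    helper = DataHelper()
    print(helper.clean_text("یک متن نمونه 12.5 \n\n test", new_line_elimination=True))
    # build_stem_dictionary additionally needs the treebank directory plus
    # the verb-tense and irregular-noun resource files:
    # lexicon_stem, verb_stem, verb_tense_map, irregular_noun = \
    #     helper.build_stem_dictionary(normalizer,
    #                                  "resource/verb_tense.txt",    # assumed path
    #                                  "resource/mokasar_noun.txt")  # assumed path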