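"""Data helpers for a Persian NLP pipeline: text cleaning, pickle I/O, and
stem-dictionary construction from the Persian Dependency Treebank."""
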
import pickle
import re
import string
import os


class DataHelper:
    """Utility class for cleaning Persian text and pickling intermediate data."""

    def __init__(self):
        pass

    def clean_text(self, text_doc, new_line_elimination):
        # Build a character class of Persian and Latin punctuation marks;
        # "@" is dropped from the class so that mentions survive cleaning.
        punctuations = r')(}{:؟!،؛»«.' + r"/<>?.,:;"
        punctuations = '[' + punctuations + string.punctuation + ']'
        punctuations = punctuations.replace("@", "")

        text_doc = text_doc.strip()

        # Protect numbers: swap every int/float literal for a placeholder
        # token, remember the originals, and restore them at the end.
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        # Pad every punctuation run with single spaces.
        pattern = r'\s*' + punctuations + r'+\s*'
        newstring = re.sub(pattern, self.add_space, newstring)

        # Collapse newline runs into a single space if the caller asked for it.
        pattern = r'[\n]+'
        if new_line_elimination:
            newstring = re.sub(pattern, " ", newstring)

        # Strip every character that is not whitelisted punctuation, a Latin
        # letter or digit, a Persian letter, a ZWNJ (U+200C), a digit, or
        # whitespace. The dash is escaped so it reads as a literal "-"
        # rather than an accidental character range.
        punctuations = r")(}{:؟!\-،؛»«.@$&%" + r"/<>?.,:;"
        latin_letters_digits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latin_letters_digits + 'آ-ی' + '\u200c' + r'\d\s:]'
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        # Collapse runs of spaces.
        pattern = r'[ ]+'
        newstring = re.sub(pattern, ' ', newstring)

        # Put the original numbers back, one placeholder at a time.
        for number in nums_list:
            newstring = re.sub('floatingpointnumber', number, newstring, 1)

        return newstring

    def add_space(self, mystring):
        # re.sub callback: mystring is a Match object.
        mystring = mystring.group()      # the text matched by the pattern
        mystring = mystring.strip(' ')   # omit the whitespace around the punctuation
        mystring = " " + mystring + " "  # add a space before and after the punctuation
        return mystring

    def replace_newline_with_dot(self, mystring):
        return ' . '

    def eliminate_pattern(self, mystring):
        return ""

    def load_var(self, load_path):
        # "with" closes the file even if unpickling raises.
        with open(load_path, 'rb') as f:
            return pickle.load(f)

    def save_var(self, save_path, variable):
        print("saving vars ...")
        with open(save_path, 'wb') as f:
            pickle.dump(variable, f)
        print("variable saved.")

    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        # Harvest stems from the Persian Dependency Treebank (tab-separated,
        # CoNLL-style rows; column 2 holds the lemma, column 3 the coarse POS).
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        verb_p2f_map = {}  # past stem -> present (future) stem
        verb_f2p_map = {}  # present stem -> past stem
        for file_name in os.listdir(path_dir):
            file_path = os.path.join(path_dir, file_name)
            with open(file_path, "r") as infile:
                for el in infile:
                    el = normalizer.sub_alphabets(el)
                    el = el.split("\t")
                    if len(el) > 3:  # need at least the POS column (index 3)
                        tmp_pos = "V" if el[3] == 'V' else "N"
                        # Verb lemmas are stored as "#"-separated stem pairs,
                        # optionally with a prefix as a third part.
                        stem_word = el[2].split("#")
                        stem_word = [x.strip('\u200c') for x in stem_word]
                        if tmp_pos == "V" and len(stem_word) == 2:
                            if len(stem_word[0]) != 0 and len(stem_word[1]) != 0:
                                verb_p2f_map[stem_word[0]] = stem_word[1]
                                verb_f2p_map[stem_word[1]] = stem_word[0]
                                verb_stem.add(stem_word[0])
                                verb_stem.add(stem_word[1])
                        if tmp_pos == 'V' and len(stem_word) == 3:
                            if len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0:
                                verb_p2f_map[stem_word[1]] = stem_word[2]
                                verb_f2p_map[stem_word[2]] = stem_word[1]
                                verb_stem.add(stem_word[1])
                                verb_stem.add(stem_word[2])
                        for t in stem_word:
                            if len(t) > 1 and tmp_pos == 'N':
                                lexicon_stem.add(t)

        # Add the hand-written verb-stem list: each line holds a
        # whitespace-separated pair of past and present stems.
        with open(verb_tense_path, "r") as bon_file:
            for el in bon_file:
                el = el.strip()
                el = normalizer.sub_alphabets(el)
                el = el.split()
                el = [x.strip('\u200c') for x in el]
                verb_p2f_map[el[0]] = el[1]
                verb_f2p_map[el[1]] = el[0]
                verb_stem.add(el[0])
                verb_stem.add(el[1])

        # Irregular ("mokasar") nouns: each line maps one tab-separated
        # noun form to another.
        irregular_noun = {}
        with open(mokasar_noun_path, "r") as infile:
            for el in infile:
                el = normalizer.sub_alphabets(el)
                el = el.replace("\t\t", "\t")
                el = el.strip().split("\t")
                el = [x.strip('\u200c') for x in el]
                irregular_noun[el[0]] = el[1]
                lexicon_stem.add(el[0])

        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
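

# Minimal usage sketch for the self-contained parts of DataHelper. The file
# name "stems.pckl" and the sample sentence are illustrative only.
# build_stem_dictionary() is not demoed here because it additionally needs a
# normalizer exposing sub_alphabets() and the treebank resource files.
if __name__ == "__main__":
    helper = DataHelper()

    # Clean a small mixed Persian/Latin snippet: punctuation gets padded with
    # spaces, and numbers survive the round-trip through the placeholder token.
    sample = "قیمت 12.5 دلار است!   ok, next\nline"
    print(helper.clean_text(sample, new_line_elimination=True))

    # Persist an arbitrary Python object and read it back.
    helper.save_var("stems.pckl", {"stems": ["رفت", "رو"]})
    print(helper.load_var("stems.pckl"))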