import re

# Decimal numbers with optional sign (e.g. "-3.14", ".5") or plain digit runs.
_NUMBER_RE = re.compile(r"[-+]?\d*\.\d+|\d+")
# A run of sentence-final punctuation (Latin and Arabic-script question mark)
# together with any newlines that directly follow it.
_SENTENCE_END_RE = re.compile(r'([!\.\?؟]+)[\n]*')
# A colon / semicolon / Arabic semicolon that ends a line.
_CLAUSE_BREAK_RE = re.compile(r'[:;؛]\n')
# Any remaining run of newlines.
_NEWLINES_RE = re.compile(r'[\n]+')

# Zero-width non-joiner and right-to-left mark, removed from token edges.
_EDGE_MARKS = "\u200c\u200f"
# Marker that add_tab() appends after every sentence boundary.
_SENTENCE_BOUNDARY = "\t\t"


class Tokenizer():
    """Whitespace word tokenizer and punctuation/newline sentence splitter."""

    def __init__(self):
        pass

    def tokenize_words(self, doc_string):
        """Split ``doc_string`` on whitespace into word tokens.

        Zero-width non-joiners (U+200C) and right-to-left marks (U+200F) are
        removed from both ends of every token; occurrences inside a token are
        kept. Tokens that consist only of those marks are dropped.

        Args:
            doc_string: Text to tokenize.

        Returns:
            A list of non-empty token strings in document order.
        """
        stripped = (token.strip(_EDGE_MARKS) for token in doc_string.strip().split())
        # Each token is cleaned independently, so a mark-only token can never
        # leak the value of the previous token into the result.
        return [token for token in stripped if token]

    def tokenize_sentences(self, doc_string):
        """Split ``doc_string`` into sentences.

        A boundary is placed after every run of ``!``, ``.``, ``?`` or ``؟``
        (plus newlines that follow it), after ``:``, ``;`` or ``؛`` at the end
        of a line, and at every remaining run of newlines. Numbers such as
        ``3.14`` are shielded first so their dot is not taken for a full stop.

        Args:
            doc_string: Text to split.

        Returns:
            A list of non-empty sentence strings. As produced by add_tab(),
            the boundary punctuation is preceded by a single space, and a
            sentence may start with the whitespace that followed the previous
            boundary.
        """
        numbers = _NUMBER_RE.findall(doc_string)

        # Pick a placeholder that does not already occur in the text, so that
        # restoring the numbers cannot hit a word of the original document.
        placeholder = 'floatingpointnumber'
        while placeholder in doc_string:
            placeholder += 'x'
        doc_string = _NUMBER_RE.sub(placeholder, doc_string)

        # Order matters: punctuation first (it also swallows the newlines that
        # follow it), then line-final clause marks, then leftover newlines.
        doc_string = _SENTENCE_END_RE.sub(self.add_tab, doc_string)
        doc_string = _CLAUSE_BREAK_RE.sub(self.add_tab, doc_string)
        doc_string = _NEWLINES_RE.sub(self.add_tab, doc_string)

        # Put the numbers back in their original order in a single pass.
        pending = iter(numbers)
        doc_string = re.sub(
            re.escape(placeholder),
            lambda match: next(pending, match.group()),
            doc_string,
        )

        return [part for part in doc_string.split(_SENTENCE_BOUNDARY) if part]

    def add_tab(self, mystring):
        """Rewrite one boundary match as ``" <text>\\t\\t"``.

        Used as the replacement callback for ``re.sub``.

        Args:
            mystring: The ``re.Match`` object for the boundary.

        Returns:
            The matched text with surrounding spaces and then surrounding
            newlines removed, prefixed with one space and followed by the
            two-tab sentence boundary marker.
        """
        matched = mystring.group()  # the text matched by the pattern
        matched = matched.strip(' ').strip('\n')
        return " " + matched + _SENTENCE_BOUNDARY