import re


class Tokenizer():
    """Whitespace word tokenizer and punctuation-based sentence splitter.

    The sentence splitter recognises both Latin and Arabic-script
    punctuation (``؟`` and ``؛``).
    """

    # Characters removed from both ends of every word token:
    # U+200C (zero width non-joiner) and U+200F (right-to-left mark).
    _INVISIBLE_MARKS = "\u200c\u200f"

    # Signed/unsigned decimals and plain integers.
    _NUMBER_PATTERN = r"[-+]?\d*\.\d+|\d+"

    # Stand-in for numbers while sentence boundaries are located, so that
    # the '.' inside "3.14" is not mistaken for a sentence terminator.
    _NUMBER_PLACEHOLDER = 'floatingpointnumber'

    # Sentence-boundary patterns; the order of application matters.
    _BOUNDARY_PATTERNS = (
        r'([!\.\?؟]+)[\n]*',  # run of terminators plus any following newlines
        r':\n',               # colon at end of line
        r';\n',               # semicolon at end of line
        r'؛\n',               # Arabic semicolon at end of line
        r'[\n]+',             # any remaining newline run
    )

    def __init__(self):
        pass

    def tokenize_words(self, doc_string):
        """Split *doc_string* on whitespace into a list of word tokens.

        U+200C and U+200F characters are stripped from both ends of each
        token. Tokens that are empty after stripping are dropped.

        Args:
            doc_string: text to tokenize (str).

        Returns:
            List of non-empty token strings, in input order.
        """
        stripped_tokens = (
            token.strip(self._INVISIBLE_MARKS)
            for token in doc_string.strip().split()
        )
        # Keep only tokens with visible content; a token consisting solely
        # of invisible marks must not yield an empty or duplicated entry.
        return [token for token in stripped_tokens if token]

    def tokenize_sentences(self, doc_string):
        """Split *doc_string* into sentences.

        Numbers are temporarily replaced by a placeholder so their decimal
        points are not treated as terminators, each boundary is rewritten
        by ``add_tab`` to end in a double tab, the numbers are restored in
        their original order, and the text is split on the double tab.

        Args:
            doc_string: text to split (str).

        Returns:
            List of non-empty sentence strings. Terminating punctuation is
            kept, preceded by a single space; surrounding spaces of the
            input are otherwise left as they are.
        """
        # Hide the numbers.
        nums_list = re.findall(self._NUMBER_PATTERN, doc_string)
        doc_string = re.sub(
            self._NUMBER_PATTERN, self._NUMBER_PLACEHOLDER, doc_string
        )

        # Mark every sentence boundary with a trailing double tab.
        for pattern in self._BOUNDARY_PATTERNS:
            doc_string = re.sub(pattern, self.add_tab, doc_string)

        # Restore the numbers left-to-right in a single pass. If the input
        # itself contained the placeholder text there are more placeholders
        # than numbers; the surplus ones are left untouched.
        remaining_numbers = iter(nums_list)
        doc_string = re.sub(
            self._NUMBER_PLACEHOLDER,
            lambda match: next(remaining_numbers, match.group()),
            doc_string,
        )

        return [
            sentence for sentence in doc_string.split('\t\t') if sentence
        ]

    def add_tab(self, mystring):
        """Return the replacement text for one sentence-boundary match.

        Used as the replacement callback of ``re.sub``.

        Args:
            mystring: the ``re.Match`` object of the boundary.

        Returns:
            The matched text with surrounding spaces and then surrounding
            newlines removed, prefixed by one space and followed by two
            tab characters (the sentence separator).
        """
        matched_text = mystring.group()          # the text matched by re
        matched_text = matched_text.strip(' ')   # omit spaces around the punctuation
        matched_text = matched_text.strip('\n')  # omit newlines around the punctuation
        # One space before the punctuation, a double tab after it.
        return " " + matched_text + "\t\t"