Upload files to "import_data"
This commit is contained in:
parent
26a283cc16
commit
2e141b5cca
59
import_data/tokenizer.py
Normal file
59
import_data/tokenizer.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
import re
|
||||
|
||||
|
||||
class Tokenizer:
    """Whitespace word tokenizer and punctuation/newline sentence splitter.

    Besides ASCII punctuation, the sentence splitter recognises the Arabic
    question mark (U+061F) and the Arabic semicolon (U+061B). The word
    tokenizer removes zero-width non-joiner (U+200C) and right-to-left mark
    (U+200F) characters from the edges of each token.
    """

    # Invisible characters removed from the edges of every word token.
    _EDGE_MARKS = "\u200c\u200f"

    # Numbers are swapped for this placeholder while splitting so that the
    # decimal point in e.g. "3.14" is not mistaken for a sentence terminator.
    _NUMBER_PLACEHOLDER = 'floatingpointnumber'
    _NUMBER_RE = re.compile(r"[-+]?\d*\.\d+|\d+")

    # A run of sentence terminators plus any newlines directly after it.
    _TERMINATOR_RE = re.compile(r'([!\.\?؟]+)[\n]*')
    # A colon / semicolon / Arabic semicolon that ends a line.
    _LINE_END_MARK_RE = re.compile(r'[:;؛]\n')
    # Any remaining run of newlines.
    _NEWLINES_RE = re.compile(r'[\n]+')

    # Marker inserted by ``add_tab`` at every sentence boundary.
    _BOUNDARY = "\t\t"

    def __init__(self):
        pass

    def tokenize_words(self, doc_string):
        """Split *doc_string* on whitespace into cleaned word tokens.

        Leading and trailing U+200C / U+200F characters are removed from
        each token; occurrences inside a token are kept. Tokens that
        consist only of those characters are dropped.

        Args:
            doc_string: The text to tokenize.

        Returns:
            A list of non-empty token strings in document order.
        """
        tokens = []
        for raw_token in doc_string.strip().split():
            cleaned = raw_token.strip(self._EDGE_MARKS)
            # Skip tokens that were nothing but zero-width marks instead of
            # emitting a stale or empty token for them.
            if cleaned:
                tokens.append(cleaned)
        return tokens

    def tokenize_sentences(self, doc_string):
        """Split *doc_string* into sentences.

        Boundaries are placed after runs of ``!``, ``.``, ``?`` or ``؟``,
        after ``:``, ``;`` or ``؛`` when directly followed by a newline,
        and at every remaining run of newlines. A single space is inserted
        before each boundary punctuation mark (see ``add_tab``), and
        sentences are not stripped of surrounding spaces.

        Args:
            doc_string: The text to split.

        Returns:
            A list of non-empty sentence strings in document order.
        """
        # Hide the numbers so their decimal points do not split sentences.
        # NOTE: input that already contains the literal placeholder word
        # will have it replaced by one of the extracted numbers.
        numbers = self._NUMBER_RE.findall(doc_string)
        text = self._NUMBER_RE.sub(self._NUMBER_PLACEHOLDER, doc_string)

        # Order matters: terminators first (they swallow trailing newlines),
        # then line-ending colons/semicolons, then any leftover newlines.
        text = self._TERMINATOR_RE.sub(self.add_tab, text)
        text = self._LINE_END_MARK_RE.sub(self.add_tab, text)
        text = self._NEWLINES_RE.sub(self.add_tab, text)

        # Put the numbers back in order in a single pass. Placeholders
        # beyond the number of extracted numbers are left as they are.
        pending = iter(numbers)
        text = re.sub(
            self._NUMBER_PLACEHOLDER,
            lambda match: next(pending, match.group()),
            text,
        )

        return [sentence for sentence in text.split(self._BOUNDARY) if sentence]

    def add_tab(self, mystring):
        """Build the replacement text for one boundary match.

        Intended to be used as the replacement callback of ``re.sub``.

        Args:
            mystring: The match object for the boundary (despite the
                parameter name, this is not a plain string).

        Returns:
            The matched text with surrounding spaces and then surrounding
            newlines removed, prefixed with one space and followed by the
            two-tab sentence boundary marker.
        """
        matched = mystring.group().strip(' ').strip('\n')
        return " " + matched + "\t\t"
|
Loading…
Reference in New Issue
Block a user