Compare commits

..

3 Commits

3 changed files with 104 additions and 31 deletions

View File

@@ -21,12 +21,10 @@ python flair_ner_train.py
````
## Project Structure
A simple view of project's dependencies tree:
A simple view of project tree:
![project structure image](./images/project_structure.png)
This image made with **pylint** and **graphviz**:
````shell
pip install pylint graphviz
pyreverse -o png -p <YourProjectName> <path/to/your/project>

View File

@@ -250,18 +250,54 @@ class Normalizer():
L = wrds.__len__()
if L < 2:
return doc_string
cnt = 1
for i in range(0, L - 1):
w = wrds[i] + wrds[i + 1]
# NOTE: مشکل کد زیر :
# NOTE: یک کلمه می تواند هم با کلمه قبل و هم با کلمه بعد ترکیب موفق درست کند و این باعث تکرار یک تک کلمه در دو ترکیب می شود.
# cnt = 1
# for i in range(0, L - 1):
# w = wrds[i] + wrds[i + 1]
# try:
# out_sentences = out_sentences + ' ' + self.dic2[w]
# cnt = 0
# except KeyError:
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i]
# cnt = 1
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 1]
# NOTE: کد جایگزین
# cnt = 1
# for i in range(L-1):
# if cnt == 0:
# cnt = 1
# continue
# w = wrds[i] + wrds[i+1]
# try:
# out_sentences = out_sentences + ' ' + self.dic2[w]
# cnt = 0
# except KeyError:
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i]
# cnt = 1
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[L-1]
# NOTE: کد جایگزین دیگر
i = 0
while i > L -1:
w = wrds[i] + wrds[i+1]
try:
out_sentences = out_sentences + ' ' + self.dic2[w]
cnt = 0
i = i + 2
except KeyError:
if cnt == 1:
out_sentences = out_sentences + ' ' + wrds[i]
cnt = 1
if cnt == 1:
out_sentences = out_sentences + ' ' + wrds[i + 1]
out_sentences = out_sentences + ' ' + wrds[i]
i = i + 1
if i == L - 1:
out_sentences = out_sentences + ' ' + wrds[L-1]
return out_sentences
def space_correction_plus3(self, doc_string):
@@ -271,24 +307,63 @@ class Normalizer():
L = wrds.__len__()
if L < 3:
return doc_string
cnt = 1
cnt2 = 0
for i in range(0, L - 2):
# NOTE: مشکل کد زیر :
# NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود.
# cnt = 1
# cnt2 = 0
# for i in range(0, L - 2):
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
# try:
# out_sentences = out_sentences + ' ' + self.dic3[w]
# cnt = 0
# cnt2 = 2
# except KeyError:
# if cnt == 1 and cnt2 == 0:
# out_sentences = out_sentences + ' ' + wrds[i]
# else:
# cnt2 -= 1
# cnt = 1
# if cnt == 1 and cnt2 == 0:
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
# elif cnt == 1 and cnt2 == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 2]
# NOTE: کد جایگزین
# cnt = 0
# for i in range(0, L - 2):
# if cnt > 0:
# cnt -= 1
# continue
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
# try:
# out_sentences = out_sentences + ' ' + self.dic3[w]
# cnt = 2
# except KeyError:
# out_sentences = out_sentences + ' ' + wrds[i]
# if cnt == 2:
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
# elif cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 2]
# return out_sentences
# NOTE: کد جایگزین دیگر
i = 0
while i < L - 2:
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
try:
out_sentences = out_sentences + ' ' + self.dic3[w]
cnt = 0
cnt2 = 2
except KeyError:
if cnt == 1 and cnt2 == 0:
out_sentences = out_sentences + ' ' + wrds[i]
else:
cnt2 -= 1
cnt = 1
if cnt == 1 and cnt2 == 0:
out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
elif cnt == 1 and cnt2 == 1:
out_sentences = out_sentences + ' ' + wrds[i + 2]
if w in self.dic3:
out_sentences += ' ' + self.dic3[w]
i += 3
else:
out_sentences += ' ' + wrds[i]
i += 1
while i < L:
out_sentences += ' ' + wrds[i]
i += 1
return out_sentences
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):

View File

@@ -16,7 +16,7 @@ class Tokenizer():
nums_list = re.findall(pattern, doc_string)
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
pattern = r'([!\.\?؟]+)[\n]*'
pattern = r'([!\.\?؟]+)\n*'
tmp = re.findall(pattern, doc_string)
doc_string = re.sub(pattern, self.add_tab, doc_string)
@@ -32,7 +32,7 @@ class Tokenizer():
tmp = re.findall(pattern, doc_string)
doc_string = re.sub(pattern, self.add_tab, doc_string)
pattern = r'[\n]+'
pattern = r'\n+'
doc_string = re.sub(pattern, self.add_tab, doc_string)
for number in nums_list: