Compare commits

..

3 Commits

3 changed files with 104 additions and 31 deletions

View File

@@ -21,12 +21,10 @@ python flair_ner_train.py
````
## Project Structure
A simple view of project's dependencies tree:
A simple view of project tree:
![project structure image](./images/project_structure.png)
This image made with **pylint** and **graphviz**:
````shell
pip install pylint graphviz
pyreverse -o png -p <YourProjectName> <path/to/your/project>

View File

@@ -250,18 +250,54 @@ class Normalizer():
L = wrds.__len__()
if L < 2:
return doc_string
cnt = 1
for i in range(0, L - 1):
w = wrds[i] + wrds[i + 1]
# NOTE: مشکل کد زیر :
# NOTE: یک کلمه می تواند هم با کلمه قبل و هم با کلمه بعد ترکیب موفق درست کند و این باعث تکرار یک تک کلمه در دو ترکیب می شود.
# cnt = 1
# for i in range(0, L - 1):
# w = wrds[i] + wrds[i + 1]
# try:
# out_sentences = out_sentences + ' ' + self.dic2[w]
# cnt = 0
# except KeyError:
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i]
# cnt = 1
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 1]
# NOTE: کد جایگزین
# cnt = 1
# for i in range(L-1):
# if cnt == 0:
# cnt = 1
# continue
# w = wrds[i] + wrds[i+1]
# try:
# out_sentences = out_sentences + ' ' + self.dic2[w]
# cnt = 0
# except KeyError:
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i]
# cnt = 1
# if cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[L-1]
# NOTE: کد جایگزین دیگر
i = 0
while i > L -1:
w = wrds[i] + wrds[i+1]
try:
out_sentences = out_sentences + ' ' + self.dic2[w]
cnt = 0
i = i + 2
except KeyError:
if cnt == 1:
out_sentences = out_sentences + ' ' + wrds[i]
cnt = 1
if cnt == 1:
out_sentences = out_sentences + ' ' + wrds[i + 1]
out_sentences = out_sentences + ' ' + wrds[i]
i = i + 1
if i == L - 1:
out_sentences = out_sentences + ' ' + wrds[L-1]
return out_sentences
def space_correction_plus3(self, doc_string):
@@ -271,24 +307,63 @@ class Normalizer():
L = wrds.__len__()
if L < 3:
return doc_string
cnt = 1
cnt2 = 0
for i in range(0, L - 2):
# NOTE: مشکل کد زیر :
# NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود.
# cnt = 1
# cnt2 = 0
# for i in range(0, L - 2):
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
# try:
# out_sentences = out_sentences + ' ' + self.dic3[w]
# cnt = 0
# cnt2 = 2
# except KeyError:
# if cnt == 1 and cnt2 == 0:
# out_sentences = out_sentences + ' ' + wrds[i]
# else:
# cnt2 -= 1
# cnt = 1
# if cnt == 1 and cnt2 == 0:
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
# elif cnt == 1 and cnt2 == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 2]
# NOTE: کد جایگزین
# cnt = 0
# for i in range(0, L - 2):
# if cnt > 0:
# cnt -= 1
# continue
# w = wrds[i] + wrds[i + 1] + wrds[i + 2]
# try:
# out_sentences = out_sentences + ' ' + self.dic3[w]
# cnt = 2
# except KeyError:
# out_sentences = out_sentences + ' ' + wrds[i]
# if cnt == 2:
# out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
# elif cnt == 1:
# out_sentences = out_sentences + ' ' + wrds[i + 2]
# return out_sentences
# NOTE: کد جایگزین دیگر
i = 0
while i < L - 2:
w = wrds[i] + wrds[i + 1] + wrds[i + 2]
try:
out_sentences = out_sentences + ' ' + self.dic3[w]
cnt = 0
cnt2 = 2
except KeyError:
if cnt == 1 and cnt2 == 0:
out_sentences = out_sentences + ' ' + wrds[i]
else:
cnt2 -= 1
cnt = 1
if cnt == 1 and cnt2 == 0:
out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
elif cnt == 1 and cnt2 == 1:
out_sentences = out_sentences + ' ' + wrds[i + 2]
if w in self.dic3:
out_sentences += ' ' + self.dic3[w]
i += 3
else:
out_sentences += ' ' + wrds[i]
i += 1
while i < L:
out_sentences += ' ' + wrds[i]
i += 1
return out_sentences
def normalize(self, doc_string, new_line_elimination=False, return_dates = False):

View File

@@ -16,7 +16,7 @@ class Tokenizer():
nums_list = re.findall(pattern, doc_string)
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
pattern = r'([!\.\?؟]+)[\n]*'
pattern = r'([!\.\?؟]+)\n*'
tmp = re.findall(pattern, doc_string)
doc_string = re.sub(pattern, self.add_tab, doc_string)
@@ -32,7 +32,7 @@ class Tokenizer():
tmp = re.findall(pattern, doc_string)
doc_string = re.sub(pattern, self.add_tab, doc_string)
pattern = r'[\n]+'
pattern = r'\n+'
doc_string = re.sub(pattern, self.add_tab, doc_string)
for number in nums_list: