From 7b9537dd55f4cf299d9f15fe01c8cf6845146f4d Mon Sep 17 00:00:00 2001
From: m_ghazizadeh <selfless315programmer@gmail.com>
Date: Mon, 30 Jun 2025 18:24:43 +0330
Subject: [PATCH 1/3] Remove redundant [] in two regex patterns where they
 are not needed

---
 tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index 59694cd..946a7ad 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -16,7 +16,7 @@ class Tokenizer():
         nums_list = re.findall(pattern, doc_string)
         doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
 
-        pattern = r'([!\.\?؟]+)[\n]*'
+        pattern = r'([!\.\?؟]+)\n*'
         tmp = re.findall(pattern, doc_string)
         doc_string = re.sub(pattern, self.add_tab, doc_string)
 
@@ -32,7 +32,7 @@ class Tokenizer():
         tmp = re.findall(pattern, doc_string)
         doc_string = re.sub(pattern, self.add_tab, doc_string)
 
-        pattern = r'[\n]+'
+        pattern = r'\n+'
         doc_string = re.sub(pattern, self.add_tab, doc_string)
 
         for number in nums_list:

From d6f3f95c37d6c6ba92ecf555d6dec81145c24f22 Mon Sep 17 00:00:00 2001
From: m_ghazizadeh <selfless315programmer@gmail.com>
Date: Tue, 1 Jul 2025 14:58:17 +0330
Subject: [PATCH 2/3] Prevent space_correction_plus2() from repeating one word
 in two combinations

---
 normalizer.py | 54 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/normalizer.py b/normalizer.py
index 64890b3..fbd7d89 100644
--- a/normalizer.py
+++ b/normalizer.py
@@ -250,18 +250,54 @@ class Normalizer():
         L = wrds.__len__()
         if L < 2:
             return doc_string
-        cnt = 1
-        for i in range(0, L - 1):
-            w = wrds[i] + wrds[i + 1]
+            
+        # NOTE: مشکل کد زیر :
+        # NOTE: یک کلمه می تواند هم با کلمه قبل و هم با کلمه بعد ترکیب موفق درست کند و این باعث تکرار یک تک کلمه در دو ترکیب می شود.
+        
+        # cnt = 1
+        #     for i in range(0, L - 1):
+        #         w = wrds[i] + wrds[i + 1]
+        #         try:
+        #             out_sentences = out_sentences + ' ' + self.dic2[w]
+        #             cnt = 0
+        #         except KeyError:
+        #             if cnt == 1:
+        #                 out_sentences = out_sentences + ' ' + wrds[i]
+        #             cnt = 1
+        #     if cnt == 1:
+        #         out_sentences = out_sentences + ' ' + wrds[i + 1]
+        
+        # NOTE: کد جایگزین 
+
+        # cnt = 1
+        # for i in range(L-1):
+        #     if cnt == 0:
+        #         cnt = 1 
+        #         continue
+        #     w = wrds[i] + wrds[i+1]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic2[w]
+        #         cnt = 0
+        #     except KeyError:
+        #         if cnt == 1:
+        #             out_sentences = out_sentences + ' ' + wrds[i]
+        #         cnt = 1
+        # if cnt == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[L-1]
+
+        # NOTE: کد جایگزین دیگر
+
+        i = 0
+        while i < L - 1:  # stop one short of the end so wrds[i+1] is always valid
+            w = wrds[i] + wrds[i+1]
             try:
                 out_sentences = out_sentences + ' ' + self.dic2[w]
-                cnt = 0
+                i = i + 2  # both words were consumed by the merged entry
             except KeyError:
-                if cnt == 1:
-                    out_sentences = out_sentences + ' ' + wrds[i]
-                cnt = 1
-        if cnt == 1:
-            out_sentences = out_sentences + ' ' + wrds[i + 1]
+                out_sentences = out_sentences + ' ' + wrds[i]
+                i = i + 1
+        if i == L - 1:  # last word was not part of a merged pair; emit it as-is
+            out_sentences = out_sentences + ' ' + wrds[L-1]
         return out_sentences
 
     def space_correction_plus3(self, doc_string):

From 93142f70e92b22e7820490d69abadd66604b5692 Mon Sep 17 00:00:00 2001
From: m_ghazizadeh <selfless315programmer@gmail.com>
Date: Tue, 1 Jul 2025 14:59:56 +0330
Subject: [PATCH 3/3] Prevent space_correction_plus3() from repeating one word
 in three combinations

---
 normalizer.py | 73 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/normalizer.py b/normalizer.py
index fbd7d89..0e3e1e5 100644
--- a/normalizer.py
+++ b/normalizer.py
@@ -307,24 +307,63 @@ class Normalizer():
         L = wrds.__len__()
         if L < 3:
             return doc_string
-        cnt = 1
-        cnt2 = 0
-        for i in range(0, L - 2):
+        
+        # NOTE: مشکل کد زیر :
+        # NOTE: این کد نیز مشکل ترکیبات هم پوشان را دارد به طوری که یک کلمه می تواند در سه ترکیب (یک بار انتها و یک بار وسط و یک بار ابتدای ترکیب) تکرار شود.
+
+        # cnt = 1
+        # cnt2 = 0
+        # for i in range(0, L - 2):
+        #     w = wrds[i] + wrds[i + 1] + wrds[i + 2]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic3[w]
+        #         cnt = 0
+        #         cnt2 = 2
+        #     except KeyError:
+        #         if cnt == 1 and cnt2 == 0:
+        #             out_sentences = out_sentences + ' ' + wrds[i]
+        #         else:
+        #             cnt2 -= 1
+        #         cnt = 1
+        # if cnt == 1 and cnt2 == 0:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
+        # elif cnt == 1 and cnt2 == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 2]
+
+        # NOTE: کد جایگزین 
+
+        # cnt = 0
+        # for i in range(0, L - 2):
+        #     if cnt > 0:
+        #         cnt -= 1
+        #         continue
+        #     w = wrds[i] + wrds[i + 1] + wrds[i + 2]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic3[w]
+        #         cnt = 2
+        #     except KeyError:
+        #         out_sentences = out_sentences + ' ' + wrds[i]
+        # if cnt == 2:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
+        # elif cnt == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 2]
+
+        # return out_sentences
+
+        # NOTE: کد جایگزین دیگر
+
+        i = 0
+        while i < L - 2:
             w = wrds[i] + wrds[i + 1] + wrds[i + 2]
-            try:
-                out_sentences = out_sentences + ' ' + self.dic3[w]
-                cnt = 0
-                cnt2 = 2
-            except KeyError:
-                if cnt == 1 and cnt2 == 0:
-                    out_sentences = out_sentences + ' ' + wrds[i]
-                else:
-                    cnt2 -= 1
-                cnt = 1
-        if cnt == 1 and cnt2 == 0:
-            out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
-        elif cnt == 1 and cnt2 == 1:
-            out_sentences = out_sentences + ' ' + wrds[i + 2]
+            if w in self.dic3:
+                out_sentences += ' ' + self.dic3[w]
+                i += 3  
+            else:
+                out_sentences += ' ' + wrds[i]
+                i += 1
+        while i < L:
+            out_sentences += ' ' + wrds[i]
+            i += 1
         return out_sentences
 
     def normalize(self, doc_string, new_line_elimination=False, return_dates = False):