From 93142f70e92b22e7820490d69abadd66604b5692 Mon Sep 17 00:00:00 2001
From: m_ghazizadeh <selfless315programmer@gmail.com>
Date: Tue, 1 Jul 2025 14:59:56 +0330
Subject: [PATCH] Prevent space_correction_plus3() from repeating one word in
 three combinations

---
 normalizer.py | 73 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/normalizer.py b/normalizer.py
index fbd7d89..0e3e1e5 100644
--- a/normalizer.py
+++ b/normalizer.py
@@ -307,24 +307,63 @@ class Normalizer():
         L = wrds.__len__()
         if L < 3:
             return doc_string
-        cnt = 1
-        cnt2 = 0
-        for i in range(0, L - 2):
+        
+        # NOTE: Problem with the code below:
+        # NOTE: This code also suffers from overlapping combinations: a single word can be reused in three combinations (once as the last word, once as the middle word, and once as the first word of a combination).
+
+        # cnt = 1
+        # cnt2 = 0
+        # for i in range(0, L - 2):
+        #     w = wrds[i] + wrds[i + 1] + wrds[i + 2]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic3[w]
+        #         cnt = 0
+        #         cnt2 = 2
+        #     except KeyError:
+        #         if cnt == 1 and cnt2 == 0:
+        #             out_sentences = out_sentences + ' ' + wrds[i]
+        #         else:
+        #             cnt2 -= 1
+        #         cnt = 1
+        # if cnt == 1 and cnt2 == 0:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
+        # elif cnt == 1 and cnt2 == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 2]
+
+        # NOTE: Alternative implementation
+
+        # cnt = 0
+        # for i in range(0, L - 2):
+        #     if cnt > 0:
+        #         cnt -= 1
+        #         continue
+        #     w = wrds[i] + wrds[i + 1] + wrds[i + 2]
+        #     try:
+        #         out_sentences = out_sentences + ' ' + self.dic3[w]
+        #         cnt = 2
+        #     except KeyError:
+        #         out_sentences = out_sentences + ' ' + wrds[i]
+        # if cnt == 2:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
+        # elif cnt == 1:
+        #     out_sentences = out_sentences + ' ' + wrds[i + 2]
+
+        # return out_sentences
+
+        # NOTE: Another alternative implementation
+
+        i = 0
+        while i < L - 2:
             w = wrds[i] + wrds[i + 1] + wrds[i + 2]
-            try:
-                out_sentences = out_sentences + ' ' + self.dic3[w]
-                cnt = 0
-                cnt2 = 2
-            except KeyError:
-                if cnt == 1 and cnt2 == 0:
-                    out_sentences = out_sentences + ' ' + wrds[i]
-                else:
-                    cnt2 -= 1
-                cnt = 1
-        if cnt == 1 and cnt2 == 0:
-            out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
-        elif cnt == 1 and cnt2 == 1:
-            out_sentences = out_sentences + ' ' + wrds[i + 2]
+            if w in self.dic3:
+                out_sentences += ' ' + self.dic3[w]
+                i += 3  
+            else:
+                out_sentences += ' ' + wrds[i]
+                i += 1
+        while i < L:
+            out_sentences += ' ' + wrds[i]
+            i += 1
         return out_sentences
 
     def normalize(self, doc_string, new_line_elimination=False, return_dates = False):