Upload files to "qavanin_content"
This commit is contained in:
parent
d8d863eebe
commit
1e991a9c0f
113 qavanin_content/law_sections_processor.py Normal file
@@ -0,0 +1,113 @@
import json
import re
import sys
from collections import defaultdict

# Load the scraped law sections; each record is expected to provide
# 'law_id', 'child_order', and 'content' fields.
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    sys.exit(1)
except json.JSONDecodeError:
    print("Error decoding JSON from file")
    sys.exit(1)

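# Example of a single input record (hypothetical shape, inferred from the
# fields this script reads; the real file may carry extra keys):
#   {"law_id": "12345", "child_order": "3", "content": "..."}
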
# Group the sections belonging to each law.
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)

def tokenize(text):
    # Whitespace tokenization: each run of non-whitespace characters is a token.
    return re.findall(r'\S+', text)

def split_into_chunks(tokens, chunk_size=512):
    # Slice the token stream into chunks of at most `chunk_size` tokens.
    # A full-size chunk is trimmed back to the last token ending in '.',
    # so chunks break on sentence boundaries where possible; if no such
    # token exists in the window, fall back to a hard cut at `chunk_size`.
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            if len(chunk) == 0:
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks

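# A quick illustration of the chunking behavior on a toy input
# (hypothetical example, not part of the pipeline):
#   split_into_chunks(tokenize("one two. three four five."), chunk_size=3)
#   -> ['one two.', 'three four five.']
# The first full window ['one', 'two.', 'three'] is trimmed back to end
# at 'two.', and the next chunk resumes from 'three'.
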
result = []

for law_id, items in laws_dict.items():
    # Restore the original document order of the sections before joining.
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i+1}",
            "law_id": law_id,
            "content": chunk
        })

try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    sys.exit(1)

print("Processing complete. Results saved to 'processed_laws.json'.")

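# Usage sketch (assumes the input JSON sits in the working directory):
#   python law_sections_processor.py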