ai_dataset/qavanin_content/law_sections_processor.py

import json
import re
from collections import defaultdict
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    exit()
except json.JSONDecodeError:
    print("Error decoding JSON from file")
    exit()
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)
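# Illustrative grouping (hypothetical records): two items that share
# law_id 'A' both land in laws_dict['A'], in input order, e.g.
#   laws_dict['A'] == [{'law_id': 'A', 'child_order': '2', ...},
#                      {'law_id': 'A', 'child_order': '1', ...}]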
def tokenize(text):
    tokens = re.findall(r'\S+', text)
    return tokens
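# Whitespace tokenization keeps punctuation attached to each word, which the
# chunker below relies on. Illustrative example (assumed input):
#   tokenize('ماده 1. متن قانون.') -> ['ماده', '1.', 'متن', 'قانون.']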
def split_into_chunks(tokens, chunk_size=512):
    """Split tokens into chunks of at most chunk_size tokens, preferring to
    end each chunk at a sentence boundary (a token ending in '.')."""
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            # Backtrack to the last sentence-ending token so a chunk does
            # not cut a sentence in half.
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            # If the window contains no sentence boundary at all, fall back
            # to a hard cut at chunk_size to guarantee progress.
            if len(chunk) == 0:
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks
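# Worked example of the backtracking behaviour (toy tokens, chunk_size=5):
#   split_into_chunks(['a', 'b.', 'c', 'd', 'e', 'f.'], chunk_size=5)
# The first window ['a', 'b.', 'c', 'd', 'e'] is trimmed back to the last
# sentence-ending token, yielding ['a b.', 'c d e f.'].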
result = []
for law_id, items in laws_dict.items():
    # Restore document order before joining a law's sections into one text.
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i+1}",
            "law_id": law_id,
            "content": chunk
        })
try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    exit()
print("Processing complete. Results saved to 'processed_laws.json'.")
# Earlier variant of this script, kept commented out for reference. It reads
# from 'sections_15k.json' and concatenates each law's sections in input
# order, without sorting by child_order.
# import json
# import re
# from collections import defaultdict
# with open('sections_15k.json', 'r', encoding='utf-8') as file:
#     data = json.load(file)
# laws_dict = defaultdict(str)
# for item in data:
#     laws_dict[item['law_id']] += item['content'] + " "
# def tokenize(text):
#     tokens = re.findall(r'\S+', text)
#     return tokens
# def split_into_chunks(tokens, chunk_size=512):
#     chunks = []
#     i = 0
#     while i < len(tokens):
#         chunk = tokens[i:i + chunk_size]
#         if len(chunk) == chunk_size:
#             while len(chunk) > 0 and not chunk[-1].endswith('.'):
#                 chunk = chunk[:-1]
#             if len(chunk) == 0:
#                 chunk = tokens[i:i + chunk_size]
#         chunks.append(' '.join(chunk))
#         i += len(chunk)
#     return chunks
# result = []
# for law_id, content in laws_dict.items():
#     tokens = tokenize(content)
#     chunks = split_into_chunks(tokens)
#     for i, chunk in enumerate(chunks):
#         result.append({
#             "id": f"{law_id}_{i+1}",
#             "law_id": law_id,
#             "content": chunk
#         })
# with open('processed_laws.json', 'w', encoding='utf-8') as file:
#     json.dump(result, file, ensure_ascii=False, indent=4)
# print("Processing complete. Results saved to 'processed_laws.json'.")