ai_dataset/qavanin_content/law_sections_processor.py

import json
import re
from collections import defaultdict
# Load the flattened law-section records exported earlier in the pipeline.
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    exit()
except json.JSONDecodeError:
    print("Error decoding JSON from file: 'main_sections_15k.json'")
    exit()
# Group section records by their parent law so each law can be rebuilt as one text.
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)
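# For reference, each input record is assumed to look roughly like the sketch below;
# only the 'law_id', 'child_order', and 'content' keys are used, and the values shown
# here are hypothetical.
#
#   {"law_id": "12345", "child_order": "3", "content": "ماده ۳ - ..."}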
def tokenize(text):
    # Whitespace tokenization: every non-whitespace run is one token.
    return re.findall(r'\S+', text)
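# Illustrative only (hypothetical input):
#   tokenize("ماده 1. متن قانون") -> ['ماده', '1.', 'متن', 'قانون']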
def split_into_chunks(tokens, chunk_size=512):
    # Split a token list into chunks of at most chunk_size tokens, preferring to
    # end each full-sized chunk at a sentence boundary (a token ending in '.').
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            # Backtrack to the last token that ends a sentence.
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            # No sentence boundary found: fall back to a hard cut at chunk_size.
            if len(chunk) == 0:
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks
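# Worked example (hypothetical tokens, chunk_size reduced for readability):
#
#   split_into_chunks("a b. c d e. f".split(), chunk_size=4)
#   -> ['a b.', 'c d e.', 'f']
#
# The first window "a b. c d" is trimmed back to "a b." so it ends on a sentence
# boundary; the next window then starts at "c", and the short remainder "f" is
# emitted as-is.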
result = []
for law_id, items in laws_dict.items():
    # Restore the original order of the sections within each law before joining.
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i+1}",
            "law_id": law_id,
            "content": chunk
        })
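# Each output record pairs a chunk with its law and a running chunk index, e.g.
# (field values hypothetical):
#
#   {"id": "12345_1", "law_id": "12345", "content": "ماده ۱ - ..."}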
try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    exit()
print("Processing complete. Results saved to 'processed_laws.json'.")
# Earlier version, kept commented out for reference: it concatenates each law's
# sections in file order (no sorting by 'child_order') before chunking.
#
# import json
# import re
# from collections import defaultdict
#
# with open('sections_15k.json', 'r', encoding='utf-8') as file:
#     data = json.load(file)
#
# laws_dict = defaultdict(str)
# for item in data:
#     laws_dict[item['law_id']] += item['content'] + " "
#
# def tokenize(text):
#     tokens = re.findall(r'\S+', text)
#     return tokens
#
# def split_into_chunks(tokens, chunk_size=512):
#     chunks = []
#     i = 0
#     while i < len(tokens):
#         chunk = tokens[i:i + chunk_size]
#         if len(chunk) == chunk_size:
#             while len(chunk) > 0 and not chunk[-1].endswith('.'):
#                 chunk = chunk[:-1]
#             if len(chunk) == 0:
#                 chunk = tokens[i:i + chunk_size]
#         chunks.append(' '.join(chunk))
#         i += len(chunk)
#     return chunks
#
# result = []
# for law_id, content in laws_dict.items():
#     tokens = tokenize(content)
#     chunks = split_into_chunks(tokens)
#     for i, chunk in enumerate(chunks):
#         result.append({
#             "id": f"{law_id}_{i+1}",
#             "law_id": law_id,
#             "content": chunk
#         })
#
# with open('processed_laws.json', 'w', encoding='utf-8') as file:
#     json.dump(result, file, ensure_ascii=False, indent=4)
#
# print("Processing complete. Results saved to 'processed_laws.json'.")