ai_dataset/qavanin_content/law_sections_processor.py

import json
import re
from collections import defaultdict
# Load the flattened law-section records exported earlier in the pipeline.
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    exit()
except json.JSONDecodeError:
    print("Error decoding JSON from file: 'main_sections_15k.json'")
    exit()
# Group section records by their parent law so each law can be rebuilt as one text.
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)
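# For reference, each input record is assumed to look roughly like the sketch below;
# only the 'law_id', 'child_order', and 'content' keys are used, and the values shown
# here are hypothetical.
#
#   {"law_id": "12345", "child_order": "3", "content": "ماده ۳ - ..."}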
def tokenize(text):
    # Whitespace tokenization: every non-whitespace run is one token.
    return re.findall(r'\S+', text)
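# Illustrative only (hypothetical input):
#   tokenize("ماده 1. متن قانون") -> ['ماده', '1.', 'متن', 'قانون']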
def split_into_chunks(tokens, chunk_size=512):
    # Split a token list into chunks of at most chunk_size tokens, preferring to
    # end each full-sized chunk at a sentence boundary (a token ending in '.').
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            # Backtrack to the last token that ends a sentence.
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            # No sentence boundary found: fall back to a hard cut at chunk_size.
            if len(chunk) == 0:
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks
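# Worked example (hypothetical tokens, chunk_size reduced for readability):
#
#   split_into_chunks("a b. c d e. f".split(), chunk_size=4)
#   -> ['a b.', 'c d e.', 'f']
#
# The first window "a b. c d" is trimmed back to "a b." so it ends on a sentence
# boundary; the next window then starts at "c", and the short remainder "f" is
# emitted as-is.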
result = []
for law_id, items in laws_dict.items():
    # Restore the original order of the sections within each law before joining.
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i+1}",
            "law_id": law_id,
            "content": chunk
        })
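# Each output record pairs a chunk with its law and a running chunk index, e.g.
# (field values hypothetical):
#
#   {"id": "12345_1", "law_id": "12345", "content": "ماده ۱ - ..."}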
try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    exit()
print("Processing complete. Results saved to 'processed_laws.json'.")
# Earlier version, kept commented out for reference: it concatenates each law's
# sections in file order (no sorting by 'child_order') before chunking.
#
# import json
# import re
# from collections import defaultdict
#
# with open('sections_15k.json', 'r', encoding='utf-8') as file:
#     data = json.load(file)
#
# laws_dict = defaultdict(str)
# for item in data:
#     laws_dict[item['law_id']] += item['content'] + " "
#
# def tokenize(text):
#     tokens = re.findall(r'\S+', text)
#     return tokens
#
# def split_into_chunks(tokens, chunk_size=512):
#     chunks = []
#     i = 0
#     while i < len(tokens):
#         chunk = tokens[i:i + chunk_size]
#         if len(chunk) == chunk_size:
#             while len(chunk) > 0 and not chunk[-1].endswith('.'):
#                 chunk = chunk[:-1]
#             if len(chunk) == 0:
#                 chunk = tokens[i:i + chunk_size]
#         chunks.append(' '.join(chunk))
#         i += len(chunk)
#     return chunks
#
# result = []
# for law_id, content in laws_dict.items():
#     tokens = tokenize(content)
#     chunks = split_into_chunks(tokens)
#     for i, chunk in enumerate(chunks):
#         result.append({
#             "id": f"{law_id}_{i+1}",
#             "law_id": law_id,
#             "content": chunk
#         })
#
# with open('processed_laws.json', 'w', encoding='utf-8') as file:
#     json.dump(result, file, ensure_ascii=False, indent=4)
#
# print("Processing complete. Results saved to 'processed_laws.json'.")