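"""Chunk law sections into ~512-token passages.

Reads section records from 'main_sections_15k.json' (each record is expected to
carry 'law_id', 'child_order', and 'content' fields, as inferred from the code
below), reassembles every law in section order, splits the text into chunks of
at most 512 whitespace tokens that preferably end on a sentence boundary, and
writes the chunks to 'processed_laws.json'.
"""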
import json
import re
import sys
from collections import defaultdict

# Load the raw section records.
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    sys.exit(1)
except json.JSONDecodeError:
    print("Error decoding JSON from file")
    sys.exit(1)

# Group the sections of each law together so a law can be chunked as a whole.
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)


def tokenize(text):
    """Split text into whitespace-delimited tokens."""
    return re.findall(r'\S+', text)


def split_into_chunks(tokens, chunk_size=512):
    """Split a token list into chunks of at most chunk_size tokens.

    A full-size chunk is trimmed back to the last token ending in '.' so that
    chunks preferably break on sentence boundaries; if no such token exists in
    the window, the full window is kept as a hard cut.
    """
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            # Trim trailing tokens until the chunk ends on a sentence boundary.
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            # No sentence boundary in this window: fall back to the full window.
            if len(chunk) == 0:
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks
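
# Illustrative example (not executed): with a small chunk_size, trailing tokens
# that do not end a sentence are deferred to the next chunk, e.g.
#   split_into_chunks(['One.', 'Two', 'words.', 'End'], chunk_size=2)
#   -> ['One.', 'Two words.', 'End']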

result = []

# Reassemble each law in section order, then chunk it.
for law_id, items in laws_dict.items():
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i+1}",
            "law_id": law_id,
            "content": chunk
        })
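
# Each entry in result now has the form (values illustrative):
#   {"id": "<law_id>_1", "law_id": "<law_id>", "content": "..."}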

# Write the chunked records to disk.
try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    sys.exit(1)

print("Processing complete. Results saved to 'processed_laws.json'.")