import json
import re
import sys
from collections import defaultdict

# Load the scraped law sections. Each record is expected to carry at least
# 'law_id', 'child_order', and 'content' fields.
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    sys.exit(1)
except json.JSONDecodeError:
    print("Error decoding JSON from file")
    sys.exit(1)

# Group the sections by their parent law.
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)


def tokenize(text):
    """Split text into whitespace-delimited tokens."""
    return re.findall(r'\S+', text)


def split_into_chunks(tokens, chunk_size=512):
    """Split a token list into chunks of at most chunk_size tokens.

    Each full-size window is trimmed back to the last token ending in '.',
    so chunks end on a sentence boundary whenever one exists inside the
    window. If the window contains no sentence boundary at all, the full
    window is kept as a hard cut to guarantee forward progress.
    """
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            # Backtrack to the nearest sentence-ending token.
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            if len(chunk) == 0:
                # No sentence boundary in this window; fall back to a hard cut.
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks


# Reassemble each law's full text in section order, then chunk it.
result = []
for law_id, items in laws_dict.items():
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i + 1}",
            "law_id": law_id,
            "content": chunk
        })

try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    sys.exit(1)

print("Processing complete. Results saved to 'processed_laws.json'.")
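
# A minimal sanity check for split_into_chunks, on hypothetical sample text
# rather than real dataset records, illustrating the sentence-boundary
# backtracking:
#
#   >>> tokens = tokenize("One two three. Four five six seven. Eight.")
#   >>> split_into_chunks(tokens, chunk_size=5)
#   ['One two three.', 'Four five six seven. Eight.']
#
# The first 5-token window would end mid-sentence at 'five', so it is trimmed
# back to 'three.'; the second window already ends on a '.' token and is kept
# whole.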