From 1e991a9c0ff9b1889ab542616a0d38a4a372b398 Mon Sep 17 00:00:00 2001
From: ajokar
Date: Tue, 17 Sep 2024 16:48:45 +0000
Subject: [PATCH] Upload files to "qavanin_content"

---
 qavanin_content/law_sections_processor.py | 72 ++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 qavanin_content/law_sections_processor.py

diff --git a/qavanin_content/law_sections_processor.py b/qavanin_content/law_sections_processor.py
new file mode 100644
index 0000000..6e7df45
--- /dev/null
+++ b/qavanin_content/law_sections_processor.py
@@ -0,0 +1,72 @@
+import json
+import re
+from collections import defaultdict
+
+# Load the scraped law sections; each record is expected to carry a
+# 'law_id', a 'child_order', and the section text under 'content'.
+try:
+    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
+        data = json.load(file)
+except FileNotFoundError:
+    print("File not found: 'main_sections_15k.json'")
+    exit()
+except json.JSONDecodeError:
+    print("Error decoding JSON from file")
+    exit()
+
+# Group the sections by their parent law.
+laws_dict = defaultdict(list)
+for item in data:
+    laws_dict[item['law_id']].append(item)
+
+
+def tokenize(text):
+    """Split text into whitespace-delimited tokens."""
+    return re.findall(r'\S+', text)
+
+
+def split_into_chunks(tokens, chunk_size=512):
+    """Join tokens into chunks of at most chunk_size tokens.
+
+    A full-sized chunk is trimmed back to its last token ending in '.',
+    so chunks break on sentence boundaries where possible. If no such
+    token exists, the untrimmed chunk is kept to guarantee progress.
+    """
+    chunks = []
+    i = 0
+    while i < len(tokens):
+        chunk = tokens[i:i + chunk_size]
+        if len(chunk) == chunk_size:
+            while len(chunk) > 0 and not chunk[-1].endswith('.'):
+                chunk = chunk[:-1]
+            if len(chunk) == 0:
+                chunk = tokens[i:i + chunk_size]
+        chunks.append(' '.join(chunk))
+        i += len(chunk)
+    return chunks
+
+
+result = []
+
+for law_id, items in laws_dict.items():
+    # Restore the original section order before concatenating.
+    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
+    content = ' '.join(item['content'] for item in items_sorted)
+    tokens = tokenize(content)
+    chunks = split_into_chunks(tokens)
+
+    for i, chunk in enumerate(chunks):
+        result.append({
+            "id": f"{law_id}_{i+1}",
+            "law_id": law_id,
+            "content": chunk
+        })
+
+try:
+    with open('processed_laws.json', 'w', encoding='utf-8') as file:
+        json.dump(result, file, ensure_ascii=False, indent=4)
+except IOError:
+    print("Error writing to file: 'processed_laws.json'")
+    exit()
+
+print("Processing complete. Results saved to 'processed_laws.json'.")
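
For reference, a minimal sketch of the data shapes involved; the field names
('law_id', 'child_order', 'content') come from the code above, while the
sample values are invented for illustration. An input record in
main_sections_15k.json would look like:

    {"law_id": "139", "child_order": "1", "content": "Article 1. ..."}

and, since a short law fits inside a single 512-token chunk, the
corresponding processed_laws.json entry would be:

    {"id": "139_1", "law_id": "139", "content": "Article 1. ... Article 2. ..."}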
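
The sentence-boundary trimming in split_into_chunks can be sanity-checked in
isolation with a small chunk_size (a standalone sketch, not part of the patch):

    tokens = tokenize('one two three. four five six seven.')
    print(split_into_chunks(tokens, chunk_size=5))
    # ['one two three.', 'four five six seven.']

The first chunk is cut back from five tokens to three so it ends on 'three.';
the remaining four tokens are under the limit and pass through untrimmed.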