Upload files to "qavanin_content"
This commit is contained in:
parent
d8d863eebe
commit
1e991a9c0f
113 qavanin_content/law_sections_processor.py Normal file
@@ -0,0 +1,113 @@
import json
import re
import sys
from collections import defaultdict

# Load the scraped law sections; each record is expected to provide
# 'law_id', 'child_order', and 'content' fields.
try:
    with open('main_sections_15k.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except FileNotFoundError:
    print("File not found: 'main_sections_15k.json'")
    sys.exit(1)
except json.JSONDecodeError:
    print("Error decoding JSON from file")
    sys.exit(1)

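# Example of a single input record (hypothetical shape, inferred from the
# fields this script reads; the real file may carry extra keys):
#   {"law_id": "12345", "child_order": "3", "content": "..."}
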
# Group the sections belonging to each law.
laws_dict = defaultdict(list)
for item in data:
    laws_dict[item['law_id']].append(item)

def tokenize(text):
    # Whitespace tokenization: each run of non-whitespace characters is a token.
    return re.findall(r'\S+', text)

def split_into_chunks(tokens, chunk_size=512):
    # Slice the token stream into chunks of at most `chunk_size` tokens.
    # A full-size chunk is trimmed back to the last token ending in '.',
    # so chunks break on sentence boundaries where possible; if no such
    # token exists in the window, fall back to a hard cut at `chunk_size`.
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i + chunk_size]
        if len(chunk) == chunk_size:
            while len(chunk) > 0 and not chunk[-1].endswith('.'):
                chunk = chunk[:-1]
            if len(chunk) == 0:
                chunk = tokens[i:i + chunk_size]
        chunks.append(' '.join(chunk))
        i += len(chunk)
    return chunks

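# A quick illustration of the chunking behavior on a toy input
# (hypothetical example, not part of the pipeline):
#   split_into_chunks(tokenize("one two. three four five."), chunk_size=3)
#   -> ['one two.', 'three four five.']
# The first full window ['one', 'two.', 'three'] is trimmed back to end
# at 'two.', and the next chunk resumes from 'three'.
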
result = []

for law_id, items in laws_dict.items():
    # Restore the original document order of the sections before joining.
    items_sorted = sorted(items, key=lambda x: int(x['child_order']))
    content = ' '.join(item['content'] for item in items_sorted)
    tokens = tokenize(content)
    chunks = split_into_chunks(tokens)
    for i, chunk in enumerate(chunks):
        result.append({
            "id": f"{law_id}_{i+1}",
            "law_id": law_id,
            "content": chunk
        })

try:
    with open('processed_laws.json', 'w', encoding='utf-8') as file:
        json.dump(result, file, ensure_ascii=False, indent=4)
except IOError:
    print("Error writing to file: 'processed_laws.json'")
    sys.exit(1)

print("Processing complete. Results saved to 'processed_laws.json'.")

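# Usage sketch (assumes the input JSON sits in the working directory):
#   python law_sections_processor.py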