add files

parent 393aace89c
commit 0c822ffa61
5114  data/errors.txt  (new file)
File diff suppressed because it is too large

65718  data/recent_sections.json  (new file)
File diff suppressed because one or more lines are too long

6  do_nlp_processes.py  (new file)
@@ -0,0 +1,6 @@
+import p1_classifier
+import p2_keyword_extractor
+import p3_ner_recognizer
+import p4_simplifier
+import p5_words_embedder
+
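do_nlp_processes.py only imports the five pipeline stages for now; the p1-p5 modules themselves are added empty at the bottom of this commit. A minimal sketch of how the orchestrator might eventually chain them, assuming each stage module grows a run(sections) entry point (hypothetical; nothing in this commit defines it):

    # Hypothetical sketch: the p1-p5 modules are empty in this commit,
    # so the run() entry point assumed below does not exist yet.
    import p1_classifier
    import p2_keyword_extractor
    import p3_ner_recognizer
    import p4_simplifier
    import p5_words_embedder

    def process(sections):
        # Feed the output of each NLP stage into the next one.
        for stage in (p1_classifier, p2_keyword_extractor,
                      p3_ner_recognizer, p4_simplifier, p5_words_embedder):
            sections = stage.run(sections)
        return sections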
@@ -10,23 +10,40 @@ sections = eh_obj.iterateJsonFile(path, True)
 update_time = datetime.datetime(1403,10,5)

 def get_data_from_date(date):
+    errors = []
     recent_sections = {}
+    counter = 1
     for i, item in enumerate(sections):
         id = item['id']
         source = item['source']
         ts_date = source['ts_date']
-        ts_date_standard = datetime.datetime(ts_date.split('/')[0],ts_date.split('/')[1],ts_date.split('/')[2])
+        try:
+            ts_date_standard = datetime.datetime(int(ts_date.split('/')[0]),int(ts_date.split('/')[1]),int(ts_date.split('/')[2]))
+        except:
+            # errors+= f'{ts_date} - {id}\n'
+            errors.append(f'{ts_date} - {id}')
+            continue

         if ts_date_standard>date:
             recent_sections[id] = source
+            counter+=1
+            print(ts_date)

+    errors.sort()
+    errors_text = ''
+    for item in errors:
+        errors_text += item +'\n'
+    with open('./data/errors.txt', 'w', encoding='utf-8') as file:
+        file.write(errors_text)
+
+    print(f'new sections count: {counter}')
     return recent_sections

 if __name__ == '__main__':
     recent_sections = get_data_from_date(update_time)

     with open('./data/recent_sections.json', 'w', encoding='utf-8') as file:
-        data = json.dump(recent_sections)
+        data = json.dumps(recent_sections, ensure_ascii=False, indent=4)
         file.write(data)

     print('finished!')
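Two fixes in this hunk are worth noting. datetime.datetime requires integers, so the old line that passed the string pieces of ts_date raised a TypeError on every record; the new code converts with int() and collects unparseable dates into data/errors.txt instead of aborting. (The year 1403 suggests these are Solar Hijri dates; datetime stores them as if they were Gregorian, which still compares correctly as long as every date is in the same calendar.) Likewise, json.dump serializes into a file object and requires one as its second argument, so data = json.dump(recent_sections) raised a TypeError; json.dumps returns the string that file.write needs, and ensure_ascii=False keeps any non-Latin text readable in the output instead of \u-escaped. A standalone sketch of the same parsing pattern with a narrower except clause than the bare except used above (names here are illustrative, not from the commit):

    import datetime

    def parse_slash_date(ts_date):
        # Parse a 'YYYY/MM/DD' string; return None for malformed input
        # instead of letting a bare except hide unrelated errors.
        try:
            year, month, day = (int(part) for part in ts_date.split('/'))
            return datetime.datetime(year, month, day)
        except ValueError:
            # Raised for non-numeric pieces, a wrong number of pieces,
            # or an out-of-range month/day.
            return None

    assert parse_slash_date('1403/10/05') == datetime.datetime(1403, 10, 5)
    assert parse_slash_date('1403/xx/05') is None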
0  p1_classifier.py  (new file)
0  p2_keyword_extractor.py  (new file)
0  p3_ner_recognizer.py  (new file)
0  p4_simplifier.py  (new file)
0  p5_words_embedder.py  (new file)