Find large sections which are sent to the window

2025-08-20 13:56:00 +03:30 · 2025-08-20 13:56:00 +03:30 · 3a23cb6e66
commit 3a23cb6e66
parent 0e856fd797
5 changed files with 154581 additions and 1445 deletions
--- a/data/empty_content_log.txt
+++ b/data/empty_content_log.txt
@ -1 +1,104 @@
-qs832374 >> No text is provided to clean
+qs214614 >> No text is provided to clean
 qs214630 >> No text is provided to clean
 qs999303 >> No text is provided to clean
 qs688916 >> No text is provided to clean
 qs213422 >> No text is provided to clean
 qs894830 >> No text is provided to clean
 qs625850 >> No text is provided to clean
 qs762340 >> No text is provided to clean
 qs1816282 >> No text is provided to clean
 qs668053 >> No text is provided to clean
 qs692739 >> No text is provided to clean
 qs218353 >> No text is provided to clean
 qs2065818 >> No text is provided to clean
 qs286455 >> No text is provided to clean
 qs573113 >> No text is provided to clean
 qs688971 >> No text is provided to clean
 qs1051386 >> No text is provided to clean
 qs1118015 >> No text is provided to clean
 qs776897 >> No text is provided to clean
 qs1115771 >> No text is provided to clean
 298683e9ae8dffe3_10901111 >> No text is provided to clean
 298683e9ae8dffe3_1090111 >> No text is provided to clean
 qs1102704 >> No text is provided to clean
 qs768498 >> No text is provided to clean
 qs988057 >> No text is provided to clean
 qs830281 >> No text is provided to clean
 qs953637 >> No text is provided to clean
 qs975560 >> No text is provided to clean
 qs214537 >> No text is provided to clean
 qs2516016 >> No text is provided to clean
 qs1115812 >> No text is provided to clean
 qs832536 >> No text is provided to clean
 qs3437676 >> No text is provided to clean
 qs1102796 >> No text is provided to clean
 qs799402 >> No text is provided to clean
 qs866692 >> No text is provided to clean
 qs761253 >> No text is provided to clean
 qs1044009 >> No text is provided to clean
 qs773634 >> No text is provided to clean
 qs1102868 >> No text is provided to clean
 qs1102870 >> No text is provided to clean
 qs1843826 >> No text is provided to clean
 qs1102735 >> No text is provided to clean
 qs693032 >> No text is provided to clean
 qs877648 >> No text is provided to clean
 qs987743 >> No text is provided to clean
 qs1128475 >> No text is provided to clean
 qs3349198 >> No text is provided to clean
 qs621263 >> No text is provided to clean
 qs218787 >> No text is provided to clean
 qs218788 >> No text is provided to clean
 qs2515800 >> No text is provided to clean
 qs218789 >> No text is provided to clean
 qs218791 >> No text is provided to clean
 qs218792 >> No text is provided to clean
 qs786853 >> No text is provided to clean
 qs516658 >> No text is provided to clean
 qs214636 >> No text is provided to clean
 qs1115772 >> No text is provided to clean
 qs552406 >> No text is provided to clean
 qs236126 >> No text is provided to clean
 qs286506 >> No text is provided to clean
 qs286511 >> No text is provided to clean
 qs94717 >> No text is provided to clean
 qs719521 >> No text is provided to clean
 qs1117997 >> No text is provided to clean
 qs286490 >> No text is provided to clean
 qs1988696 >> No text is provided to clean
 qs2516015 >> No text is provided to clean
 qs214546 >> No text is provided to clean
 qs507915 >> No text is provided to clean
 qs980157 >> No text is provided to clean
 qs2651573 >> No text is provided to clean
 qs621745 >> No text is provided to clean
 qs957628 >> No text is provided to clean
 qs725920 >> No text is provided to clean
 qs832412 >> No text is provided to clean
 qs1113187 >> No text is provided to clean
 qs621180 >> No text is provided to clean
 qs1092296 >> No text is provided to clean
 qs987744 >> No text is provided to clean
 qs990251 >> No text is provided to clean
 qs956440 >> No text is provided to clean
 qs998707 >> No text is provided to clean
 qs431837 >> No text is provided to clean
 qs1120063 >> No text is provided to clean
 qs2516020 >> No text is provided to clean
 qs214569 >> No text is provided to clean
 qs214577 >> No text is provided to clean
 qs680f6f37b77af_21 >> No text is provided to clean
 qs2128754 >> No text is provided to clean
 qs925414 >> No text is provided to clean
 774683546f75f603 >> No text is provided to clean
 298683e9ae8dffe3_109011 >> No text is provided to clean
 qs1113973 >> No text is provided to clean
 qs286498 >> No text is provided to clean
 qs286501 >> No text is provided to clean
 qs217778 >> No text is provided to clean
 qs761370 >> No text is provided to clean
 qs289962 >> No text is provided to clean
 qs236391 >> No text is provided to clean
 qs218594 >> No text is provided to clean
 qs236285 >> No text is provided to clean
 qs1113974 >> No text is provided to clean
--- a/data/large_sections.txt
+++ b/data/large_sections.txt
--- a/data/test_log_60e_hoosh_fp3.txt
+++ b/data/test_log_60e_hoosh_fp3.txt
--- a/sections_window_3.py
+++ b/sections_window_3.py
@ -147,10 +147,7 @@ qanon_title_list = []
 new_sections_dict = {}
 selectedids = []
 for index, item in enumerate(sections):
-    if index < 2900:
+
        continue
    if index > 10000:
        break
    id = item['id']
    source = item['source']
@ -216,9 +213,6 @@ for index, item in enumerate(sections):
 with open('./data/all_sections_classes_new_140405.json', 'w', encoding='utf-8') as output_file:
    json_data = json.dumps(new_sections_dict, indent=4, ensure_ascii=False)
    output_file.write(json_data)
 # with open('./data/all_sections_classes_tttttesttttt.json', 'w', encoding='utf-8') as output_file:
 #     json_data = json.dumps(new_sections_dict, indent=4, ensure_ascii=False)
 #     output_file.write(json_data)
 print(f'end: {datetime.datetime.now()}')
 print('finished!')
--- a/temp.py
+++ b/temp.py
@ -85,5 +85,29 @@ def classified_sections():
    return large_not_classified
 if __name__ == '__main__':
-    result = classified_sections()      
+    
-    print(len(result))
+    with open('./data/all_sections_classes_new_140405.json', 'r', encoding='utf-8') as _file:
        sections = json.load(_file)
    # region large sections which send to window
    faults = []
    for item in sections:
        itm = sections[item]
        try:
            best = itm['best-class']['score']
        except:
            continue
        if best > 1:
            print(best)
            faults.append((item,best))
    faults_text = ''
    for item in faults:
        faults_text += ''.join(item[0]) + '\n' 
    with open('./data/large_sections.txt', 'a+') as file:
            file.write(faults_text.strip())
    # endregion        
    # result = classified_sections()      
    # print(len(result))