Classifier/mean_multiclasses.py

"""
این فایل ، جیسون سکشن هایی که با کمک پنجره شناور، کلاسیفای شده اند را می خواند و بر اساس میانگین وزنی، چهار کلاس که بالاترین میانگین وزنی را داشته اند انتخاب و ذخیره می کند
"""
import pandas as pd
import json

# with open('./data/large_sections_classes.json', 'r', encoding='utf-8') as input_file:
#     large_sections = json.load(input_file)

# with open('./data/error_sections_classes.json', 'r', encoding='utf-8') as input_file:
#     error_sections = json.load(input_file)

# large_sections.extend(error_sections)

# with open('./data/final_large_sections_classes.json', 'w', encoding='utf-8') as output_file:
#     json_data = json.dumps(large_sections, indent=4, ensure_ascii=False)
#     output_file.write(json_data)

# exit()

# Read the classified section records; each record is later accessed through
# its 'id' and 'classes' keys.
with open('./data/final_large_sections_classes.json', encoding='utf-8') as input_file:
    sections = json.loads(input_file.read())

def mean_classes(input_classes, top_n=4):
    """Combine per-window predictions into the highest-scoring classes.

    All predictions from all windows are pooled and grouped by label. Each
    label is scored as ``sum(scores) * count``, so labels that were predicted
    more often are boosted. The best ``top_n`` labels are returned with their
    score rescaled to a percentage of the total score over *all* labels
    (including labels that did not make the cut).

    Args:
        input_classes: Iterable of iterables of dicts; every dict must have a
            ``'label'`` and a ``'score'`` key.
        top_n: Maximum number of classes to return. Defaults to 4.

    Returns:
        A list of at most ``top_n`` dicts of the form
        ``{'label': ..., 'score': ...}``, ordered from highest to lowest
        score. An empty list is returned when no predictions are supplied.

    Raises:
        KeyError: If a prediction dict lacks ``'label'`` or ``'score'``.
    """
    all_classes = [
        {'label': item['label'], 'score': item['score']}
        for cclass in input_classes
        for item in cclass
    ]
    # A frame built from no rows has no "label" column, so grouping would
    # fail with KeyError; there is nothing to rank anyway.
    if not all_classes:
        return []

    classes_df = pd.DataFrame(all_classes)
    # Per label: the sum of its scores and how many times it occurred.
    grouped_df = classes_df.groupby("label").agg(
        total_value=("score", "sum"),
        count=("score", "count"),
    ).reset_index()
    # The occurrence count acts as the weight applied to the summed score.
    grouped_df["score"] = grouped_df["total_value"] * grouped_df["count"]

    sorted_df = grouped_df[["label", "score"]].sort_values(
        by="score", ascending=False
    )
    # Loop-invariant normaliser: total weighted score across every label.
    total_score = sorted_df["score"].sum()

    top_classes = sorted_df.head(top_n).to_dict(orient="records")
    for item in top_classes:
        # Express the weighted score as a percentage of the total.
        item['score'] = (item['score'] * 100) / total_score

    return top_classes

# Rank the classes of every section and collect one result record per section.
sections_classes_list = []
total_sections = len(sections)
for index, item in enumerate(sections):
    section_id = item['id']
    try:
        classes = item['classes']
    except KeyError:
        # A record without a 'classes' key is reported and skipped.
        print(f'section: {total_sections}/{index+1}/{section_id} --error')
        continue
    print(f'section: {total_sections}/{index+1}/{section_id}')

    final_classes = mean_classes(classes)
    if not final_classes:
        # Nothing was ranked, so there is no best class to index below.
        print(f'section: {total_sections}/{index+1}/{section_id} --error')
        continue

    sections_classes_list.append({
        'id': section_id,
        'best-class': final_classes[0],
        'other-classes': final_classes[1:],
    })

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open('./data/all_large_sections_final_2.json', 'w', encoding='utf-8') as output_file:
    json.dump(sections_classes_list, output_file, indent=4, ensure_ascii=False)

print('finished!')