# nahj_rag/query_analyzer.py

# from transformers import AutoTokenizer

# -------------------------------------------------------------
#                    SYSTEM PROMPT
# -------------------------------------------------------------
# System instruction for the query-analysis step. build_messages() sends it as
# the "system" message. It asks the model to split a user query into an
# optional greeting and an optional question, to decompose the question into
# sub-questions, and to reply with nothing but a JSON object holding the keys
# has_greeting, greeting_text, has_question, original_question, sub_questions.
# NOTE: the text is consumed verbatim at runtime; any edit changes the prompt.
SYSTEM_PROMPT = """
You are a Query Analyzer AI. Your job is to analyze any user query and classify it into structured components.
Always produce your output EXACTLY in the JSON structure specified below.

### HOW TO PROCESS THE USER QUERY

1. Detect if the query contains **greeting / small talk**, such as "سلام"، "درود"، "خوبی؟"، "وقت بخیر"، "چطوری؟".
2. Detect whether the query contains **an actual question** besides greeting.

3. Structure your output as follows:

{
  "has_greeting": true | false,
  "greeting_text": "...",
  "has_question": true | false,
  "original_question": "...",
  "sub_questions": ["...", "...", ...]
}

### RULES

- If the user only greets and asks nothing →
  • has_question = false
  • sub_questions = []
- If the user greets AND asks →
  • extract greeting into greeting_text
  • put only the QUESTION part in original_question
- ALWAYS decompose the question into meaningful sub-questions
  for a multi-step RAG pipeline.

### Decomposition Requirements
- Sub-questions must be clear, independent, and non-overlapping.
- Each sub-question must be solvable by a single chunk of retrieved context.
- Keep them as **factual**, **atomic**, and **retrievable** as possible.

### Output Format Rules
- Output ONLY valid JSON. No explanations. No markdown. No filler text.
"""

# -------------------------------------------------------------
#                   FEW-SHOT EXAMPLES
# -------------------------------------------------------------
# Few-shot demonstrations for the query analyzer. Each entry is one
# self-contained text: a sample USER query followed, after "OUTPUT:", by the
# exact JSON object the model is expected to produce for it.
# Every OUTPUT payload must itself parse as JSON, since the system prompt
# demands JSON-only answers and the model imitates these samples closely.
EXAMPLES = [
    # Greeting only: no question, so sub_questions stays empty.
    """
### Example 1:
USER: سلام خوبی؟
OUTPUT:
{
  "has_greeting": true,
  "greeting_text": "سلام خوبی؟",
  "has_question": false,
  "original_question": "",
  "sub_questions": []
}
    """,
    # Greeting followed by a compound question: the greeting is split off and
    # the question is decomposed into atomic sub-questions.
"""
### Example 2:
USER: سلام خوبی؟ نقش مردم در اقتصاد مقاومتی چیست و کدام مسئولین و نهادها و چگونه می توانند به نقش آفرینی مردم در آن کمک کنند؟
OUTPUT:
{
  "has_greeting": true,
  "greeting_text": "سلام خوبی؟",
  "has_question": true,
  "original_question": "نقش مردم در اقتصاد مقاومتی چیست و کدام مسئولین و نهادها و چگونه می توانند به نقش آفرینی مردم در آن کمک کنند؟",
  "sub_questions": [
    "اقتصاد مقاومتی چه تعریفی دارد و چه ویژگی‌هایی دارد؟",
    "مردم در چه زمینه‌هایی می‌توانند در تحقق اقتصاد مقاومتی نقش‌آفرینی کنند؟",
    "چه رفتارهایی از سوی مردم با اصول اقتصاد مقاومتی هم‌خوانی دارد یا مغایر است؟",
    "کدام نهادها و سازمان‌های دولتی در زمینه اقتصاد مقاومتی مسئولیت مستقیم دارند؟",
    "نهادها و سازمان‌های دولتی چگونه می‌تواند مشارکت مردمی در اقتصاد مقاومتی را تقویت کنند؟",
    "چه سیاست‌ها یا مشوق‌هایی می‌تواند مردم را به مشارکت بیشتر در اقتصاد مقاومتی ترغیب کند؟"
  ]
}
""",
    # Question without any greeting.
"""
### Example 3:
USER: مردم‌سالاری دینی به چه معناست و دارای کدام مصادیق است؟
OUTPUT:
{
  "has_greeting": false,
  "greeting_text": "",
  "has_question": true,
  "original_question": "مردم‌سالاری دینی به چه معناست و دارای کدام مصادیق است؟",
  "sub_questions": [
    "مردم‌سالاری دینی چه تعریفی دارد؟",
    "چه تفاوت‌هایی میان مردم‌سالاری دینی و سایر انواع مردم‌سالاری وجود دارد؟",
    "مردم‌سالاری دینی چگونه در ساختار سیاسی جمهوری اسلامی ایران به رسمیت شناخته شده است؟",
    "چه نمونه‌هایی از مردم‌سالاری دینی را می‌توان در رفتار مردم مشاهده کرد؟"
  ]
}
"""
]

# -------------------------------------------------------------
#          MESSAGE BUILDER WITH AUTO TOKEN COMPRESSION
# -------------------------------------------------------------
def build_messages(user_query: str, max_tokens: int = 1024):
    """Assemble the chat message list for the query-analyzer model.

    The result is, in order: one "system" message carrying SYSTEM_PROMPT,
    one "user" message per entry of EXAMPLES (each entry already contains
    both the sample query and its expected output), and finally one "user"
    message with the actual query.

    Args:
        user_query: Raw text typed by the user; passed through unchanged.
        max_tokens: Intended token budget for the prompt. Currently unused:
            the tokenizer-based compression step is disabled, so the full
            set of examples is always included.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dictionaries.
    """
    # NOTE: a tokenizer-based fallback used to live here (drop the last
    # example first, then every example, once the prompt exceeded
    # ``max_tokens``). It is disabled together with the AutoTokenizer import,
    # so no trimming happens at the moment.
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    few_shot_turns = [{"role": "user", "content": shot} for shot in EXAMPLES]
    # The real query goes last so it is the message the model answers.
    query_turn = {"role": "user", "content": user_query}

    return [system_turn, *few_shot_turns, query_turn]