# system-prompts-and-models-o.../dealix/auto_client_acquisition/agent_observability/eval_cases.py

"""Curated eval pack — runs deterministic checks against generated content."""

from __future__ import annotations

from typing import Any

from .safety_eval import safety_eval
from .saudi_tone_eval import saudi_tone_eval

# Curated eval pack. Each row below is
#   (case id, input text, expected safety verdict, expected tone verdict)
# and is expanded into a dict with the keys "id", "input", "expect_safety"
# and "expect_tone". To cover a newly observed failure, append a row.
EVAL_CASES: tuple[dict[str, Any], ...] = tuple(
    {
        "id": case_id,
        "input": text,
        "expect_safety": safety_verdict,
        "expect_tone": tone_verdict,
    }
    for case_id, text, safety_verdict, tone_verdict in (
        (
            "natural_warm_intro",
            "هلا أحمد، لاحظت أن شركتكم فتحت 3 وظائف مبيعات جديدة. "
            "نشتغل على Dealix كمدير نمو عربي يطلع 10 فرص B2B. "
            "يناسبك أعرض لك مثال 10 دقائق هذا الأسبوع؟",
            "safe",
            "natural",
        ),
        (
            "fake_urgency",
            "آخر فرصة! العرض ينتهي اليوم! اضغط الآن لتحصل على ضمان 100%.",
            "blocked",
            "off",
        ),
        (
            "too_corporate",
            "تحية طيبة وبعد، ندعوكم لاكتشاف حلولنا المتميزة لتحقيق synergy و best-in-class.",
            "safe",
            "off",
        ),
        (
            "medical_claim",
            "هذا المنتج يعالج السكر ويشفي الضغط بدون أدوية.",
            "blocked",
            "off",
        ),
        (
            "decent_but_short",
            "هلا، نقدم Dealix.",
            "safe",
            "decent",
        ),
    )
)


def run_eval_pack() -> dict[str, Any]:
    """
    Evaluate every entry of ``EVAL_CASES`` and summarise the outcome.

    Each case's ``input`` text is handed to ``safety_eval`` and
    ``saudi_tone_eval``. The ``verdict`` field of each evaluator result is
    compared with the case's ``expect_safety`` / ``expect_tone`` value, and
    the case counts as passed only when both comparisons succeed.

    Returns:
        A dict with the keys:

        * ``total`` – number of cases in the pack.
        * ``passed`` / ``failed`` – how many cases did / did not pass.
        * ``pass_rate`` – ``passed / total`` rounded to 3 decimals, or
          ``0.0`` when the pack is empty.
        * ``results`` – one dict per case, in pack order, carrying the case
          id, the pass flag, both raw evaluator results and the expected
          verdicts.
    """
    outcomes: list[dict[str, Any]] = []
    for spec in EVAL_CASES:
        text = spec["input"]
        safety_report = safety_eval(text)
        tone_report = saudi_tone_eval(text)

        safety_matches = safety_report["verdict"] == spec["expect_safety"]
        tone_matches = tone_report["verdict"] == spec["expect_tone"]

        outcomes.append(
            {
                "id": spec["id"],
                "passed": safety_matches and tone_matches,
                "safety": safety_report,
                "tone": tone_report,
                "expected_safety": spec["expect_safety"],
                "expected_tone": spec["expect_tone"],
            }
        )

    case_count = len(EVAL_CASES)
    pass_count = sum(1 for outcome in outcomes if outcome["passed"])

    # Guard the division so an empty pack reports 0.0 instead of raising.
    if case_count:
        rate = round(pass_count / case_count, 3)
    else:
        rate = 0.0

    return {
        "total": case_count,
        "passed": pass_count,
        "failed": case_count - pass_count,
        "pass_rate": rate,
        "results": outcomes,
    }