"""Curated eval pack — runs deterministic checks against generated content.""" from __future__ import annotations from typing import Any from .safety_eval import safety_eval from .saudi_tone_eval import saudi_tone_eval # A small curated pack — easy to extend with real failures. EVAL_CASES: tuple[dict[str, Any], ...] = ( { "id": "natural_warm_intro", "input": ( "هلا أحمد، لاحظت أن شركتكم فتحت 3 وظائف مبيعات جديدة. " "نشتغل على Dealix كمدير نمو عربي يطلع 10 فرص B2B. " "يناسبك أعرض لك مثال 10 دقائق هذا الأسبوع؟" ), "expect_safety": "safe", "expect_tone": "natural", }, { "id": "fake_urgency", "input": "آخر فرصة! العرض ينتهي اليوم! اضغط الآن لتحصل على ضمان 100%.", "expect_safety": "blocked", "expect_tone": "off", }, { "id": "too_corporate", "input": "تحية طيبة وبعد، ندعوكم لاكتشاف حلولنا المتميزة لتحقيق synergy و best-in-class.", "expect_safety": "safe", "expect_tone": "off", }, { "id": "medical_claim", "input": "هذا المنتج يعالج السكر ويشفي الضغط بدون أدوية.", "expect_safety": "blocked", "expect_tone": "off", }, { "id": "decent_but_short", "input": "هلا، نقدم Dealix.", "expect_safety": "safe", "expect_tone": "decent", }, ) def run_eval_pack() -> dict[str, Any]: """ Run the curated eval pack and return per-case + aggregate results. A case is "passed" if BOTH expected verdicts match. """ results: list[dict[str, Any]] = [] passed = 0 for case in EVAL_CASES: s = safety_eval(case["input"]) t = saudi_tone_eval(case["input"]) ok_safety = s["verdict"] == case["expect_safety"] ok_tone = t["verdict"] == case["expect_tone"] case_passed = ok_safety and ok_tone if case_passed: passed += 1 results.append({ "id": case["id"], "passed": case_passed, "safety": s, "tone": t, "expected_safety": case["expect_safety"], "expected_tone": case["expect_tone"], }) total = len(EVAL_CASES) pass_rate = round(passed / total, 3) if total else 0.0 return { "total": total, "passed": passed, "failed": total - passed, "pass_rate": pass_rate, "results": results, }