system-prompts-and-models-o.../dealix/scripts/run_evals.py

#!/usr/bin/env python3
"""Deterministic JSON-shape evals against in-process FastAPI routes (no LLM keys required).

Usage:
  python scripts/run_evals.py
  python scripts/run_evals.py --suite personal_operator
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
from pathlib import Path

# Repository root: the parent of the directory that contains this script.
# It is placed at the front of sys.path (once) so that the `api` package
# imported further down resolves when this file is executed directly.
_REPO = Path(__file__).resolve().parents[1]
if str(_REPO) not in sys.path:
    sys.path.insert(0, str(_REPO))

import httpx  # noqa: E402

from api.main import create_app  # noqa: E402

# Suites that are run when no --suite flag is supplied on the command line.
_DEFAULT_SUITES = ("personal_operator", "revenue_os")


def _load_cases(name: str) -> list[dict]:
    """Read ``evals/<name>_cases.jsonl`` under the repo root.

    Each non-blank line of the file is parsed as one JSON document;
    whitespace-only lines are skipped.

    Args:
        name: Suite name used to build the case file name.

    Returns:
        The parsed documents, in file order.
    """
    source = _REPO / "evals" / f"{name}_cases.jsonl"
    with source.open(encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped if text]


def _check_case(case_id: str, data: dict, rules: dict) -> list[str]:
    errs: list[str] = []
    for key in rules.get("expect_keys") or []:
        if key not in data:
            errs.append(f"{case_id}: missing key {key!r}")
    blob = json.dumps(data, ensure_ascii=False)
    for sub in rules.get("forbid_substrings") or []:
        if sub in blob:
            errs.append(f"{case_id}: forbidden substring {sub!r} in payload")
    score = data.get("overall_score")
    if isinstance(score, int):
        if "max_overall_score" in rules and score > rules["max_overall_score"]:
            errs.append(f"{case_id}: overall_score {score} too high")
        if "min_overall_score" in rules and score < rules["min_overall_score"]:
            errs.append(f"{case_id}: overall_score {score} too low")
    tiers = data.get("tiers")
    if isinstance(tiers, list) and "min_tiers" in rules:
        if len(tiers) < int(rules["min_tiers"]):
            errs.append(f"{case_id}: tiers length {len(tiers)} < min")
    return errs


# Suite name -> list of (GET route, case id) pairs. Each case id is looked up
# in the suite's JSONL case file and its rules are applied to the JSON body
# returned by the paired route. The keys also serve as the allowed values of
# the --suite command-line flag.
PATH_BY_SUITE = {
    "personal_operator": [
        ("/api/v1/personal-operator/daily-brief", "po_daily_brief_has_greeting"),
        ("/api/v1/personal-operator/launch-report", "po_launch_report_has_score"),
    ],
    "revenue_os": [
        ("/api/v1/v3/command-center/snapshot", "v3_command_snapshot_shape"),
        ("/api/v1/business/pricing", "business_pricing_tiers"),
    ],
}


async def _run_suite(client: httpx.AsyncClient, suite: str) -> list[str]:
    """Run every (route, case) pair registered for ``suite``.

    For each pair the route is fetched with GET and the decoded JSON body is
    checked against the case definition with the matching id. A missing case
    definition, a non-200 status, or an undecodable body is recorded as an
    error and the remaining pairs are still processed.

    Args:
        client: Async HTTP client used to issue the GET requests.
        suite: Suite name; selects both the case file and the route table.

    Returns:
        All error strings collected for the suite (empty on success).
    """
    definitions = {row["id"]: row for row in _load_cases(suite)}
    failures: list[str] = []
    for route, case_id in PATH_BY_SUITE[suite]:
        rules = definitions.get(case_id)
        if not rules:
            failures.append(f"missing case definition {case_id}")
        else:
            response = await client.get(route)
            if response.status_code == 200:
                try:
                    payload = response.json()
                except json.JSONDecodeError:
                    failures.append(f"{case_id}: invalid JSON from {route}")
                else:
                    failures.extend(_check_case(case_id, payload, rules))
            else:
                failures.append(
                    f"{case_id}: HTTP {response.status_code} for {route}"
                )
    return failures


async def main_async(args: argparse.Namespace) -> int:
    """Run the selected suites against an in-process app and report results.

    The app returned by ``create_app()`` is served through an ASGI transport,
    so requests are handled in-process. Prints ``EVAL_FAIL`` followed by one
    error per line when anything failed, otherwise an ``EVAL_OK`` line
    listing the suites that ran.

    Args:
        args: Parsed CLI arguments; ``args.suite`` is a list of suite names,
            or falsy to run the default suites.

    Returns:
        Process exit code: 0 when every check passed, 1 otherwise.
    """
    selected = list(_DEFAULT_SUITES) if not args.suite else args.suite
    asgi = httpx.ASGITransport(app=create_app())
    failures: list[str] = []
    async with httpx.AsyncClient(transport=asgi, base_url="http://eval") as client:
        for suite_name in selected:
            failures += await _run_suite(client, suite_name)
    if not failures:
        print("EVAL_OK suites=", ",".join(selected))
        return 0
    print("EVAL_FAIL", *failures, sep="\n")
    return 1


def main() -> int:
    """Parse command-line flags and run the evals.

    ``--suite`` may be given several times and is restricted to the names
    registered in ``PATH_BY_SUITE``.

    Returns:
        The exit code produced by ``main_async``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--suite",
        action="append",
        choices=list(PATH_BY_SUITE),
        help="Repeatable; default runs both suites",
    )
    parsed = parser.parse_args()
    exit_code = asyncio.run(main_async(parsed))
    return exit_code


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())