# system-prompts-and-models-o.../dealix/scripts/run_evals.py
# 2026-05-01 14:03:52 +03:00
#
# 126 lines
# 3.9 KiB
# Python

#!/usr/bin/env python3
"""Deterministic JSON-shape evals against in-process FastAPI routes (no LLM keys required).

Usage:
    python scripts/run_evals.py
    python scripts/run_evals.py --suite personal_operator
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
from pathlib import Path
_REPO = Path(__file__).resolve().parents[1]
if str(_REPO) not in sys.path:
sys.path.insert(0, str(_REPO))
import httpx # noqa: E402
from api.main import create_app # noqa: E402
# Suites that run when no --suite flag is given on the command line.
_DEFAULT_SUITES = ("personal_operator", "revenue_os")
def _load_cases(name: str) -> list[dict]:
    """Read the JSONL case file for suite *name*.

    The file is ``evals/<name>_cases.jsonl`` under the repository root.
    Blank (whitespace-only) lines are skipped; every other line is parsed
    as one JSON document, in file order.
    """
    cases_file = _REPO / "evals" / f"{name}_cases.jsonl"
    with cases_file.open(encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]
def _check_case(case_id: str, data: dict, rules: dict) -> list[str]:
errs: list[str] = []
for key in rules.get("expect_keys") or []:
if key not in data:
errs.append(f"{case_id}: missing key {key!r}")
blob = json.dumps(data, ensure_ascii=False)
for sub in rules.get("forbid_substrings") or []:
if sub in blob:
errs.append(f"{case_id}: forbidden substring {sub!r} in payload")
score = data.get("overall_score")
if isinstance(score, int):
if "max_overall_score" in rules and score > rules["max_overall_score"]:
errs.append(f"{case_id}: overall_score {score} too high")
if "min_overall_score" in rules and score < rules["min_overall_score"]:
errs.append(f"{case_id}: overall_score {score} too low")
tiers = data.get("tiers")
if isinstance(tiers, list) and "min_tiers" in rules:
if len(tiers) < int(rules["min_tiers"]):
errs.append(f"{case_id}: tiers length {len(tiers)} < min")
return errs
# Suite name -> list of (route path, case id) pairs. For each pair the runner
# issues a GET to the path and checks the JSON response against the case with
# that id from the suite's JSONL case file. The keys also serve as the allowed
# values of the --suite command-line flag.
PATH_BY_SUITE = {
"personal_operator": [
("/api/v1/personal-operator/daily-brief", "po_daily_brief_has_greeting"),
("/api/v1/personal-operator/launch-report", "po_launch_report_has_score"),
],
"revenue_os": [
("/api/v1/v3/command-center/snapshot", "v3_command_snapshot_shape"),
("/api/v1/business/pricing", "business_pricing_tiers"),
],
}
async def _run_suite(client: httpx.AsyncClient, suite: str) -> list[str]:
    """Run every (path, case id) pair registered for *suite*.

    Each path is fetched with GET through *client*. A missing case
    definition, a non-200 status, or a body that is not valid JSON is
    recorded as an error and the pair is skipped; otherwise the decoded body
    is checked against the case's rules.

    Returns:
        All error messages collected for the suite (empty on success).
    """
    definitions = {entry["id"]: entry for entry in _load_cases(suite)}
    failures: list[str] = []

    for route, case_id in PATH_BY_SUITE[suite]:
        rules = definitions.get(case_id)
        if not rules:
            failures.append(f"missing case definition {case_id}")
            continue

        response = await client.get(route)
        if response.status_code != 200:
            failures.append(f"{case_id}: HTTP {response.status_code} for {route}")
            continue

        try:
            payload = response.json()
        except json.JSONDecodeError:
            failures.append(f"{case_id}: invalid JSON from {route}")
        else:
            failures.extend(_check_case(case_id, payload, rules))

    return failures
async def main_async(args: argparse.Namespace) -> int:
    """Run the selected suites against an in-process app and report results.

    The app returned by ``create_app()`` is served through an httpx ASGI
    transport. The suites come from ``args.suite``, falling back to the
    default suites when none were given.

    Returns:
        0 after printing ``EVAL_OK`` when no errors were collected, otherwise
        1 after printing ``EVAL_FAIL`` followed by one error per line.
    """
    selected = args.suite or list(_DEFAULT_SUITES)
    asgi = httpx.ASGITransport(app=create_app())
    failures: list[str] = []

    async with httpx.AsyncClient(transport=asgi, base_url="http://eval") as client:
        for suite_name in selected:
            failures += await _run_suite(client, suite_name)

    if not failures:
        print("EVAL_OK suites=", ",".join(selected))
        return 0

    print("EVAL_FAIL")
    for message in failures:
        print(message)
    return 1
def main() -> int:
    """Parse command-line flags and run the evals; return the exit code.

    ``--suite`` may be repeated and is restricted to the suite names
    registered in ``PATH_BY_SUITE``.
    """
    parser = argparse.ArgumentParser()
    suite_choices = list(PATH_BY_SUITE)
    parser.add_argument(
        "--suite",
        action="append",
        choices=suite_choices,
        help="Repeatable; default runs both suites",
    )
    return asyncio.run(main_async(parser.parse_args()))
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())