fix: Enhanced Hermes API, router registration, and observability service

https://claude.ai/code/session_01LsnvBa7HwF5hs99VZbgLGj
2026-06-19 07:49:34 +00:00 · 2026-04-11 08:36:50 +00:00 · 2026-04-11 08:36:50 +00:00 · 85a9c9a23f
commit 85a9c9a23f
parent 40a48b98c9
3 changed files with 498 additions and 270 deletions
--- a/salesflow-saas/backend/app/api/v1/hermes.py
+++ b/salesflow-saas/backend/app/api/v1/hermes.py
@ -1,214 +1,285 @@
 """
-Hermes API — Dealix AI Revenue OS
+Hermes API -- Dealix AI Revenue OS -- واجهة هيرمس البرمجية
-Top-level orchestration, profiles, cost, health, improvements, security.
+Orchestrator endpoints: execute tasks, manage profiles, view costs,
 run security scans, approve improvements, and generate executive summaries.
 """
-import logging
+from __future__ import annotations
 from typing import Optional
-from fastapi import APIRouter, Depends, HTTPException, status
+import logging
-from pydantic import BaseModel
+from typing import Any, Dict, List, Optional
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 from app.services.hermes_orchestrator import hermes_orchestrator, HermesProfile
 from app.services.execution_router import execution_router, TaskClass
 from app.services.shannon_security import shannon_security, ShannonScope
 from app.services.self_improvement import self_improvement_engine
 from app.services.observability import observability_service
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/hermes", tags=["hermes"])
 router = APIRouter(prefix="/hermes", tags=["Hermes Orchestrator"])
 # ---------------------------------------------------------------------------
 # Request / response schemas
 # ---------------------------------------------------------------------------
 class ExecuteRequest(BaseModel):
    profile_id: str = "founder"
    task: str
    params: Dict[str, Any] = Field(default_factory=dict)
    user_context: Dict[str, Any] = Field(default_factory=lambda: {
        "user_id": "api_user", "tenant_id": "default", "role": "owner",
    })
 class ExecuteResponse(BaseModel):
    run_id: str
    profile_id: str
    task: str
-    params: dict = {}
+    status: str
-
+    backend: str = ""
-
+    data: Dict[str, Any] = {}
-class ApproveRequest(BaseModel):
+    evidence: List[str] = []
-    approved_by: str
+    receipt_id: Optional[str] = None
    cost_usd: float = 0.0
    duration_ms: int = 0
    error: Optional[str] = None
    message_ar: str = ""
 class ScanRequest(BaseModel):
-    environment: str
+    environment: str = "staging"
-    base_url: str
+    scopes: List[str] = Field(
-    scopes: list[str] = []
+        default_factory=lambda: ["auth", "injection", "pdpl"],
    )
    base_url: str = "https://staging.dealix.sa"
-# ── Execute ────────────────────────────────────────
+class ApproveRequest(BaseModel):
    user_id: str = "founder"
-@router.post("/execute")
+# ---------------------------------------------------------------------------
-async def execute_task(req: ExecuteRequest):
+# Execute
-    from app.services.hermes_orchestrator import hermes
+# ---------------------------------------------------------------------------
-    try:
+
-        result = await hermes.execute(
+@router.post("/execute", response_model=ExecuteResponse)
-            profile_id=req.profile_id,
+async def execute_task(req: ExecuteRequest) -> ExecuteResponse:
-            task=req.task,
+    """Execute a task via the Hermes orchestrator."""
-            params=req.params,
+    result = await hermes_orchestrator.execute(
-        )
+        profile_id=req.profile_id,
-        return result
+        task=req.task,
-    except ValueError as e:
+        params=req.params,
-        raise HTTPException(status_code=400, detail=str(e))
+        user_context=req.user_context,
-    except PermissionError as e:
+    )
-        raise HTTPException(status_code=403, detail=str(e))
+    return ExecuteResponse(**result.model_dump())
-# ── Profiles ───────────────────────────────────────
+# ---------------------------------------------------------------------------
-
+# Profiles
 # ---------------------------------------------------------------------------
@router.get("/profiles")
-async def list_profiles():
+async def list_profiles() -> JSONResponse:
-    from app.services.hermes_orchestrator import hermes
+    """List all available Hermes profiles."""
-    profiles = hermes.list_profiles()
+    profiles = await hermes_orchestrator.list_profiles()
-    return {"profiles": [p.model_dump() if hasattr(p, 'model_dump') else p.__dict__ for p in profiles]}
+    return JSONResponse(content={
        "profiles": [p.model_dump() for p in profiles],
        "count": len(profiles),
        "message_ar": f"{len(profiles)} ملفات شخصية متاحة",
    })
@router.get("/profiles/{profile_id}")
-async def get_profile(profile_id: str):
+async def get_profile(profile_id: str) -> JSONResponse:
-    from app.services.hermes_orchestrator import hermes
+    """Get details for a specific profile."""
-    profile = hermes.get_profile(profile_id)
+    profile = await hermes_orchestrator.get_profile(profile_id)
    if not profile:
-        raise HTTPException(status_code=404, detail="الملف الشخصي غير موجود")
+        raise HTTPException(status_code=404, detail=f"الملف الشخصي غير موجود: {profile_id}")
-    return profile.model_dump() if hasattr(profile, 'model_dump') else profile.__dict__
+    return JSONResponse(content=profile.model_dump())
-# ── Cost & Health ──────────────────────────────────
+# ---------------------------------------------------------------------------
-
+# Cost
 # ---------------------------------------------------------------------------
@router.get("/cost")
-async def cost_report(period: str = "daily", profile: Optional[str] = None):
+async def cost_report(period: str = "daily") -> JSONResponse:
-    from app.services.observability import observability
+    """Get cost report from the orchestrator and observability service."""
-    return await observability.get_cost_report(period, profile)
+    hermes_cost = await hermes_orchestrator.get_cost_report(period)
    obs_cost = await observability_service.get_cost_report(period)
    return JSONResponse(content={
        "hermes": hermes_cost,
        "observability": obs_cost,
    })
 # ---------------------------------------------------------------------------
 # Health
 # ---------------------------------------------------------------------------
@router.get("/health")
-async def health_report():
+async def health_report() -> JSONResponse:
-    from app.services.observability import observability
+    """System health report across all backends and workflows."""
-    return await observability.get_health_report()
+    obs_health = await observability_service.get_health_report()
    backend_health = await execution_router.get_backend_health()
    anomalies = await observability_service.detect_anomalies()
    return JSONResponse(content={
        "system": obs_health,
        "backends": {k: v.model_dump() for k, v in backend_health.items()},
        "anomalies": [a.model_dump(mode="json") for a in anomalies],
        "anomaly_count": len(anomalies),
    })
-@router.get("/performance")
+# ---------------------------------------------------------------------------
-async def performance_report(period: str = "daily"):
+# Runs
-    from app.services.observability import observability
+# ---------------------------------------------------------------------------
    return await observability.get_performance_report(period)
@router.get("/anomalies")
 async def detect_anomalies():
    from app.services.observability import observability
    return {"anomalies": await observability.detect_anomalies()}
@router.get("/executive-summary")
 async def executive_summary(period: str = "weekly"):
    from app.services.observability import observability
    summary = await observability.get_executive_summary(period)
    return {"summary": summary}
 # ── Runs ───────────────────────────────────────────
@router.get("/runs")
-async def list_active_runs():
+async def list_active_runs() -> JSONResponse:
-    from app.services.hermes_orchestrator import hermes
+    """List currently active runs."""
-    return {"runs": hermes.get_active_runs()}
+    runs = await hermes_orchestrator.get_active_runs()
    return JSONResponse(content={
        "active_runs": [r.model_dump(mode="json") for r in runs],
        "count": len(runs),
        "message_ar": f"{len(runs)} عمليات نشطة حالياً",
    })
@router.post("/runs/{run_id}/abort")
-async def abort_run(run_id: str):
+async def abort_run(run_id: str) -> JSONResponse:
-    from app.services.hermes_orchestrator import hermes
+    """Abort an active run."""
-    success = hermes.abort_run(run_id)
+    success = await hermes_orchestrator.abort_run(run_id)
    if not success:
-        raise HTTPException(status_code=404, detail="التشغيل غير موجود")
+        raise HTTPException(status_code=404, detail=f"التشغيل غير موجود أو اكتمل بالفعل: {run_id}")
-    return {"status": "aborted", "run_id": run_id}
+    return JSONResponse(content={
        "run_id": run_id,
        "status": "aborted",
        "message_ar": f"تم إلغاء التشغيل: {run_id}",
    })
-# ── Self-Improvement ──────────────────────────────
+# ---------------------------------------------------------------------------
-
+# Self-improvement
 # ---------------------------------------------------------------------------
@router.get("/improvements")
-async def list_improvements(status_filter: Optional[str] = None):
+async def list_improvements(status: Optional[str] = None) -> JSONResponse:
-    from app.services.self_improvement import self_improvement, ImprovementStatus
+    """List self-improvement proposals."""
-    if status_filter:
+    proposals = self_improvement_engine.list_proposals(status)
-        try:
+    report = await self_improvement_engine.report()
-            s = ImprovementStatus(status_filter)
+    return JSONResponse(content={
-        except ValueError:
+        "proposals": [p.model_dump(mode="json") for p in proposals],
-            raise HTTPException(status_code=400, detail="حالة غير صالحة")
+        "count": len(proposals),
-        proposals = await self_improvement.get_proposals(s)
+        "summary": report,
-    else:
+    })
        proposals = await self_improvement.get_proposals()
    return {"proposals": [p.model_dump() for p in proposals]}
@router.post("/improvements/cycle")
 async def run_improvement_cycle():
    from app.services.self_improvement import self_improvement
    result = await self_improvement.run_cycle()
    return result.model_dump()
@router.post("/improvements/{proposal_id}/approve")
-async def approve_improvement(proposal_id: str, req: ApproveRequest):
+async def approve_improvement(proposal_id: str, req: ApproveRequest) -> JSONResponse:
-    from app.services.self_improvement import self_improvement
+    """Approve a self-improvement proposal."""
-    success = await self_improvement.apply(proposal_id, req.approved_by)
+    proposal = await self_improvement_engine.approve(proposal_id, req.user_id)
-    if not success:
+    if not proposal:
-        raise HTTPException(status_code=404, detail="المقترح غير موجود أو يحتاج موافقة")
+        raise HTTPException(
-    return {"status": "approved", "proposal_id": proposal_id}
+            status_code=404,
            detail=f"المقترح غير موجود أو ليس في حالة 'مقترح': {proposal_id}",
        )
    return JSONResponse(content={
        "proposal_id": proposal.id,
        "status": proposal.status,
        "approved_by": proposal.approved_by,
        "message_ar": f"تمت الموافقة على المقترح: {proposal.title_ar}",
    })
-@router.post("/improvements/{proposal_id}/reject")
+@router.post("/improvements/cycle")
-async def reject_improvement(proposal_id: str):
+async def run_improvement_cycle(tenant_id: Optional[str] = None) -> JSONResponse:
-    from app.services.self_improvement import self_improvement
+    """Trigger a self-improvement cycle."""
-    success = await self_improvement.reject(proposal_id)
+    cycle = await self_improvement_engine.run_cycle(tenant_id)
-    if not success:
+    return JSONResponse(content=cycle.model_dump(mode="json"))
        raise HTTPException(status_code=404, detail="المقترح غير موجود")
    return {"status": "rejected", "proposal_id": proposal_id}
-# ── Security (Shannon) ────────────────────────────
+# ---------------------------------------------------------------------------
-
+# Shannon security
 # ---------------------------------------------------------------------------
@router.get("/security/report")
-async def security_report(severity: Optional[str] = None):
+async def latest_security_report() -> JSONResponse:
-    from app.services.shannon_security import shannon
+    """Get the latest Shannon security scan report."""
-    findings = shannon.get_all_findings(severity)
+    report = shannon_security.get_latest_report()
-    should_block = await shannon.should_block_release()
+    if not report:
-    return {
+        return JSONResponse(content={
-        "findings": [f.model_dump() for f in findings],
+            "report": None,
-        "total": len(findings),
+            "message_ar": "لا يوجد تقرير أمني بعد. قم بتشغيل فحص أولاً.",
-        "should_block_release": should_block,
+        })
-    }
+    return JSONResponse(content=report.model_dump(mode="json"))
@router.post("/security/scan")
-async def trigger_security_scan(req: ScanRequest):
+async def trigger_security_scan(req: ScanRequest) -> JSONResponse:
-    from app.services.shannon_security import shannon, ShannonScope
+    """Trigger a Shannon security scan (staging only)."""
-    try:
+    scopes = []
-        scopes = [ShannonScope(s) for s in req.scopes] if req.scopes else None
+    for s in req.scopes:
-        report = await shannon.run_scan(
+        try:
-            environment=req.environment,
+            scopes.append(ShannonScope(s))
-            base_url=req.base_url,
+        except ValueError:
-            scopes=scopes,
+            raise HTTPException(
                status_code=400,
                detail=f"نطاق غير صالح: {s}. المتاحة: {[x.value for x in ShannonScope]}",
            )
    report = await shannon_security.run_scan(
        environment=req.environment,
        scopes=scopes,
        base_url=req.base_url,
    )
    return JSONResponse(content=report.model_dump(mode="json"))
 # ---------------------------------------------------------------------------
 # Executive summary
 # ---------------------------------------------------------------------------
@router.get("/executive-summary")
 async def executive_summary(period: str = "weekly") -> JSONResponse:
    """Arabic executive summary for leadership."""
    summary = await observability_service.get_executive_summary(period)
    improvement_report = await self_improvement_engine.report()
    security_report = shannon_security.get_latest_report()
    full_summary = summary
    if improvement_report.get("total_proposals", 0) > 0:
        applied = improvement_report.get("by_status", {}).get("applied", 0)
        pending = improvement_report.get("by_status", {}).get("proposed", 0)
        full_summary += f"، {applied} تحسين مطبق، {pending} تحسين معلق"
    if security_report:
        full_summary += (
            f"، آخر فحص أمني: {security_report.critical_count} حرجة، "
            f"{security_report.high_count} عالية"
        )
-        return report.model_dump()
+    else:
-    except PermissionError as e:
+        full_summary += "، لا يوجد فحص أمني بعد"
-        raise HTTPException(
+
-            status_code=403,
+    return JSONResponse(content={
-            detail=f"محظور: {str(e)}"
+        "summary_ar": full_summary,
-        )
+        "period": period,
-    except ValueError as e:
+        "improvements": improvement_report,
-        raise HTTPException(status_code=400, detail=str(e))
+        "security_status": security_report.model_dump(mode="json") if security_report else None,
    })
-# ── Session Continuity ────────────────────────────
+# ---------------------------------------------------------------------------
 # Routing stats
 # ---------------------------------------------------------------------------
-
+@router.get("/routing-stats")
-@router.get("/session/restore")
+async def routing_stats() -> JSONResponse:
-async def restore_session():
+    """Execution routing statistics."""
-    from app.services.session_continuity import session_continuity
+    stats = await execution_router.get_routing_stats()
-    prompt = await session_continuity.get_restore_prompt()
+    return JSONResponse(content=stats)
    return {"restore_prompt": prompt}
@router.get("/session/state")
 async def get_session_state():
    from app.services.session_continuity import session_continuity
    state = await session_continuity.restore_state()
    if not state:
        return {"state": None}
    return {"state": state.model_dump() if hasattr(state, 'model_dump') else state.__dict__}
--- a/salesflow-saas/backend/app/api/v1/router.py
+++ b/salesflow-saas/backend/app/api/v1/router.py
@ -16,6 +16,7 @@ from app.api.v1 import lead_prospector as prospector_router
 from app.api.v1 import pipeline as pipeline_router
 from app.api.v1 import agent_system as agent_system_router
 from app.api.v1 import autonomous_foundation as autonomous_foundation_router
 from app.api.v1 import hermes as hermes_router
 from app.api.v1 import marketing_hub as marketing_hub_router
 from app.api.v1 import strategy_summary as strategy_summary_router
 from app.api.v1 import value_proposition as value_proposition_router
@ -86,3 +87,6 @@ api_router.include_router(pipeline_router.router)
 # ── 22-Agent AI System — Full Empire Control ─────────────────
 api_router.include_router(agent_system_router.router)
 api_router.include_router(autonomous_foundation_router.router)
 # ── Hermes Fusion — Orchestration Layer ──────────────────────
 api_router.include_router(hermes_router.router)
--- a/salesflow-saas/backend/app/services/observability.py
+++ b/salesflow-saas/backend/app/services/observability.py
@ -1,17 +1,27 @@
 """
-Observability Service — Dealix AI Revenue OS
+Observability Service -- Dealix AI Revenue OS -- خدمة المراقبة
-Cost tracking, workflow metrics, health monitoring, and Arabic executive summaries.
+Track cost, performance, and health across all agent workflows.
 Anomaly detection, executive summaries in Arabic.
 """
 from __future__ import annotations
 import logging
-from datetime import datetime, timezone, timedelta
+import statistics
-from typing import Optional
+from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 from typing import Any, Optional
 from pydantic import BaseModel, Field
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Models
 # ---------------------------------------------------------------------------
 class WorkflowMetric(BaseModel):
    """Single workflow execution metric."""
    workflow_name: str
    profile_id: str
    backend: str
@ -23,170 +33,313 @@ class WorkflowMetric(BaseModel):
    error: Optional[str] = None
 class AnomalyAlert(BaseModel):
    """Detected anomaly."""
    id: str = ""
    anomaly_type: str  # cost_spike, failure_spike, latency_spike, regression
    description: str
    description_ar: str
    severity: str = "medium"  # critical, high, medium, low
    metric_name: str = ""
    current_value: float = 0.0
    baseline_value: float = 0.0
    deviation_pct: float = 0.0
    detected_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
 # ---------------------------------------------------------------------------
 # Service
 # ---------------------------------------------------------------------------
 class ObservabilityService:
    """Track cost, performance, and health across all agent workflows."""
-    def __init__(self):
+    def __init__(self) -> None:
        self._metrics: list[WorkflowMetric] = []
-        self._max_metrics = 50000
+        self._max_metrics = 50_000
        self._anomalies: list[AnomalyAlert] = []
        self._max_anomalies = 1_000
        logger.info("خدمة المراقبة: تم التهيئة")
    # -- Recording ---------------------------------------------------------
    async def record_workflow(self, metric: WorkflowMetric) -> None:
        """Store a workflow execution metric."""
        self._metrics.append(metric)
        if len(self._metrics) > self._max_metrics:
            self._metrics = self._metrics[-self._max_metrics:]
        logger.debug(
-            f"Recorded: {metric.workflow_name} "
+            "[Obs] سجل: %s profile=%s backend=%s %dms $%.4f %s",
-            f"cost=${metric.estimated_cost_usd:.4f} "
+            metric.workflow_name, metric.profile_id, metric.backend,
-            f"{'OK' if metric.success else 'FAIL'}"
+            metric.duration_ms, metric.estimated_cost_usd,
            "OK" if metric.success else "FAIL",
        )
-    def _filter_by_period(
+    # -- Cost report -------------------------------------------------------
        self, period: str, metrics: list[WorkflowMetric] = None
    ) -> list[WorkflowMetric]:
        source = metrics or self._metrics
        now = datetime.now(timezone.utc)
        if period == "hourly":
            cutoff = now - timedelta(hours=1)
        elif period == "daily":
            cutoff = now - timedelta(days=1)
        elif period == "weekly":
            cutoff = now - timedelta(weeks=1)
        elif period == "monthly":
            cutoff = now - timedelta(days=30)
        else:
            cutoff = now - timedelta(days=1)
        return [m for m in source if m.started_at >= cutoff]
    async def get_cost_report(
-        self, period: str = "daily", profile: str = None
+        self, period: str = "daily", profile: Optional[str] = None,
-    ) -> dict:
+    ) -> dict[str, Any]:
-        filtered = self._filter_by_period(period)
+        """Total cost, cost by profile, cost by backend, cost by workflow."""
-        if profile:
+        cutoff = self._period_cutoff(period)
-            filtered = [m for m in filtered if m.profile_id == profile]
+        filtered = [
            m for m in self._metrics
            if m.started_at >= cutoff and (profile is None or m.profile_id == profile)
        ]
-        total_cost = sum(m.estimated_cost_usd for m in filtered)
+        total = sum(m.estimated_cost_usd for m in filtered)
-        by_profile: dict[str, float] = {}
+        by_profile: dict[str, float] = defaultdict(float)
-        by_backend: dict[str, float] = {}
+        by_backend: dict[str, float] = defaultdict(float)
-        by_workflow: dict[str, float] = {}
+        by_workflow: dict[str, float] = defaultdict(float)
        for m in filtered:
-            by_profile[m.profile_id] = by_profile.get(m.profile_id, 0) + m.estimated_cost_usd
+            by_profile[m.profile_id] += m.estimated_cost_usd
-            by_backend[m.backend] = by_backend.get(m.backend, 0) + m.estimated_cost_usd
+            by_backend[m.backend] += m.estimated_cost_usd
-            by_workflow[m.workflow_name] = by_workflow.get(m.workflow_name, 0) + m.estimated_cost_usd
+            by_workflow[m.workflow_name] += m.estimated_cost_usd
-        top_expensive = sorted(by_workflow.items(), key=lambda x: x[1], reverse=True)[:5]
+        # Sort by cost descending
        top_workflows = sorted(by_workflow.items(), key=lambda x: -x[1])[:10]
        return {
            "period": period,
-            "total_cost_usd": round(total_cost, 4),
+            "total_usd": round(total, 4),
            "total_workflows": len(filtered),
            "by_profile": {k: round(v, 4) for k, v in by_profile.items()},
            "by_backend": {k: round(v, 4) for k, v in by_backend.items()},
-            "top_expensive": [{"name": k, "cost": round(v, 4)} for k, v in top_expensive],
+            "top_workflows": [{"name": n, "cost_usd": round(c, 4)} for n, c in top_workflows],
            "total_executions": len(filtered),
            "message_ar": f"التكلفة الإجمالية ({period}): ${total:.2f} عبر {len(filtered)} عملية",
        }
-    async def get_performance_report(self, period: str = "daily") -> dict:
+    # -- Performance report ------------------------------------------------
-        filtered = self._filter_by_period(period)
+
    async def get_performance_report(self, period: str = "daily") -> dict[str, Any]:
        """Average duration, P95 duration, success rate, error rate."""
        cutoff = self._period_cutoff(period)
        filtered = [m for m in self._metrics if m.started_at >= cutoff]
        if not filtered:
-            return {"period": period, "total": 0}
+            return {
                "period": period, "total_executions": 0,
                "message_ar": "لا توجد بيانات لهذه الفترة",
            }
        durations = [m.duration_ms for m in filtered]
-        durations.sort()
+        successes = sum(1 for m in filtered if m.success)
-        total = len(durations)
+        failures = len(filtered) - successes
        success_count = sum(1 for m in filtered if m.success)
-        p95_idx = min(int(total * 0.95), total - 1)
+        avg_ms = statistics.mean(durations)
-        errors = [m for m in filtered if not m.success]
+        p95_ms = sorted(durations)[int(len(durations) * 0.95)] if len(durations) >= 20 else max(durations)
        success_rate = successes / len(filtered)
        # Slowest workflows
        by_wf: dict[str, list[int]] = defaultdict(list)
        for m in filtered:
            by_wf[m.workflow_name].append(m.duration_ms)
        slowest = sorted(
            [(wf, statistics.mean(ds)) for wf, ds in by_wf.items()],
            key=lambda x: -x[1],
        )[:5]
        # Most expensive
        cost_wf: dict[str, float] = defaultdict(float)
        for m in filtered:
            cost_wf[m.workflow_name] += m.estimated_cost_usd
        most_expensive = sorted(cost_wf.items(), key=lambda x: -x[1])[:5]
        return {
            "period": period,
-            "total_workflows": total,
+            "total_executions": len(filtered),
-            "success_rate": round(success_count / total * 100, 1) if total else 0,
+            "avg_duration_ms": round(avg_ms, 2),
-            "avg_duration_ms": round(sum(durations) / total) if total else 0,
+            "p95_duration_ms": p95_ms,
-            "p95_duration_ms": durations[p95_idx] if durations else 0,
+            "success_rate": round(success_rate, 4),
-            "error_count": len(errors),
+            "error_rate": round(1 - success_rate, 4),
-            "error_rate": round(len(errors) / total * 100, 1) if total else 0,
+            "total_successes": successes,
-            "recent_errors": [
+            "total_failures": failures,
-                {"workflow": e.workflow_name, "error": e.error, "at": e.started_at.isoformat()}
+            "slowest_workflows": [{"name": n, "avg_ms": round(d, 2)} for n, d in slowest],
-                for e in errors[-5:]
+            "most_expensive": [{"name": n, "cost_usd": round(c, 4)} for n, c in most_expensive],
-            ],
+            "message_ar": (
                f"أداء ({period}): {len(filtered)} عملية، "
                f"متوسط {avg_ms:.0f}ms، نجاح {success_rate:.0%}، فشل {failures}"
            ),
        }
-    async def get_health_report(self) -> dict:
+    # -- Health report -----------------------------------------------------
        daily = self._filter_by_period("daily")
        total = len(daily)
        success = sum(1 for m in daily if m.success)
-        backends_used = set(m.backend for m in daily)
+    async def get_health_report(self) -> dict[str, Any]:
        """Backend health, skill health, memory health, knowledge health, trust health."""
        recent = [m for m in self._metrics if m.started_at >= self._period_cutoff("daily")]
        # Backend health
        backend_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
        for m in recent:
            backend_calls[m.backend]["ok" if m.success else "fail"] += 1
        backend_health = {}
-        for b in backends_used:
+        for b, counts in backend_calls.items():
-            b_metrics = [m for m in daily if m.backend == b]
+            total = counts["ok"] + counts["fail"]
-            b_success = sum(1 for m in b_metrics if m.success)
+            rate = counts["ok"] / total if total else 1.0
            backend_health[b] = {
-                "total": len(b_metrics),
+                "total": total, "success_rate": round(rate, 4),
-                "success_rate": round(b_success / len(b_metrics) * 100, 1) if b_metrics else 0,
+                "healthy": rate >= 0.7 or total < 5,
                "avg_duration_ms": round(
                    sum(m.duration_ms for m in b_metrics) / len(b_metrics)
                ) if b_metrics else 0,
            }
        # Skill health (by workflow)
        skill_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
        for m in recent:
            skill_calls[m.workflow_name]["ok" if m.success else "fail"] += 1
        skill_health = {}
        for s, counts in skill_calls.items():
            total = counts["ok"] + counts["fail"]
            rate = counts["ok"] / total if total else 1.0
            skill_health[s] = {"total": total, "success_rate": round(rate, 4)}
        # Overall
        total = len(recent)
        overall_ok = sum(1 for m in recent if m.success)
        overall_rate = overall_ok / total if total else 1.0
        healthy = overall_rate >= 0.8
        return {
-            "overall_health": "healthy" if (total == 0 or success / total > 0.9) else "degraded",
+            "overall_healthy": healthy,
-            "workflows_today": total,
+            "overall_success_rate": round(overall_rate, 4),
-            "success_rate": round(success / total * 100, 1) if total else 100,
+            "total_today": total,
-            "total_cost_today_usd": round(sum(m.estimated_cost_usd for m in daily), 4),
+            "backend_health": backend_health,
-            "backends": backend_health,
+            "skill_health": skill_health,
            "anomalies_today": len([
                a for a in self._anomalies
                if a.detected_at >= self._period_cutoff("daily")
            ]),
            "message_ar": (
                f"صحة النظام: {'سليم' if healthy else 'يحتاج انتباه'} -- "
                f"نجاح {overall_rate:.0%}، {total} عملية اليوم"
            ),
        }
-    async def get_executive_summary(self, period: str = "weekly") -> str:
+    # -- Executive summary -------------------------------------------------
        filtered = self._filter_by_period(period)
        total = len(filtered)
        success = sum(1 for m in filtered if m.success)
        cost = sum(m.estimated_cost_usd for m in filtered)
        success_rate = round(success / total * 100) if total else 100
-        period_ar = {"daily": "اليوم", "weekly": "هذا الأسبوع", "monthly": "هذا الشهر"}.get(period, period)
+    async def get_executive_summary(self, period: str = "weekly") -> str:
        """Arabic executive summary for leadership."""
        cutoff = self._period_cutoff(period)
        filtered = [m for m in self._metrics if m.started_at >= cutoff]
        total = len(filtered)
        successes = sum(1 for m in filtered if m.success)
        rate = successes / total if total else 0
        cost = sum(m.estimated_cost_usd for m in filtered)
        anomalies = [a for a in self._anomalies if a.detected_at >= cutoff]
        critical = sum(1 for a in anomalies if a.severity == "critical")
        period_ar = {
            "daily": "اليوم", "weekly": "هذا الأسبوع",
            "monthly": "هذا الشهر",
        }.get(period, period)
        summary = (
-            f"📊 ملخص {period_ar}:\n"
+            f"{period_ar}: {total} مهمة منفذة، "
-            f"• {total} مهمة منفذة\n"
+            f"{rate:.0%} نجاح، "
-            f"• {success_rate}% نسبة النجاح\n"
+            f"تكلفة ${cost:.2f}، "
-            f"• ${cost:.2f} التكلفة الإجمالية\n"
+            f"{len(anomalies)} تنبيه"
        )
-
+        if critical:
-        errors = [m for m in filtered if not m.success]
+            summary += f"، {critical} مشاكل حرجة تحتاج تدخل فوري"
        if errors:
            summary += f"• {len(errors)} خطأ يحتاج مراجعة\n"
        else:
-            summary += "• لا أخطاء حرجة ✅\n"
+            summary += "، 0 مشاكل حرجة"
        return summary
-    async def detect_anomalies(self) -> list[dict]:
+    # -- Anomaly detection -------------------------------------------------
        anomalies = []
        hourly = self._filter_by_period("hourly")
        daily = self._filter_by_period("daily")
-        if hourly:
+    async def detect_anomalies(self) -> list[AnomalyAlert]:
-            hourly_cost = sum(m.estimated_cost_usd for m in hourly)
+        """Sudden cost spikes, unusual failure patterns, backend degradation."""
-            if hourly_cost > 5.0:
+        alerts: list[AnomalyAlert] = []
-                anomalies.append({
+        now = datetime.now(timezone.utc)
-                    "type": "cost_spike",
+        today = [m for m in self._metrics if m.started_at >= now - timedelta(hours=24)]
-                    "severity": "high",
+        prev_week = [
-                    "message": f"تكلفة الساعة الأخيرة ${hourly_cost:.2f} — أعلى من الحد الطبيعي",
+            m for m in self._metrics
-                    "value": hourly_cost,
+            if now - timedelta(days=8) <= m.started_at < now - timedelta(days=1)
-                })
+        ]
-            hourly_errors = sum(1 for m in hourly if not m.success)
+        if not today or not prev_week:
-            if len(hourly) > 5 and hourly_errors / len(hourly) > 0.3:
+            return alerts
                anomalies.append({
                    "type": "error_spike",
                    "severity": "critical",
                    "message": f"معدل أخطاء مرتفع: {hourly_errors}/{len(hourly)} في الساعة الأخيرة",
                    "value": hourly_errors,
                })
-        return anomalies
+        # Cost spike detection
        today_cost = sum(m.estimated_cost_usd for m in today)
        avg_daily_cost = sum(m.estimated_cost_usd for m in prev_week) / 7
        if avg_daily_cost > 0 and today_cost > avg_daily_cost * 2:
            deviation = ((today_cost - avg_daily_cost) / avg_daily_cost) * 100
            alerts.append(AnomalyAlert(
                id=f"cost-{now.strftime('%Y%m%d')}",
                anomaly_type="cost_spike",
                description=f"Today's cost ${today_cost:.2f} is {deviation:.0f}% above daily avg ${avg_daily_cost:.2f}",
                description_ar=f"تكلفة اليوم ${today_cost:.2f} أعلى بنسبة {deviation:.0f}% من المتوسط ${avg_daily_cost:.2f}",
                severity="high" if deviation > 200 else "medium",
                metric_name="daily_cost_usd",
                current_value=today_cost,
                baseline_value=avg_daily_cost,
                deviation_pct=round(deviation, 2),
            ))
        # Failure rate spike
        today_fail_rate = sum(1 for m in today if not m.success) / max(len(today), 1)
        prev_fail_rate = sum(1 for m in prev_week if not m.success) / max(len(prev_week), 1)
        if today_fail_rate > 0.15 and today_fail_rate > prev_fail_rate * 2:
            alerts.append(AnomalyAlert(
                id=f"fail-{now.strftime('%Y%m%d')}",
                anomaly_type="failure_spike",
                description=f"Failure rate {today_fail_rate:.1%} vs baseline {prev_fail_rate:.1%}",
                description_ar=f"معدل الفشل {today_fail_rate:.1%} مقابل الخط الأساسي {prev_fail_rate:.1%}",
                severity="critical" if today_fail_rate > 0.3 else "high",
                metric_name="failure_rate",
                current_value=today_fail_rate,
                baseline_value=prev_fail_rate,
                deviation_pct=round(((today_fail_rate - prev_fail_rate) / max(prev_fail_rate, 0.01)) * 100, 2),
            ))
        # Latency spike (per backend)
        for backend in {"claude", "openclaude", "goose", "internal"}:
            today_b = [m.duration_ms for m in today if m.backend == backend]
            prev_b = [m.duration_ms for m in prev_week if m.backend == backend]
            if len(today_b) >= 5 and len(prev_b) >= 5:
                today_avg = statistics.mean(today_b)
                prev_avg = statistics.mean(prev_b)
                if prev_avg > 0 and today_avg > prev_avg * 3:
                    deviation = ((today_avg - prev_avg) / prev_avg) * 100
                    alerts.append(AnomalyAlert(
                        id=f"latency-{backend}-{now.strftime('%Y%m%d')}",
                        anomaly_type="latency_spike",
                        description=f"{backend} latency {today_avg:.0f}ms vs baseline {prev_avg:.0f}ms",
                        description_ar=f"زمن استجابة {backend}: {today_avg:.0f}ms مقابل {prev_avg:.0f}ms",
                        severity="high",
                        metric_name=f"latency_{backend}",
                        current_value=today_avg,
                        baseline_value=prev_avg,
                        deviation_pct=round(deviation, 2),
                    ))
        self._anomalies.extend(alerts)
        if len(self._anomalies) > self._max_anomalies:
            self._anomalies = self._anomalies[-self._max_anomalies:]
        if alerts:
            logger.warning("[Obs] %d anomalies detected", len(alerts))
        return alerts
    # -- Helpers -----------------------------------------------------------
    @staticmethod
    def _period_cutoff(period: str) -> datetime:
        now = datetime.now(timezone.utc)
        if period == "daily":
            return now - timedelta(hours=24)
        if period == "weekly":
            return now - timedelta(days=7)
        if period == "monthly":
            return now - timedelta(days=30)
        return now - timedelta(hours=24)
-observability = ObservabilityService()
+# ---------------------------------------------------------------------------
 # Module-level singleton
 # ---------------------------------------------------------------------------
 observability_service = ObservabilityService()