fix: Enhanced Hermes API, router registration, and observability service

https://claude.ai/code/session_01LsnvBa7HwF5hs99VZbgLGj
This commit is contained in:
Claude 2026-04-11 08:36:50 +00:00
parent 40a48b98c9
commit 85a9c9a23f
No known key found for this signature in database
3 changed files with 498 additions and 270 deletions

View File

@ -1,214 +1,285 @@
""" """
Hermes API Dealix AI Revenue OS Hermes API -- Dealix AI Revenue OS -- واجهة هيرمس البرمجية
Top-level orchestration, profiles, cost, health, improvements, security. Orchestrator endpoints: execute tasks, manage profiles, view costs,
run security scans, approve improvements, and generate executive summaries.
""" """
import logging from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status import logging
from pydantic import BaseModel from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from app.services.hermes_orchestrator import hermes_orchestrator, HermesProfile
from app.services.execution_router import execution_router, TaskClass
from app.services.shannon_security import shannon_security, ShannonScope
from app.services.self_improvement import self_improvement_engine
from app.services.observability import observability_service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
router = APIRouter(prefix="/hermes", tags=["hermes"])
router = APIRouter(prefix="/hermes", tags=["Hermes Orchestrator"])
# ---------------------------------------------------------------------------
# Request / response schemas
# ---------------------------------------------------------------------------
class ExecuteRequest(BaseModel): class ExecuteRequest(BaseModel):
profile_id: str = "founder"
task: str
params: Dict[str, Any] = Field(default_factory=dict)
user_context: Dict[str, Any] = Field(default_factory=lambda: {
"user_id": "api_user", "tenant_id": "default", "role": "owner",
})
class ExecuteResponse(BaseModel):
run_id: str
profile_id: str profile_id: str
task: str task: str
params: dict = {} status: str
backend: str = ""
data: Dict[str, Any] = {}
class ApproveRequest(BaseModel): evidence: List[str] = []
approved_by: str receipt_id: Optional[str] = None
cost_usd: float = 0.0
duration_ms: int = 0
error: Optional[str] = None
message_ar: str = ""
class ScanRequest(BaseModel): class ScanRequest(BaseModel):
environment: str environment: str = "staging"
base_url: str scopes: List[str] = Field(
scopes: list[str] = [] default_factory=lambda: ["auth", "injection", "pdpl"],
)
base_url: str = "https://staging.dealix.sa"
# ── Execute ──────────────────────────────────────── class ApproveRequest(BaseModel):
user_id: str = "founder"
@router.post("/execute") # ---------------------------------------------------------------------------
async def execute_task(req: ExecuteRequest): # Execute
from app.services.hermes_orchestrator import hermes # ---------------------------------------------------------------------------
try:
result = await hermes.execute( @router.post("/execute", response_model=ExecuteResponse)
async def execute_task(req: ExecuteRequest) -> ExecuteResponse:
"""Execute a task via the Hermes orchestrator."""
result = await hermes_orchestrator.execute(
profile_id=req.profile_id, profile_id=req.profile_id,
task=req.task, task=req.task,
params=req.params, params=req.params,
user_context=req.user_context,
) )
return result return ExecuteResponse(**result.model_dump())
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
# ── Profiles ─────────────────────────────────────── # ---------------------------------------------------------------------------
# Profiles
# ---------------------------------------------------------------------------
@router.get("/profiles") @router.get("/profiles")
async def list_profiles(): async def list_profiles() -> JSONResponse:
from app.services.hermes_orchestrator import hermes """List all available Hermes profiles."""
profiles = hermes.list_profiles() profiles = await hermes_orchestrator.list_profiles()
return {"profiles": [p.model_dump() if hasattr(p, 'model_dump') else p.__dict__ for p in profiles]} return JSONResponse(content={
"profiles": [p.model_dump() for p in profiles],
"count": len(profiles),
"message_ar": f"{len(profiles)} ملفات شخصية متاحة",
})
@router.get("/profiles/{profile_id}") @router.get("/profiles/{profile_id}")
async def get_profile(profile_id: str): async def get_profile(profile_id: str) -> JSONResponse:
from app.services.hermes_orchestrator import hermes """Get details for a specific profile."""
profile = hermes.get_profile(profile_id) profile = await hermes_orchestrator.get_profile(profile_id)
if not profile: if not profile:
raise HTTPException(status_code=404, detail="الملف الشخصي غير موجود") raise HTTPException(status_code=404, detail=f"الملف الشخصي غير موجود: {profile_id}")
return profile.model_dump() if hasattr(profile, 'model_dump') else profile.__dict__ return JSONResponse(content=profile.model_dump())
# ── Cost & Health ────────────────────────────────── # ---------------------------------------------------------------------------
# Cost
# ---------------------------------------------------------------------------
@router.get("/cost") @router.get("/cost")
async def cost_report(period: str = "daily", profile: Optional[str] = None): async def cost_report(period: str = "daily") -> JSONResponse:
from app.services.observability import observability """Get cost report from the orchestrator and observability service."""
return await observability.get_cost_report(period, profile) hermes_cost = await hermes_orchestrator.get_cost_report(period)
obs_cost = await observability_service.get_cost_report(period)
return JSONResponse(content={
"hermes": hermes_cost,
"observability": obs_cost,
})
# ---------------------------------------------------------------------------
# Health
# ---------------------------------------------------------------------------
@router.get("/health") @router.get("/health")
async def health_report(): async def health_report() -> JSONResponse:
from app.services.observability import observability """System health report across all backends and workflows."""
return await observability.get_health_report() obs_health = await observability_service.get_health_report()
backend_health = await execution_router.get_backend_health()
anomalies = await observability_service.detect_anomalies()
return JSONResponse(content={
"system": obs_health,
"backends": {k: v.model_dump() for k, v in backend_health.items()},
"anomalies": [a.model_dump(mode="json") for a in anomalies],
"anomaly_count": len(anomalies),
})
@router.get("/performance") # ---------------------------------------------------------------------------
async def performance_report(period: str = "daily"): # Runs
from app.services.observability import observability # ---------------------------------------------------------------------------
return await observability.get_performance_report(period)
@router.get("/anomalies")
async def detect_anomalies():
from app.services.observability import observability
return {"anomalies": await observability.detect_anomalies()}
@router.get("/executive-summary")
async def executive_summary(period: str = "weekly"):
from app.services.observability import observability
summary = await observability.get_executive_summary(period)
return {"summary": summary}
# ── Runs ───────────────────────────────────────────
@router.get("/runs") @router.get("/runs")
async def list_active_runs(): async def list_active_runs() -> JSONResponse:
from app.services.hermes_orchestrator import hermes """List currently active runs."""
return {"runs": hermes.get_active_runs()} runs = await hermes_orchestrator.get_active_runs()
return JSONResponse(content={
"active_runs": [r.model_dump(mode="json") for r in runs],
"count": len(runs),
"message_ar": f"{len(runs)} عمليات نشطة حالياً",
})
@router.post("/runs/{run_id}/abort") @router.post("/runs/{run_id}/abort")
async def abort_run(run_id: str): async def abort_run(run_id: str) -> JSONResponse:
from app.services.hermes_orchestrator import hermes """Abort an active run."""
success = hermes.abort_run(run_id) success = await hermes_orchestrator.abort_run(run_id)
if not success: if not success:
raise HTTPException(status_code=404, detail="التشغيل غير موجود") raise HTTPException(status_code=404, detail=f"التشغيل غير موجود أو اكتمل بالفعل: {run_id}")
return {"status": "aborted", "run_id": run_id} return JSONResponse(content={
"run_id": run_id,
"status": "aborted",
"message_ar": f"تم إلغاء التشغيل: {run_id}",
})
# ── Self-Improvement ────────────────────────────── # ---------------------------------------------------------------------------
# Self-improvement
# ---------------------------------------------------------------------------
@router.get("/improvements") @router.get("/improvements")
async def list_improvements(status_filter: Optional[str] = None): async def list_improvements(status: Optional[str] = None) -> JSONResponse:
from app.services.self_improvement import self_improvement, ImprovementStatus """List self-improvement proposals."""
if status_filter: proposals = self_improvement_engine.list_proposals(status)
try: report = await self_improvement_engine.report()
s = ImprovementStatus(status_filter) return JSONResponse(content={
except ValueError: "proposals": [p.model_dump(mode="json") for p in proposals],
raise HTTPException(status_code=400, detail="حالة غير صالحة") "count": len(proposals),
proposals = await self_improvement.get_proposals(s) "summary": report,
else: })
proposals = await self_improvement.get_proposals()
return {"proposals": [p.model_dump() for p in proposals]}
@router.post("/improvements/cycle")
async def run_improvement_cycle():
from app.services.self_improvement import self_improvement
result = await self_improvement.run_cycle()
return result.model_dump()
@router.post("/improvements/{proposal_id}/approve") @router.post("/improvements/{proposal_id}/approve")
async def approve_improvement(proposal_id: str, req: ApproveRequest): async def approve_improvement(proposal_id: str, req: ApproveRequest) -> JSONResponse:
from app.services.self_improvement import self_improvement """Approve a self-improvement proposal."""
success = await self_improvement.apply(proposal_id, req.approved_by) proposal = await self_improvement_engine.approve(proposal_id, req.user_id)
if not success: if not proposal:
raise HTTPException(status_code=404, detail="المقترح غير موجود أو يحتاج موافقة") raise HTTPException(
return {"status": "approved", "proposal_id": proposal_id} status_code=404,
detail=f"المقترح غير موجود أو ليس في حالة 'مقترح': {proposal_id}",
)
return JSONResponse(content={
"proposal_id": proposal.id,
"status": proposal.status,
"approved_by": proposal.approved_by,
"message_ar": f"تمت الموافقة على المقترح: {proposal.title_ar}",
})
@router.post("/improvements/{proposal_id}/reject") @router.post("/improvements/cycle")
async def reject_improvement(proposal_id: str): async def run_improvement_cycle(tenant_id: Optional[str] = None) -> JSONResponse:
from app.services.self_improvement import self_improvement """Trigger a self-improvement cycle."""
success = await self_improvement.reject(proposal_id) cycle = await self_improvement_engine.run_cycle(tenant_id)
if not success: return JSONResponse(content=cycle.model_dump(mode="json"))
raise HTTPException(status_code=404, detail="المقترح غير موجود")
return {"status": "rejected", "proposal_id": proposal_id}
# ── Security (Shannon) ──────────────────────────── # ---------------------------------------------------------------------------
# Shannon security
# ---------------------------------------------------------------------------
@router.get("/security/report") @router.get("/security/report")
async def security_report(severity: Optional[str] = None): async def latest_security_report() -> JSONResponse:
from app.services.shannon_security import shannon """Get the latest Shannon security scan report."""
findings = shannon.get_all_findings(severity) report = shannon_security.get_latest_report()
should_block = await shannon.should_block_release() if not report:
return { return JSONResponse(content={
"findings": [f.model_dump() for f in findings], "report": None,
"total": len(findings), "message_ar": "لا يوجد تقرير أمني بعد. قم بتشغيل فحص أولاً.",
"should_block_release": should_block, })
} return JSONResponse(content=report.model_dump(mode="json"))
@router.post("/security/scan") @router.post("/security/scan")
async def trigger_security_scan(req: ScanRequest): async def trigger_security_scan(req: ScanRequest) -> JSONResponse:
from app.services.shannon_security import shannon, ShannonScope """Trigger a Shannon security scan (staging only)."""
scopes = []
for s in req.scopes:
try: try:
scopes = [ShannonScope(s) for s in req.scopes] if req.scopes else None scopes.append(ShannonScope(s))
report = await shannon.run_scan( except ValueError:
environment=req.environment,
base_url=req.base_url,
scopes=scopes,
)
return report.model_dump()
except PermissionError as e:
raise HTTPException( raise HTTPException(
status_code=403, status_code=400,
detail=f"محظور: {str(e)}" detail=f"نطاق غير صالح: {s}. المتاحة: {[x.value for x in ShannonScope]}",
) )
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e)) report = await shannon_security.run_scan(
environment=req.environment,
scopes=scopes,
base_url=req.base_url,
)
return JSONResponse(content=report.model_dump(mode="json"))
# ── Session Continuity ──────────────────────────── # ---------------------------------------------------------------------------
# Executive summary
# ---------------------------------------------------------------------------
@router.get("/executive-summary")
async def executive_summary(period: str = "weekly") -> JSONResponse:
"""Arabic executive summary for leadership."""
summary = await observability_service.get_executive_summary(period)
improvement_report = await self_improvement_engine.report()
security_report = shannon_security.get_latest_report()
full_summary = summary
if improvement_report.get("total_proposals", 0) > 0:
applied = improvement_report.get("by_status", {}).get("applied", 0)
pending = improvement_report.get("by_status", {}).get("proposed", 0)
full_summary += f"، {applied} تحسين مطبق، {pending} تحسين معلق"
if security_report:
full_summary += (
f"، آخر فحص أمني: {security_report.critical_count} حرجة، "
f"{security_report.high_count} عالية"
)
else:
full_summary += "، لا يوجد فحص أمني بعد"
return JSONResponse(content={
"summary_ar": full_summary,
"period": period,
"improvements": improvement_report,
"security_status": security_report.model_dump(mode="json") if security_report else None,
})
@router.get("/session/restore") # ---------------------------------------------------------------------------
async def restore_session(): # Routing stats
from app.services.session_continuity import session_continuity # ---------------------------------------------------------------------------
prompt = await session_continuity.get_restore_prompt()
return {"restore_prompt": prompt}
@router.get("/routing-stats")
@router.get("/session/state") async def routing_stats() -> JSONResponse:
async def get_session_state(): """Execution routing statistics."""
from app.services.session_continuity import session_continuity stats = await execution_router.get_routing_stats()
state = await session_continuity.restore_state() return JSONResponse(content=stats)
if not state:
return {"state": None}
return {"state": state.model_dump() if hasattr(state, 'model_dump') else state.__dict__}

View File

@ -16,6 +16,7 @@ from app.api.v1 import lead_prospector as prospector_router
from app.api.v1 import pipeline as pipeline_router from app.api.v1 import pipeline as pipeline_router
from app.api.v1 import agent_system as agent_system_router from app.api.v1 import agent_system as agent_system_router
from app.api.v1 import autonomous_foundation as autonomous_foundation_router from app.api.v1 import autonomous_foundation as autonomous_foundation_router
from app.api.v1 import hermes as hermes_router
from app.api.v1 import marketing_hub as marketing_hub_router from app.api.v1 import marketing_hub as marketing_hub_router
from app.api.v1 import strategy_summary as strategy_summary_router from app.api.v1 import strategy_summary as strategy_summary_router
from app.api.v1 import value_proposition as value_proposition_router from app.api.v1 import value_proposition as value_proposition_router
@ -86,3 +87,6 @@ api_router.include_router(pipeline_router.router)
# ── 22-Agent AI System — Full Empire Control ───────────────── # ── 22-Agent AI System — Full Empire Control ─────────────────
api_router.include_router(agent_system_router.router) api_router.include_router(agent_system_router.router)
api_router.include_router(autonomous_foundation_router.router) api_router.include_router(autonomous_foundation_router.router)
# ── Hermes Fusion — Orchestration Layer ──────────────────────
api_router.include_router(hermes_router.router)

View File

@ -1,17 +1,27 @@
""" """
Observability Service Dealix AI Revenue OS Observability Service -- Dealix AI Revenue OS -- خدمة المراقبة
Cost tracking, workflow metrics, health monitoring, and Arabic executive summaries. Track cost, performance, and health across all agent workflows.
Anomaly detection, executive summaries in Arabic.
""" """
from __future__ import annotations
import logging import logging
from datetime import datetime, timezone, timedelta import statistics
from typing import Optional from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
class WorkflowMetric(BaseModel): class WorkflowMetric(BaseModel):
"""Single workflow execution metric."""
workflow_name: str workflow_name: str
profile_id: str profile_id: str
backend: str backend: str
@ -23,170 +33,313 @@ class WorkflowMetric(BaseModel):
error: Optional[str] = None error: Optional[str] = None
class AnomalyAlert(BaseModel):
"""Detected anomaly."""
id: str = ""
anomaly_type: str # cost_spike, failure_spike, latency_spike, regression
description: str
description_ar: str
severity: str = "medium" # critical, high, medium, low
metric_name: str = ""
current_value: float = 0.0
baseline_value: float = 0.0
deviation_pct: float = 0.0
detected_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
# ---------------------------------------------------------------------------
# Service
# ---------------------------------------------------------------------------
class ObservabilityService: class ObservabilityService:
"""Track cost, performance, and health across all agent workflows.""" """Track cost, performance, and health across all agent workflows."""
def __init__(self): def __init__(self) -> None:
self._metrics: list[WorkflowMetric] = [] self._metrics: list[WorkflowMetric] = []
self._max_metrics = 50000 self._max_metrics = 50_000
self._anomalies: list[AnomalyAlert] = []
self._max_anomalies = 1_000
logger.info("خدمة المراقبة: تم التهيئة")
# -- Recording ---------------------------------------------------------
async def record_workflow(self, metric: WorkflowMetric) -> None: async def record_workflow(self, metric: WorkflowMetric) -> None:
"""Store a workflow execution metric."""
self._metrics.append(metric) self._metrics.append(metric)
if len(self._metrics) > self._max_metrics: if len(self._metrics) > self._max_metrics:
self._metrics = self._metrics[-self._max_metrics:] self._metrics = self._metrics[-self._max_metrics:]
logger.debug( logger.debug(
f"Recorded: {metric.workflow_name} " "[Obs] سجل: %s profile=%s backend=%s %dms $%.4f %s",
f"cost=${metric.estimated_cost_usd:.4f} " metric.workflow_name, metric.profile_id, metric.backend,
f"{'OK' if metric.success else 'FAIL'}" metric.duration_ms, metric.estimated_cost_usd,
"OK" if metric.success else "FAIL",
) )
def _filter_by_period( # -- Cost report -------------------------------------------------------
self, period: str, metrics: list[WorkflowMetric] = None
) -> list[WorkflowMetric]:
source = metrics or self._metrics
now = datetime.now(timezone.utc)
if period == "hourly":
cutoff = now - timedelta(hours=1)
elif period == "daily":
cutoff = now - timedelta(days=1)
elif period == "weekly":
cutoff = now - timedelta(weeks=1)
elif period == "monthly":
cutoff = now - timedelta(days=30)
else:
cutoff = now - timedelta(days=1)
return [m for m in source if m.started_at >= cutoff]
async def get_cost_report( async def get_cost_report(
self, period: str = "daily", profile: str = None self, period: str = "daily", profile: Optional[str] = None,
) -> dict: ) -> dict[str, Any]:
filtered = self._filter_by_period(period) """Total cost, cost by profile, cost by backend, cost by workflow."""
if profile: cutoff = self._period_cutoff(period)
filtered = [m for m in filtered if m.profile_id == profile] filtered = [
m for m in self._metrics
if m.started_at >= cutoff and (profile is None or m.profile_id == profile)
]
total_cost = sum(m.estimated_cost_usd for m in filtered) total = sum(m.estimated_cost_usd for m in filtered)
by_profile: dict[str, float] = {} by_profile: dict[str, float] = defaultdict(float)
by_backend: dict[str, float] = {} by_backend: dict[str, float] = defaultdict(float)
by_workflow: dict[str, float] = {} by_workflow: dict[str, float] = defaultdict(float)
for m in filtered: for m in filtered:
by_profile[m.profile_id] = by_profile.get(m.profile_id, 0) + m.estimated_cost_usd by_profile[m.profile_id] += m.estimated_cost_usd
by_backend[m.backend] = by_backend.get(m.backend, 0) + m.estimated_cost_usd by_backend[m.backend] += m.estimated_cost_usd
by_workflow[m.workflow_name] = by_workflow.get(m.workflow_name, 0) + m.estimated_cost_usd by_workflow[m.workflow_name] += m.estimated_cost_usd
top_expensive = sorted(by_workflow.items(), key=lambda x: x[1], reverse=True)[:5] # Sort by cost descending
top_workflows = sorted(by_workflow.items(), key=lambda x: -x[1])[:10]
return { return {
"period": period, "period": period,
"total_cost_usd": round(total_cost, 4), "total_usd": round(total, 4),
"total_workflows": len(filtered),
"by_profile": {k: round(v, 4) for k, v in by_profile.items()}, "by_profile": {k: round(v, 4) for k, v in by_profile.items()},
"by_backend": {k: round(v, 4) for k, v in by_backend.items()}, "by_backend": {k: round(v, 4) for k, v in by_backend.items()},
"top_expensive": [{"name": k, "cost": round(v, 4)} for k, v in top_expensive], "top_workflows": [{"name": n, "cost_usd": round(c, 4)} for n, c in top_workflows],
"total_executions": len(filtered),
"message_ar": f"التكلفة الإجمالية ({period}): ${total:.2f} عبر {len(filtered)} عملية",
} }
async def get_performance_report(self, period: str = "daily") -> dict: # -- Performance report ------------------------------------------------
filtered = self._filter_by_period(period)
async def get_performance_report(self, period: str = "daily") -> dict[str, Any]:
"""Average duration, P95 duration, success rate, error rate."""
cutoff = self._period_cutoff(period)
filtered = [m for m in self._metrics if m.started_at >= cutoff]
if not filtered: if not filtered:
return {"period": period, "total": 0} return {
"period": period, "total_executions": 0,
"message_ar": "لا توجد بيانات لهذه الفترة",
}
durations = [m.duration_ms for m in filtered] durations = [m.duration_ms for m in filtered]
durations.sort() successes = sum(1 for m in filtered if m.success)
total = len(durations) failures = len(filtered) - successes
success_count = sum(1 for m in filtered if m.success)
p95_idx = min(int(total * 0.95), total - 1) avg_ms = statistics.mean(durations)
errors = [m for m in filtered if not m.success] p95_ms = sorted(durations)[int(len(durations) * 0.95)] if len(durations) >= 20 else max(durations)
success_rate = successes / len(filtered)
# Slowest workflows
by_wf: dict[str, list[int]] = defaultdict(list)
for m in filtered:
by_wf[m.workflow_name].append(m.duration_ms)
slowest = sorted(
[(wf, statistics.mean(ds)) for wf, ds in by_wf.items()],
key=lambda x: -x[1],
)[:5]
# Most expensive
cost_wf: dict[str, float] = defaultdict(float)
for m in filtered:
cost_wf[m.workflow_name] += m.estimated_cost_usd
most_expensive = sorted(cost_wf.items(), key=lambda x: -x[1])[:5]
return { return {
"period": period, "period": period,
"total_workflows": total, "total_executions": len(filtered),
"success_rate": round(success_count / total * 100, 1) if total else 0, "avg_duration_ms": round(avg_ms, 2),
"avg_duration_ms": round(sum(durations) / total) if total else 0, "p95_duration_ms": p95_ms,
"p95_duration_ms": durations[p95_idx] if durations else 0, "success_rate": round(success_rate, 4),
"error_count": len(errors), "error_rate": round(1 - success_rate, 4),
"error_rate": round(len(errors) / total * 100, 1) if total else 0, "total_successes": successes,
"recent_errors": [ "total_failures": failures,
{"workflow": e.workflow_name, "error": e.error, "at": e.started_at.isoformat()} "slowest_workflows": [{"name": n, "avg_ms": round(d, 2)} for n, d in slowest],
for e in errors[-5:] "most_expensive": [{"name": n, "cost_usd": round(c, 4)} for n, c in most_expensive],
], "message_ar": (
f"أداء ({period}): {len(filtered)} عملية، "
f"متوسط {avg_ms:.0f}ms، نجاح {success_rate:.0%}، فشل {failures}"
),
} }
async def get_health_report(self) -> dict: # -- Health report -----------------------------------------------------
daily = self._filter_by_period("daily")
total = len(daily)
success = sum(1 for m in daily if m.success)
backends_used = set(m.backend for m in daily) async def get_health_report(self) -> dict[str, Any]:
"""Backend health, skill health, memory health, knowledge health, trust health."""
recent = [m for m in self._metrics if m.started_at >= self._period_cutoff("daily")]
# Backend health
backend_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
for m in recent:
backend_calls[m.backend]["ok" if m.success else "fail"] += 1
backend_health = {} backend_health = {}
for b in backends_used: for b, counts in backend_calls.items():
b_metrics = [m for m in daily if m.backend == b] total = counts["ok"] + counts["fail"]
b_success = sum(1 for m in b_metrics if m.success) rate = counts["ok"] / total if total else 1.0
backend_health[b] = { backend_health[b] = {
"total": len(b_metrics), "total": total, "success_rate": round(rate, 4),
"success_rate": round(b_success / len(b_metrics) * 100, 1) if b_metrics else 0, "healthy": rate >= 0.7 or total < 5,
"avg_duration_ms": round(
sum(m.duration_ms for m in b_metrics) / len(b_metrics)
) if b_metrics else 0,
} }
# Skill health (by workflow)
skill_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
for m in recent:
skill_calls[m.workflow_name]["ok" if m.success else "fail"] += 1
skill_health = {}
for s, counts in skill_calls.items():
total = counts["ok"] + counts["fail"]
rate = counts["ok"] / total if total else 1.0
skill_health[s] = {"total": total, "success_rate": round(rate, 4)}
# Overall
total = len(recent)
overall_ok = sum(1 for m in recent if m.success)
overall_rate = overall_ok / total if total else 1.0
healthy = overall_rate >= 0.8
return { return {
"overall_health": "healthy" if (total == 0 or success / total > 0.9) else "degraded", "overall_healthy": healthy,
"workflows_today": total, "overall_success_rate": round(overall_rate, 4),
"success_rate": round(success / total * 100, 1) if total else 100, "total_today": total,
"total_cost_today_usd": round(sum(m.estimated_cost_usd for m in daily), 4), "backend_health": backend_health,
"backends": backend_health, "skill_health": skill_health,
"anomalies_today": len([
a for a in self._anomalies
if a.detected_at >= self._period_cutoff("daily")
]),
"message_ar": (
f"صحة النظام: {'سليم' if healthy else 'يحتاج انتباه'} -- "
f"نجاح {overall_rate:.0%}، {total} عملية اليوم"
),
} }
async def get_executive_summary(self, period: str = "weekly") -> str: # -- Executive summary -------------------------------------------------
filtered = self._filter_by_period(period)
total = len(filtered)
success = sum(1 for m in filtered if m.success)
cost = sum(m.estimated_cost_usd for m in filtered)
success_rate = round(success / total * 100) if total else 100
period_ar = {"daily": "اليوم", "weekly": "هذا الأسبوع", "monthly": "هذا الشهر"}.get(period, period) async def get_executive_summary(self, period: str = "weekly") -> str:
"""Arabic executive summary for leadership."""
cutoff = self._period_cutoff(period)
filtered = [m for m in self._metrics if m.started_at >= cutoff]
total = len(filtered)
successes = sum(1 for m in filtered if m.success)
rate = successes / total if total else 0
cost = sum(m.estimated_cost_usd for m in filtered)
anomalies = [a for a in self._anomalies if a.detected_at >= cutoff]
critical = sum(1 for a in anomalies if a.severity == "critical")
period_ar = {
"daily": "اليوم", "weekly": "هذا الأسبوع",
"monthly": "هذا الشهر",
}.get(period, period)
summary = ( summary = (
f"📊 ملخص {period_ar}:\n" f"{period_ar}: {total} مهمة منفذة، "
f"{total} مهمة منفذة\n" f"{rate:.0%} نجاح، "
f"{success_rate}% نسبة النجاح\n" f"تكلفة ${cost:.2f}، "
f"• ${cost:.2f} التكلفة الإجمالية\n" f"{len(anomalies)} تنبيه"
) )
if critical:
errors = [m for m in filtered if not m.success] summary += f"، {critical} مشاكل حرجة تحتاج تدخل فوري"
if errors:
summary += f"{len(errors)} خطأ يحتاج مراجعة\n"
else: else:
summary += "• لا أخطاء حرجة ✅\n" summary += "، 0 مشاكل حرجة"
return summary return summary
async def detect_anomalies(self) -> list[dict]: # -- Anomaly detection -------------------------------------------------
anomalies = []
hourly = self._filter_by_period("hourly")
daily = self._filter_by_period("daily")
if hourly: async def detect_anomalies(self) -> list[AnomalyAlert]:
hourly_cost = sum(m.estimated_cost_usd for m in hourly) """Sudden cost spikes, unusual failure patterns, backend degradation."""
if hourly_cost > 5.0: alerts: list[AnomalyAlert] = []
anomalies.append({ now = datetime.now(timezone.utc)
"type": "cost_spike", today = [m for m in self._metrics if m.started_at >= now - timedelta(hours=24)]
"severity": "high", prev_week = [
"message": f"تكلفة الساعة الأخيرة ${hourly_cost:.2f} — أعلى من الحد الطبيعي", m for m in self._metrics
"value": hourly_cost, if now - timedelta(days=8) <= m.started_at < now - timedelta(days=1)
}) ]
hourly_errors = sum(1 for m in hourly if not m.success) if not today or not prev_week:
if len(hourly) > 5 and hourly_errors / len(hourly) > 0.3: return alerts
anomalies.append({
"type": "error_spike",
"severity": "critical",
"message": f"معدل أخطاء مرتفع: {hourly_errors}/{len(hourly)} في الساعة الأخيرة",
"value": hourly_errors,
})
return anomalies # Cost spike detection
today_cost = sum(m.estimated_cost_usd for m in today)
avg_daily_cost = sum(m.estimated_cost_usd for m in prev_week) / 7
if avg_daily_cost > 0 and today_cost > avg_daily_cost * 2:
deviation = ((today_cost - avg_daily_cost) / avg_daily_cost) * 100
alerts.append(AnomalyAlert(
id=f"cost-{now.strftime('%Y%m%d')}",
anomaly_type="cost_spike",
description=f"Today's cost ${today_cost:.2f} is {deviation:.0f}% above daily avg ${avg_daily_cost:.2f}",
description_ar=f"تكلفة اليوم ${today_cost:.2f} أعلى بنسبة {deviation:.0f}% من المتوسط ${avg_daily_cost:.2f}",
severity="high" if deviation > 200 else "medium",
metric_name="daily_cost_usd",
current_value=today_cost,
baseline_value=avg_daily_cost,
deviation_pct=round(deviation, 2),
))
# Failure rate spike
today_fail_rate = sum(1 for m in today if not m.success) / max(len(today), 1)
prev_fail_rate = sum(1 for m in prev_week if not m.success) / max(len(prev_week), 1)
if today_fail_rate > 0.15 and today_fail_rate > prev_fail_rate * 2:
alerts.append(AnomalyAlert(
id=f"fail-{now.strftime('%Y%m%d')}",
anomaly_type="failure_spike",
description=f"Failure rate {today_fail_rate:.1%} vs baseline {prev_fail_rate:.1%}",
description_ar=f"معدل الفشل {today_fail_rate:.1%} مقابل الخط الأساسي {prev_fail_rate:.1%}",
severity="critical" if today_fail_rate > 0.3 else "high",
metric_name="failure_rate",
current_value=today_fail_rate,
baseline_value=prev_fail_rate,
deviation_pct=round(((today_fail_rate - prev_fail_rate) / max(prev_fail_rate, 0.01)) * 100, 2),
))
# Latency spike (per backend)
for backend in {"claude", "openclaude", "goose", "internal"}:
today_b = [m.duration_ms for m in today if m.backend == backend]
prev_b = [m.duration_ms for m in prev_week if m.backend == backend]
if len(today_b) >= 5 and len(prev_b) >= 5:
today_avg = statistics.mean(today_b)
prev_avg = statistics.mean(prev_b)
if prev_avg > 0 and today_avg > prev_avg * 3:
deviation = ((today_avg - prev_avg) / prev_avg) * 100
alerts.append(AnomalyAlert(
id=f"latency-{backend}-{now.strftime('%Y%m%d')}",
anomaly_type="latency_spike",
description=f"{backend} latency {today_avg:.0f}ms vs baseline {prev_avg:.0f}ms",
description_ar=f"زمن استجابة {backend}: {today_avg:.0f}ms مقابل {prev_avg:.0f}ms",
severity="high",
metric_name=f"latency_{backend}",
current_value=today_avg,
baseline_value=prev_avg,
deviation_pct=round(deviation, 2),
))
self._anomalies.extend(alerts)
if len(self._anomalies) > self._max_anomalies:
self._anomalies = self._anomalies[-self._max_anomalies:]
if alerts:
logger.warning("[Obs] %d anomalies detected", len(alerts))
return alerts
# -- Helpers -----------------------------------------------------------
@staticmethod
def _period_cutoff(period: str) -> datetime:
now = datetime.now(timezone.utc)
if period == "daily":
return now - timedelta(hours=24)
if period == "weekly":
return now - timedelta(days=7)
if period == "monthly":
return now - timedelta(days=30)
return now - timedelta(hours=24)
observability = ObservabilityService() # ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
observability_service = ObservabilityService()