fix: Enhanced Hermes API, router registration, and observability service

https://claude.ai/code/session_01LsnvBa7HwF5hs99VZbgLGj
This commit is contained in:
Claude 2026-04-11 08:36:50 +00:00
parent 40a48b98c9
commit 85a9c9a23f
No known key found for this signature in database
3 changed files with 498 additions and 270 deletions

View File

@ -1,214 +1,285 @@
"""
Hermes API Dealix AI Revenue OS
Top-level orchestration, profiles, cost, health, improvements, security.
Hermes API -- Dealix AI Revenue OS -- واجهة هيرمس البرمجية
Orchestrator endpoints: execute tasks, manage profiles, view costs,
run security scans, approve improvements, and generate executive summaries.
"""
import logging
from typing import Optional
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
import logging
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from app.services.hermes_orchestrator import hermes_orchestrator, HermesProfile
from app.services.execution_router import execution_router, TaskClass
from app.services.shannon_security import shannon_security, ShannonScope
from app.services.self_improvement import self_improvement_engine
from app.services.observability import observability_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/hermes", tags=["hermes"])
router = APIRouter(prefix="/hermes", tags=["Hermes Orchestrator"])
# ---------------------------------------------------------------------------
# Request / response schemas
# ---------------------------------------------------------------------------
class ExecuteRequest(BaseModel):
profile_id: str = "founder"
task: str
params: Dict[str, Any] = Field(default_factory=dict)
user_context: Dict[str, Any] = Field(default_factory=lambda: {
"user_id": "api_user", "tenant_id": "default", "role": "owner",
})
class ExecuteResponse(BaseModel):
run_id: str
profile_id: str
task: str
params: dict = {}
class ApproveRequest(BaseModel):
approved_by: str
status: str
backend: str = ""
data: Dict[str, Any] = {}
evidence: List[str] = []
receipt_id: Optional[str] = None
cost_usd: float = 0.0
duration_ms: int = 0
error: Optional[str] = None
message_ar: str = ""
class ScanRequest(BaseModel):
environment: str
base_url: str
scopes: list[str] = []
environment: str = "staging"
scopes: List[str] = Field(
default_factory=lambda: ["auth", "injection", "pdpl"],
)
base_url: str = "https://staging.dealix.sa"
# ── Execute ────────────────────────────────────────
class ApproveRequest(BaseModel):
user_id: str = "founder"
@router.post("/execute")
async def execute_task(req: ExecuteRequest):
from app.services.hermes_orchestrator import hermes
try:
result = await hermes.execute(
profile_id=req.profile_id,
task=req.task,
params=req.params,
)
return result
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
# ---------------------------------------------------------------------------
# Execute
# ---------------------------------------------------------------------------
@router.post("/execute", response_model=ExecuteResponse)
async def execute_task(req: ExecuteRequest) -> ExecuteResponse:
"""Execute a task via the Hermes orchestrator."""
result = await hermes_orchestrator.execute(
profile_id=req.profile_id,
task=req.task,
params=req.params,
user_context=req.user_context,
)
return ExecuteResponse(**result.model_dump())
# ── Profiles ───────────────────────────────────────
# ---------------------------------------------------------------------------
# Profiles
# ---------------------------------------------------------------------------
@router.get("/profiles")
async def list_profiles():
from app.services.hermes_orchestrator import hermes
profiles = hermes.list_profiles()
return {"profiles": [p.model_dump() if hasattr(p, 'model_dump') else p.__dict__ for p in profiles]}
async def list_profiles() -> JSONResponse:
"""List all available Hermes profiles."""
profiles = await hermes_orchestrator.list_profiles()
return JSONResponse(content={
"profiles": [p.model_dump() for p in profiles],
"count": len(profiles),
"message_ar": f"{len(profiles)} ملفات شخصية متاحة",
})
@router.get("/profiles/{profile_id}")
async def get_profile(profile_id: str):
from app.services.hermes_orchestrator import hermes
profile = hermes.get_profile(profile_id)
async def get_profile(profile_id: str) -> JSONResponse:
"""Get details for a specific profile."""
profile = await hermes_orchestrator.get_profile(profile_id)
if not profile:
raise HTTPException(status_code=404, detail="الملف الشخصي غير موجود")
return profile.model_dump() if hasattr(profile, 'model_dump') else profile.__dict__
raise HTTPException(status_code=404, detail=f"الملف الشخصي غير موجود: {profile_id}")
return JSONResponse(content=profile.model_dump())
# ── Cost & Health ──────────────────────────────────
# ---------------------------------------------------------------------------
# Cost
# ---------------------------------------------------------------------------
@router.get("/cost")
async def cost_report(period: str = "daily", profile: Optional[str] = None):
from app.services.observability import observability
return await observability.get_cost_report(period, profile)
async def cost_report(period: str = "daily") -> JSONResponse:
"""Get cost report from the orchestrator and observability service."""
hermes_cost = await hermes_orchestrator.get_cost_report(period)
obs_cost = await observability_service.get_cost_report(period)
return JSONResponse(content={
"hermes": hermes_cost,
"observability": obs_cost,
})
# ---------------------------------------------------------------------------
# Health
# ---------------------------------------------------------------------------
@router.get("/health")
async def health_report():
from app.services.observability import observability
return await observability.get_health_report()
async def health_report() -> JSONResponse:
"""System health report across all backends and workflows."""
obs_health = await observability_service.get_health_report()
backend_health = await execution_router.get_backend_health()
anomalies = await observability_service.detect_anomalies()
return JSONResponse(content={
"system": obs_health,
"backends": {k: v.model_dump() for k, v in backend_health.items()},
"anomalies": [a.model_dump(mode="json") for a in anomalies],
"anomaly_count": len(anomalies),
})
@router.get("/performance")
async def performance_report(period: str = "daily"):
from app.services.observability import observability
return await observability.get_performance_report(period)
@router.get("/anomalies")
async def detect_anomalies():
from app.services.observability import observability
return {"anomalies": await observability.detect_anomalies()}
@router.get("/executive-summary")
async def executive_summary(period: str = "weekly"):
from app.services.observability import observability
summary = await observability.get_executive_summary(period)
return {"summary": summary}
# ── Runs ───────────────────────────────────────────
# ---------------------------------------------------------------------------
# Runs
# ---------------------------------------------------------------------------
@router.get("/runs")
async def list_active_runs():
from app.services.hermes_orchestrator import hermes
return {"runs": hermes.get_active_runs()}
async def list_active_runs() -> JSONResponse:
"""List currently active runs."""
runs = await hermes_orchestrator.get_active_runs()
return JSONResponse(content={
"active_runs": [r.model_dump(mode="json") for r in runs],
"count": len(runs),
"message_ar": f"{len(runs)} عمليات نشطة حالياً",
})
@router.post("/runs/{run_id}/abort")
async def abort_run(run_id: str):
from app.services.hermes_orchestrator import hermes
success = hermes.abort_run(run_id)
async def abort_run(run_id: str) -> JSONResponse:
"""Abort an active run."""
success = await hermes_orchestrator.abort_run(run_id)
if not success:
raise HTTPException(status_code=404, detail="التشغيل غير موجود")
return {"status": "aborted", "run_id": run_id}
raise HTTPException(status_code=404, detail=f"التشغيل غير موجود أو اكتمل بالفعل: {run_id}")
return JSONResponse(content={
"run_id": run_id,
"status": "aborted",
"message_ar": f"تم إلغاء التشغيل: {run_id}",
})
# ── Self-Improvement ──────────────────────────────
# ---------------------------------------------------------------------------
# Self-improvement
# ---------------------------------------------------------------------------
@router.get("/improvements")
async def list_improvements(status_filter: Optional[str] = None):
from app.services.self_improvement import self_improvement, ImprovementStatus
if status_filter:
try:
s = ImprovementStatus(status_filter)
except ValueError:
raise HTTPException(status_code=400, detail="حالة غير صالحة")
proposals = await self_improvement.get_proposals(s)
else:
proposals = await self_improvement.get_proposals()
return {"proposals": [p.model_dump() for p in proposals]}
@router.post("/improvements/cycle")
async def run_improvement_cycle():
from app.services.self_improvement import self_improvement
result = await self_improvement.run_cycle()
return result.model_dump()
async def list_improvements(status: Optional[str] = None) -> JSONResponse:
"""List self-improvement proposals."""
proposals = self_improvement_engine.list_proposals(status)
report = await self_improvement_engine.report()
return JSONResponse(content={
"proposals": [p.model_dump(mode="json") for p in proposals],
"count": len(proposals),
"summary": report,
})
@router.post("/improvements/{proposal_id}/approve")
async def approve_improvement(proposal_id: str, req: ApproveRequest):
from app.services.self_improvement import self_improvement
success = await self_improvement.apply(proposal_id, req.approved_by)
if not success:
raise HTTPException(status_code=404, detail="المقترح غير موجود أو يحتاج موافقة")
return {"status": "approved", "proposal_id": proposal_id}
async def approve_improvement(proposal_id: str, req: ApproveRequest) -> JSONResponse:
"""Approve a self-improvement proposal."""
proposal = await self_improvement_engine.approve(proposal_id, req.user_id)
if not proposal:
raise HTTPException(
status_code=404,
detail=f"المقترح غير موجود أو ليس في حالة 'مقترح': {proposal_id}",
)
return JSONResponse(content={
"proposal_id": proposal.id,
"status": proposal.status,
"approved_by": proposal.approved_by,
"message_ar": f"تمت الموافقة على المقترح: {proposal.title_ar}",
})
@router.post("/improvements/{proposal_id}/reject")
async def reject_improvement(proposal_id: str):
from app.services.self_improvement import self_improvement
success = await self_improvement.reject(proposal_id)
if not success:
raise HTTPException(status_code=404, detail="المقترح غير موجود")
return {"status": "rejected", "proposal_id": proposal_id}
@router.post("/improvements/cycle")
async def run_improvement_cycle(tenant_id: Optional[str] = None) -> JSONResponse:
"""Trigger a self-improvement cycle."""
cycle = await self_improvement_engine.run_cycle(tenant_id)
return JSONResponse(content=cycle.model_dump(mode="json"))
# ── Security (Shannon) ────────────────────────────
# ---------------------------------------------------------------------------
# Shannon security
# ---------------------------------------------------------------------------
@router.get("/security/report")
async def security_report(severity: Optional[str] = None):
from app.services.shannon_security import shannon
findings = shannon.get_all_findings(severity)
should_block = await shannon.should_block_release()
return {
"findings": [f.model_dump() for f in findings],
"total": len(findings),
"should_block_release": should_block,
}
async def latest_security_report() -> JSONResponse:
"""Get the latest Shannon security scan report."""
report = shannon_security.get_latest_report()
if not report:
return JSONResponse(content={
"report": None,
"message_ar": "لا يوجد تقرير أمني بعد. قم بتشغيل فحص أولاً.",
})
return JSONResponse(content=report.model_dump(mode="json"))
@router.post("/security/scan")
async def trigger_security_scan(req: ScanRequest):
from app.services.shannon_security import shannon, ShannonScope
try:
scopes = [ShannonScope(s) for s in req.scopes] if req.scopes else None
report = await shannon.run_scan(
environment=req.environment,
base_url=req.base_url,
scopes=scopes,
async def trigger_security_scan(req: ScanRequest) -> JSONResponse:
"""Trigger a Shannon security scan (staging only)."""
scopes = []
for s in req.scopes:
try:
scopes.append(ShannonScope(s))
except ValueError:
raise HTTPException(
status_code=400,
detail=f"نطاق غير صالح: {s}. المتاحة: {[x.value for x in ShannonScope]}",
)
report = await shannon_security.run_scan(
environment=req.environment,
scopes=scopes,
base_url=req.base_url,
)
return JSONResponse(content=report.model_dump(mode="json"))
# ---------------------------------------------------------------------------
# Executive summary
# ---------------------------------------------------------------------------
@router.get("/executive-summary")
async def executive_summary(period: str = "weekly") -> JSONResponse:
"""Arabic executive summary for leadership."""
summary = await observability_service.get_executive_summary(period)
improvement_report = await self_improvement_engine.report()
security_report = shannon_security.get_latest_report()
full_summary = summary
if improvement_report.get("total_proposals", 0) > 0:
applied = improvement_report.get("by_status", {}).get("applied", 0)
pending = improvement_report.get("by_status", {}).get("proposed", 0)
full_summary += f"، {applied} تحسين مطبق، {pending} تحسين معلق"
if security_report:
full_summary += (
f"، آخر فحص أمني: {security_report.critical_count} حرجة، "
f"{security_report.high_count} عالية"
)
return report.model_dump()
except PermissionError as e:
raise HTTPException(
status_code=403,
detail=f"محظور: {str(e)}"
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
else:
full_summary += "، لا يوجد فحص أمني بعد"
return JSONResponse(content={
"summary_ar": full_summary,
"period": period,
"improvements": improvement_report,
"security_status": security_report.model_dump(mode="json") if security_report else None,
})
# ── Session Continuity ────────────────────────────
# ---------------------------------------------------------------------------
# Routing stats
# ---------------------------------------------------------------------------
@router.get("/session/restore")
async def restore_session():
from app.services.session_continuity import session_continuity
prompt = await session_continuity.get_restore_prompt()
return {"restore_prompt": prompt}
@router.get("/session/state")
async def get_session_state():
from app.services.session_continuity import session_continuity
state = await session_continuity.restore_state()
if not state:
return {"state": None}
return {"state": state.model_dump() if hasattr(state, 'model_dump') else state.__dict__}
@router.get("/routing-stats")
async def routing_stats() -> JSONResponse:
"""Execution routing statistics."""
stats = await execution_router.get_routing_stats()
return JSONResponse(content=stats)

View File

@ -16,6 +16,7 @@ from app.api.v1 import lead_prospector as prospector_router
from app.api.v1 import pipeline as pipeline_router
from app.api.v1 import agent_system as agent_system_router
from app.api.v1 import autonomous_foundation as autonomous_foundation_router
from app.api.v1 import hermes as hermes_router
from app.api.v1 import marketing_hub as marketing_hub_router
from app.api.v1 import strategy_summary as strategy_summary_router
from app.api.v1 import value_proposition as value_proposition_router
@ -86,3 +87,6 @@ api_router.include_router(pipeline_router.router)
# ── 22-Agent AI System — Full Empire Control ─────────────────
api_router.include_router(agent_system_router.router)
api_router.include_router(autonomous_foundation_router.router)
# ── Hermes Fusion — Orchestration Layer ──────────────────────
api_router.include_router(hermes_router.router)

View File

@ -1,17 +1,27 @@
"""
Observability Service Dealix AI Revenue OS
Cost tracking, workflow metrics, health monitoring, and Arabic executive summaries.
Observability Service -- Dealix AI Revenue OS -- خدمة المراقبة
Track cost, performance, and health across all agent workflows.
Anomaly detection, executive summaries in Arabic.
"""
from __future__ import annotations
import logging
from datetime import datetime, timezone, timedelta
from typing import Optional
import statistics
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Optional
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
class WorkflowMetric(BaseModel):
"""Single workflow execution metric."""
workflow_name: str
profile_id: str
backend: str
@ -23,170 +33,313 @@ class WorkflowMetric(BaseModel):
error: Optional[str] = None
class AnomalyAlert(BaseModel):
"""Detected anomaly."""
id: str = ""
anomaly_type: str # cost_spike, failure_spike, latency_spike, regression
description: str
description_ar: str
severity: str = "medium" # critical, high, medium, low
metric_name: str = ""
current_value: float = 0.0
baseline_value: float = 0.0
deviation_pct: float = 0.0
detected_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
# ---------------------------------------------------------------------------
# Service
# ---------------------------------------------------------------------------
class ObservabilityService:
"""Track cost, performance, and health across all agent workflows."""
def __init__(self):
def __init__(self) -> None:
self._metrics: list[WorkflowMetric] = []
self._max_metrics = 50000
self._max_metrics = 50_000
self._anomalies: list[AnomalyAlert] = []
self._max_anomalies = 1_000
logger.info("خدمة المراقبة: تم التهيئة")
# -- Recording ---------------------------------------------------------
async def record_workflow(self, metric: WorkflowMetric) -> None:
"""Store a workflow execution metric."""
self._metrics.append(metric)
if len(self._metrics) > self._max_metrics:
self._metrics = self._metrics[-self._max_metrics:]
logger.debug(
f"Recorded: {metric.workflow_name} "
f"cost=${metric.estimated_cost_usd:.4f} "
f"{'OK' if metric.success else 'FAIL'}"
"[Obs] سجل: %s profile=%s backend=%s %dms $%.4f %s",
metric.workflow_name, metric.profile_id, metric.backend,
metric.duration_ms, metric.estimated_cost_usd,
"OK" if metric.success else "FAIL",
)
def _filter_by_period(
self, period: str, metrics: list[WorkflowMetric] = None
) -> list[WorkflowMetric]:
source = metrics or self._metrics
now = datetime.now(timezone.utc)
if period == "hourly":
cutoff = now - timedelta(hours=1)
elif period == "daily":
cutoff = now - timedelta(days=1)
elif period == "weekly":
cutoff = now - timedelta(weeks=1)
elif period == "monthly":
cutoff = now - timedelta(days=30)
else:
cutoff = now - timedelta(days=1)
return [m for m in source if m.started_at >= cutoff]
# -- Cost report -------------------------------------------------------
async def get_cost_report(
self, period: str = "daily", profile: str = None
) -> dict:
filtered = self._filter_by_period(period)
if profile:
filtered = [m for m in filtered if m.profile_id == profile]
self, period: str = "daily", profile: Optional[str] = None,
) -> dict[str, Any]:
"""Total cost, cost by profile, cost by backend, cost by workflow."""
cutoff = self._period_cutoff(period)
filtered = [
m for m in self._metrics
if m.started_at >= cutoff and (profile is None or m.profile_id == profile)
]
total_cost = sum(m.estimated_cost_usd for m in filtered)
by_profile: dict[str, float] = {}
by_backend: dict[str, float] = {}
by_workflow: dict[str, float] = {}
total = sum(m.estimated_cost_usd for m in filtered)
by_profile: dict[str, float] = defaultdict(float)
by_backend: dict[str, float] = defaultdict(float)
by_workflow: dict[str, float] = defaultdict(float)
for m in filtered:
by_profile[m.profile_id] = by_profile.get(m.profile_id, 0) + m.estimated_cost_usd
by_backend[m.backend] = by_backend.get(m.backend, 0) + m.estimated_cost_usd
by_workflow[m.workflow_name] = by_workflow.get(m.workflow_name, 0) + m.estimated_cost_usd
by_profile[m.profile_id] += m.estimated_cost_usd
by_backend[m.backend] += m.estimated_cost_usd
by_workflow[m.workflow_name] += m.estimated_cost_usd
top_expensive = sorted(by_workflow.items(), key=lambda x: x[1], reverse=True)[:5]
# Sort by cost descending
top_workflows = sorted(by_workflow.items(), key=lambda x: -x[1])[:10]
return {
"period": period,
"total_cost_usd": round(total_cost, 4),
"total_workflows": len(filtered),
"total_usd": round(total, 4),
"by_profile": {k: round(v, 4) for k, v in by_profile.items()},
"by_backend": {k: round(v, 4) for k, v in by_backend.items()},
"top_expensive": [{"name": k, "cost": round(v, 4)} for k, v in top_expensive],
"top_workflows": [{"name": n, "cost_usd": round(c, 4)} for n, c in top_workflows],
"total_executions": len(filtered),
"message_ar": f"التكلفة الإجمالية ({period}): ${total:.2f} عبر {len(filtered)} عملية",
}
async def get_performance_report(self, period: str = "daily") -> dict:
filtered = self._filter_by_period(period)
# -- Performance report ------------------------------------------------
async def get_performance_report(self, period: str = "daily") -> dict[str, Any]:
"""Average duration, P95 duration, success rate, error rate."""
cutoff = self._period_cutoff(period)
filtered = [m for m in self._metrics if m.started_at >= cutoff]
if not filtered:
return {"period": period, "total": 0}
return {
"period": period, "total_executions": 0,
"message_ar": "لا توجد بيانات لهذه الفترة",
}
durations = [m.duration_ms for m in filtered]
durations.sort()
total = len(durations)
success_count = sum(1 for m in filtered if m.success)
successes = sum(1 for m in filtered if m.success)
failures = len(filtered) - successes
p95_idx = min(int(total * 0.95), total - 1)
errors = [m for m in filtered if not m.success]
avg_ms = statistics.mean(durations)
p95_ms = sorted(durations)[int(len(durations) * 0.95)] if len(durations) >= 20 else max(durations)
success_rate = successes / len(filtered)
# Slowest workflows
by_wf: dict[str, list[int]] = defaultdict(list)
for m in filtered:
by_wf[m.workflow_name].append(m.duration_ms)
slowest = sorted(
[(wf, statistics.mean(ds)) for wf, ds in by_wf.items()],
key=lambda x: -x[1],
)[:5]
# Most expensive
cost_wf: dict[str, float] = defaultdict(float)
for m in filtered:
cost_wf[m.workflow_name] += m.estimated_cost_usd
most_expensive = sorted(cost_wf.items(), key=lambda x: -x[1])[:5]
return {
"period": period,
"total_workflows": total,
"success_rate": round(success_count / total * 100, 1) if total else 0,
"avg_duration_ms": round(sum(durations) / total) if total else 0,
"p95_duration_ms": durations[p95_idx] if durations else 0,
"error_count": len(errors),
"error_rate": round(len(errors) / total * 100, 1) if total else 0,
"recent_errors": [
{"workflow": e.workflow_name, "error": e.error, "at": e.started_at.isoformat()}
for e in errors[-5:]
],
"total_executions": len(filtered),
"avg_duration_ms": round(avg_ms, 2),
"p95_duration_ms": p95_ms,
"success_rate": round(success_rate, 4),
"error_rate": round(1 - success_rate, 4),
"total_successes": successes,
"total_failures": failures,
"slowest_workflows": [{"name": n, "avg_ms": round(d, 2)} for n, d in slowest],
"most_expensive": [{"name": n, "cost_usd": round(c, 4)} for n, c in most_expensive],
"message_ar": (
f"أداء ({period}): {len(filtered)} عملية، "
f"متوسط {avg_ms:.0f}ms، نجاح {success_rate:.0%}، فشل {failures}"
),
}
async def get_health_report(self) -> dict:
daily = self._filter_by_period("daily")
total = len(daily)
success = sum(1 for m in daily if m.success)
# -- Health report -----------------------------------------------------
backends_used = set(m.backend for m in daily)
async def get_health_report(self) -> dict[str, Any]:
"""Backend health, skill health, memory health, knowledge health, trust health."""
recent = [m for m in self._metrics if m.started_at >= self._period_cutoff("daily")]
# Backend health
backend_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
for m in recent:
backend_calls[m.backend]["ok" if m.success else "fail"] += 1
backend_health = {}
for b in backends_used:
b_metrics = [m for m in daily if m.backend == b]
b_success = sum(1 for m in b_metrics if m.success)
for b, counts in backend_calls.items():
total = counts["ok"] + counts["fail"]
rate = counts["ok"] / total if total else 1.0
backend_health[b] = {
"total": len(b_metrics),
"success_rate": round(b_success / len(b_metrics) * 100, 1) if b_metrics else 0,
"avg_duration_ms": round(
sum(m.duration_ms for m in b_metrics) / len(b_metrics)
) if b_metrics else 0,
"total": total, "success_rate": round(rate, 4),
"healthy": rate >= 0.7 or total < 5,
}
# Skill health (by workflow)
skill_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
for m in recent:
skill_calls[m.workflow_name]["ok" if m.success else "fail"] += 1
skill_health = {}
for s, counts in skill_calls.items():
total = counts["ok"] + counts["fail"]
rate = counts["ok"] / total if total else 1.0
skill_health[s] = {"total": total, "success_rate": round(rate, 4)}
# Overall
total = len(recent)
overall_ok = sum(1 for m in recent if m.success)
overall_rate = overall_ok / total if total else 1.0
healthy = overall_rate >= 0.8
return {
"overall_health": "healthy" if (total == 0 or success / total > 0.9) else "degraded",
"workflows_today": total,
"success_rate": round(success / total * 100, 1) if total else 100,
"total_cost_today_usd": round(sum(m.estimated_cost_usd for m in daily), 4),
"backends": backend_health,
"overall_healthy": healthy,
"overall_success_rate": round(overall_rate, 4),
"total_today": total,
"backend_health": backend_health,
"skill_health": skill_health,
"anomalies_today": len([
a for a in self._anomalies
if a.detected_at >= self._period_cutoff("daily")
]),
"message_ar": (
f"صحة النظام: {'سليم' if healthy else 'يحتاج انتباه'} -- "
f"نجاح {overall_rate:.0%}، {total} عملية اليوم"
),
}
async def get_executive_summary(self, period: str = "weekly") -> str:
filtered = self._filter_by_period(period)
total = len(filtered)
success = sum(1 for m in filtered if m.success)
cost = sum(m.estimated_cost_usd for m in filtered)
success_rate = round(success / total * 100) if total else 100
# -- Executive summary -------------------------------------------------
period_ar = {"daily": "اليوم", "weekly": "هذا الأسبوع", "monthly": "هذا الشهر"}.get(period, period)
async def get_executive_summary(self, period: str = "weekly") -> str:
"""Arabic executive summary for leadership."""
cutoff = self._period_cutoff(period)
filtered = [m for m in self._metrics if m.started_at >= cutoff]
total = len(filtered)
successes = sum(1 for m in filtered if m.success)
rate = successes / total if total else 0
cost = sum(m.estimated_cost_usd for m in filtered)
anomalies = [a for a in self._anomalies if a.detected_at >= cutoff]
critical = sum(1 for a in anomalies if a.severity == "critical")
period_ar = {
"daily": "اليوم", "weekly": "هذا الأسبوع",
"monthly": "هذا الشهر",
}.get(period, period)
summary = (
f"📊 ملخص {period_ar}:\n"
f"{total} مهمة منفذة\n"
f"{success_rate}% نسبة النجاح\n"
f"• ${cost:.2f} التكلفة الإجمالية\n"
f"{period_ar}: {total} مهمة منفذة، "
f"{rate:.0%} نجاح، "
f"تكلفة ${cost:.2f}، "
f"{len(anomalies)} تنبيه"
)
errors = [m for m in filtered if not m.success]
if errors:
summary += f"{len(errors)} خطأ يحتاج مراجعة\n"
if critical:
summary += f"، {critical} مشاكل حرجة تحتاج تدخل فوري"
else:
summary += "• لا أخطاء حرجة ✅\n"
summary += "، 0 مشاكل حرجة"
return summary
async def detect_anomalies(self) -> list[dict]:
anomalies = []
hourly = self._filter_by_period("hourly")
daily = self._filter_by_period("daily")
# -- Anomaly detection -------------------------------------------------
if hourly:
hourly_cost = sum(m.estimated_cost_usd for m in hourly)
if hourly_cost > 5.0:
anomalies.append({
"type": "cost_spike",
"severity": "high",
"message": f"تكلفة الساعة الأخيرة ${hourly_cost:.2f} — أعلى من الحد الطبيعي",
"value": hourly_cost,
})
async def detect_anomalies(self) -> list[AnomalyAlert]:
"""Sudden cost spikes, unusual failure patterns, backend degradation."""
alerts: list[AnomalyAlert] = []
now = datetime.now(timezone.utc)
today = [m for m in self._metrics if m.started_at >= now - timedelta(hours=24)]
prev_week = [
m for m in self._metrics
if now - timedelta(days=8) <= m.started_at < now - timedelta(days=1)
]
hourly_errors = sum(1 for m in hourly if not m.success)
if len(hourly) > 5 and hourly_errors / len(hourly) > 0.3:
anomalies.append({
"type": "error_spike",
"severity": "critical",
"message": f"معدل أخطاء مرتفع: {hourly_errors}/{len(hourly)} في الساعة الأخيرة",
"value": hourly_errors,
})
if not today or not prev_week:
return alerts
return anomalies
# Cost spike detection
today_cost = sum(m.estimated_cost_usd for m in today)
avg_daily_cost = sum(m.estimated_cost_usd for m in prev_week) / 7
if avg_daily_cost > 0 and today_cost > avg_daily_cost * 2:
deviation = ((today_cost - avg_daily_cost) / avg_daily_cost) * 100
alerts.append(AnomalyAlert(
id=f"cost-{now.strftime('%Y%m%d')}",
anomaly_type="cost_spike",
description=f"Today's cost ${today_cost:.2f} is {deviation:.0f}% above daily avg ${avg_daily_cost:.2f}",
description_ar=f"تكلفة اليوم ${today_cost:.2f} أعلى بنسبة {deviation:.0f}% من المتوسط ${avg_daily_cost:.2f}",
severity="high" if deviation > 200 else "medium",
metric_name="daily_cost_usd",
current_value=today_cost,
baseline_value=avg_daily_cost,
deviation_pct=round(deviation, 2),
))
# Failure rate spike
today_fail_rate = sum(1 for m in today if not m.success) / max(len(today), 1)
prev_fail_rate = sum(1 for m in prev_week if not m.success) / max(len(prev_week), 1)
if today_fail_rate > 0.15 and today_fail_rate > prev_fail_rate * 2:
alerts.append(AnomalyAlert(
id=f"fail-{now.strftime('%Y%m%d')}",
anomaly_type="failure_spike",
description=f"Failure rate {today_fail_rate:.1%} vs baseline {prev_fail_rate:.1%}",
description_ar=f"معدل الفشل {today_fail_rate:.1%} مقابل الخط الأساسي {prev_fail_rate:.1%}",
severity="critical" if today_fail_rate > 0.3 else "high",
metric_name="failure_rate",
current_value=today_fail_rate,
baseline_value=prev_fail_rate,
deviation_pct=round(((today_fail_rate - prev_fail_rate) / max(prev_fail_rate, 0.01)) * 100, 2),
))
# Latency spike (per backend)
for backend in {"claude", "openclaude", "goose", "internal"}:
today_b = [m.duration_ms for m in today if m.backend == backend]
prev_b = [m.duration_ms for m in prev_week if m.backend == backend]
if len(today_b) >= 5 and len(prev_b) >= 5:
today_avg = statistics.mean(today_b)
prev_avg = statistics.mean(prev_b)
if prev_avg > 0 and today_avg > prev_avg * 3:
deviation = ((today_avg - prev_avg) / prev_avg) * 100
alerts.append(AnomalyAlert(
id=f"latency-{backend}-{now.strftime('%Y%m%d')}",
anomaly_type="latency_spike",
description=f"{backend} latency {today_avg:.0f}ms vs baseline {prev_avg:.0f}ms",
description_ar=f"زمن استجابة {backend}: {today_avg:.0f}ms مقابل {prev_avg:.0f}ms",
severity="high",
metric_name=f"latency_{backend}",
current_value=today_avg,
baseline_value=prev_avg,
deviation_pct=round(deviation, 2),
))
self._anomalies.extend(alerts)
if len(self._anomalies) > self._max_anomalies:
self._anomalies = self._anomalies[-self._max_anomalies:]
if alerts:
logger.warning("[Obs] %d anomalies detected", len(alerts))
return alerts
# -- Helpers -----------------------------------------------------------
@staticmethod
def _period_cutoff(period: str) -> datetime:
now = datetime.now(timezone.utc)
if period == "daily":
return now - timedelta(hours=24)
if period == "weekly":
return now - timedelta(days=7)
if period == "monthly":
return now - timedelta(days=30)
return now - timedelta(hours=24)
observability = ObservabilityService()
# ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
observability_service = ObservabilityService()