system-prompts-and-models-o.../salesflow-saas/backend/app/services/observability.py
Claude 1cebf54782
feat: Complete Hermes Fusion — execution router, Shannon, self-improvement, observability, API
Hermes Fusion Supreme integration:
- execution_router.py: Agent-level backend routing (Claude/OpenClaude/Goose/Internal)
  with fallback chains, cost estimation, health tracking
- shannon_security.py: Staging-only white-box pentesting lane
  (auth, injection, tenant isolation, PDPL compliance checks)
- self_improvement.py: Bounded inspect→measure→propose→verify→apply cycle
  (max 5 proposals, max 2 auto-applies for trivial fixes)
- observability.py: Cost tracking, performance metrics, health monitoring,
  Arabic executive summaries, anomaly detection
- hermes.py: Full API (execute, profiles, cost, health, improvements,
  security scans, session restore — 18 endpoints)

https://claude.ai/code/session_01LsnvBa7HwF5hs99VZbgLGj
2026-04-11 08:29:09 +00:00

193 lines
7.3 KiB
Python

"""
Observability Service — Dealix AI Revenue OS
Cost tracking, workflow metrics, health monitoring, and Arabic executive summaries.
"""
import logging
from datetime import datetime, timezone, timedelta
from typing import Optional
from pydantic import BaseModel, Field
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _utc_now() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


class WorkflowMetric(BaseModel):
    """A single recorded workflow run: identity, timing, cost, and outcome."""

    # Identity of the run: which workflow, under which profile, on which backend.
    workflow_name: str
    profile_id: str
    backend: str

    # Filled in with the current UTC time when the caller does not supply one.
    started_at: datetime = Field(default_factory=_utc_now)

    # Measurements; each defaults to zero when omitted.
    duration_ms: int = 0
    token_count: int = 0
    estimated_cost_usd: float = 0.0

    # Outcome: a run counts as successful unless stated otherwise, and
    # `error` carries an optional message.
    success: bool = True
    error: Optional[str] = None
class ObservabilityService:
    """Track cost, performance, and health across all agent workflows.

    Metrics are kept in an in-memory list capped at ``max_metrics`` entries;
    when the cap is exceeded the oldest entries are dropped. All reports are
    computed on demand from that list.
    """

    # Look-back window for every recognised reporting period.
    _PERIOD_WINDOWS = {
        "hourly": timedelta(hours=1),
        "daily": timedelta(days=1),
        "weekly": timedelta(weeks=1),
        "monthly": timedelta(days=30),
    }
    # Unrecognised period names fall back to a one-day window.
    _DEFAULT_WINDOW = timedelta(days=1)

    def __init__(
        self,
        max_metrics: int = 50000,
        *,
        cost_spike_usd: float = 5.0,
        error_spike_ratio: float = 0.3,
        error_spike_min_sample: int = 5,
    ):
        """Create an empty metric store.

        Args:
            max_metrics: Maximum number of metrics retained in memory.
            cost_spike_usd: Hourly cost above which ``detect_anomalies``
                reports a ``cost_spike``.
            error_spike_ratio: Hourly failure ratio above which
                ``detect_anomalies`` reports an ``error_spike``.
            error_spike_min_sample: The hourly workflow count must exceed
                this before the failure ratio is evaluated.

        Raises:
            ValueError: If ``max_metrics`` is smaller than 1.
        """
        if max_metrics < 1:
            raise ValueError("max_metrics must be at least 1")
        self._metrics: list["WorkflowMetric"] = []
        self._max_metrics = max_metrics
        self._cost_spike_usd = cost_spike_usd
        self._error_spike_ratio = error_spike_ratio
        self._error_spike_min_sample = error_spike_min_sample

    @staticmethod
    def _as_utc(moment: datetime) -> datetime:
        """Return *moment* as an aware datetime so it can be compared safely.

        NOTE(review): naive timestamps are assumed to already be in UTC
        (the model's default factory produces UTC) — confirm against callers.
        """
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    async def record_workflow(self, metric: "WorkflowMetric") -> None:
        """Append *metric* to the store, dropping the oldest entries past the cap."""
        self._metrics.append(metric)
        overflow = len(self._metrics) - self._max_metrics
        if overflow > 0:
            # Trim in place so the list object itself is preserved.
            del self._metrics[:overflow]
        # Lazy %-args: the message is only built when DEBUG is enabled.
        logger.debug(
            "Recorded: %s cost=$%.4f %s",
            metric.workflow_name,
            metric.estimated_cost_usd,
            "OK" if metric.success else "FAIL",
        )

    def _filter_by_period(
        self, period: str, metrics: Optional[list["WorkflowMetric"]] = None
    ) -> list["WorkflowMetric"]:
        """Return the metrics whose ``started_at`` lies inside the period window.

        Args:
            period: ``"hourly"``, ``"daily"``, ``"weekly"`` or ``"monthly"``;
                any other value is treated like ``"daily"``.
            metrics: Metrics to filter. ``None`` means the whole store; an
                explicitly passed empty list stays empty.
        """
        # `is None` (not truthiness) so that an empty list is not replaced
        # by the full store.
        source = self._metrics if metrics is None else metrics
        window = self._PERIOD_WINDOWS.get(period, self._DEFAULT_WINDOW)
        cutoff = datetime.now(timezone.utc) - window
        return [m for m in source if self._as_utc(m.started_at) >= cutoff]

    async def get_cost_report(
        self, period: str = "daily", profile: Optional[str] = None
    ) -> dict:
        """Summarise estimated cost for *period*, optionally for one profile.

        Returns a dict with the total cost, the number of workflows, cost
        broken down by profile and by backend, and the five most expensive
        workflow names. Monetary values are rounded to four decimals.
        """
        filtered = self._filter_by_period(period)
        if profile:
            filtered = [m for m in filtered if m.profile_id == profile]

        total_cost = 0.0
        by_profile: dict[str, float] = {}
        by_backend: dict[str, float] = {}
        by_workflow: dict[str, float] = {}
        for m in filtered:
            cost = m.estimated_cost_usd
            total_cost += cost
            by_profile[m.profile_id] = by_profile.get(m.profile_id, 0) + cost
            by_backend[m.backend] = by_backend.get(m.backend, 0) + cost
            by_workflow[m.workflow_name] = by_workflow.get(m.workflow_name, 0) + cost

        top_expensive = sorted(
            by_workflow.items(), key=lambda item: item[1], reverse=True
        )[:5]
        return {
            "period": period,
            "total_cost_usd": round(total_cost, 4),
            "total_workflows": len(filtered),
            "by_profile": {k: round(v, 4) for k, v in by_profile.items()},
            "by_backend": {k: round(v, 4) for k, v in by_backend.items()},
            "top_expensive": [
                {"name": k, "cost": round(v, 4)} for k, v in top_expensive
            ],
        }

    async def get_performance_report(self, period: str = "daily") -> dict:
        """Summarise durations and failures for *period*.

        When no metrics fall inside the window, a short dict with zero
        counts is returned. It carries both ``total`` (the original key) and
        ``total_workflows`` (the key used by the populated report).
        """
        filtered = self._filter_by_period(period)
        if not filtered:
            return {"period": period, "total": 0, "total_workflows": 0}

        durations = sorted(m.duration_ms for m in filtered)
        total = len(durations)
        errors = [m for m in filtered if not m.success]
        success_count = total - len(errors)

        # Nearest-rank percentile: rank = ceil(0.95 * total), computed with
        # integers to avoid float rounding. With 20 samples this picks the
        # 19th value rather than the maximum.
        p95_rank = (total * 95 + 99) // 100
        return {
            "period": period,
            "total_workflows": total,
            "success_rate": round(success_count / total * 100, 1),
            "avg_duration_ms": round(sum(durations) / total),
            "p95_duration_ms": durations[p95_rank - 1],
            "error_count": len(errors),
            "error_rate": round(len(errors) / total * 100, 1),
            "recent_errors": [
                {
                    "workflow": e.workflow_name,
                    "error": e.error,
                    "at": e.started_at.isoformat(),
                }
                for e in errors[-5:]
            ],
        }

    async def get_health_report(self) -> dict:
        """Report overall and per-backend health for the last day.

        Overall health is ``"healthy"`` when there were no workflows or when
        more than 90% of them succeeded, and ``"degraded"`` otherwise.
        """
        daily = self._filter_by_period("daily")
        total = len(daily)
        success = sum(1 for m in daily if m.success)

        # Group once instead of rescanning the daily list for every backend.
        per_backend: dict[str, list] = {}
        for m in daily:
            per_backend.setdefault(m.backend, []).append(m)

        backend_health = {}
        for name, runs in per_backend.items():
            ok = sum(1 for m in runs if m.success)
            backend_health[name] = {
                "total": len(runs),
                "success_rate": round(ok / len(runs) * 100, 1),
                "avg_duration_ms": round(
                    sum(m.duration_ms for m in runs) / len(runs)
                ),
            }

        return {
            "overall_health": (
                "healthy" if (total == 0 or success / total > 0.9) else "degraded"
            ),
            "workflows_today": total,
            "success_rate": round(success / total * 100, 1) if total else 100,
            "total_cost_today_usd": round(
                sum(m.estimated_cost_usd for m in daily), 4
            ),
            "backends": backend_health,
        }

    async def get_executive_summary(self, period: str = "weekly") -> str:
        """Build a short Arabic text summary (count, success rate, cost, errors).

        Periods without an Arabic label are shown using the raw period name.
        """
        filtered = self._filter_by_period(period)
        total = len(filtered)
        errors = [m for m in filtered if not m.success]
        success = total - len(errors)
        cost = sum(m.estimated_cost_usd for m in filtered)
        success_rate = round(success / total * 100) if total else 100
        period_ar = {"daily": "اليوم", "weekly": "هذا الأسبوع", "monthly": "هذا الشهر"}.get(period, period)
        summary = (
            f"📊 ملخص {period_ar}:\n"
            f"{total} مهمة منفذة\n"
            f"{success_rate}% نسبة النجاح\n"
            f"• ${cost:.2f} التكلفة الإجمالية\n"
        )
        if errors:
            summary += f"{len(errors)} خطأ يحتاج مراجعة\n"
        else:
            summary += "• لا أخطاء حرجة ✅\n"
        return summary

    async def detect_anomalies(self) -> list[dict]:
        """Return anomaly records for the last hour (cost spike, error spike).

        Each record is a dict with ``type``, ``severity``, an Arabic
        ``message`` and the offending ``value``. An empty list means nothing
        was detected, or there was no activity in the last hour.
        """
        hourly = self._filter_by_period("hourly")
        if not hourly:
            return []

        anomalies = []
        hourly_cost = sum(m.estimated_cost_usd for m in hourly)
        if hourly_cost > self._cost_spike_usd:
            anomalies.append({
                "type": "cost_spike",
                "severity": "high",
                "message": f"تكلفة الساعة الأخيرة ${hourly_cost:.2f} — أعلى من الحد الطبيعي",
                "value": hourly_cost,
            })

        hourly_errors = sum(1 for m in hourly if not m.success)
        # Require a minimum sample so a couple of failures in a quiet hour
        # do not trigger a critical alert.
        if (
            len(hourly) > self._error_spike_min_sample
            and hourly_errors / len(hourly) > self._error_spike_ratio
        ):
            anomalies.append({
                "type": "error_spike",
                "severity": "critical",
                "message": f"معدل أخطاء مرتفع: {hourly_errors}/{len(hourly)} في الساعة الأخيرة",
                "value": hourly_errors,
            })
        return anomalies
# Shared module-level instance, created once when this module is imported.
observability = ObservabilityService()