system-prompts-and-models-o.../salesflow-saas/backend/app/services/observability.py

"""
Observability Service — Dealix AI Revenue OS
Cost tracking, workflow metrics, health monitoring, and Arabic executive summaries.
"""
import logging
from datetime import datetime, timezone, timedelta
from typing import Optional

from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)


class WorkflowMetric(BaseModel):
    workflow_name: str
    profile_id: str
    backend: str
    started_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    duration_ms: int = 0
    token_count: int = 0
    estimated_cost_usd: float = 0.0
    success: bool = True
    error: Optional[str] = None


class ObservabilityService:
    """Track cost, performance, and health across all agent workflows."""

    def __init__(self):
        self._metrics: list[WorkflowMetric] = []
        self._max_metrics = 50000

    async def record_workflow(self, metric: WorkflowMetric) -> None:
        self._metrics.append(metric)
        if len(self._metrics) > self._max_metrics:
            self._metrics = self._metrics[-self._max_metrics:]
        logger.debug(
            f"Recorded: {metric.workflow_name} "
            f"cost=${metric.estimated_cost_usd:.4f} "
            f"{'OK' if metric.success else 'FAIL'}"
        )

    def _filter_by_period(
        self, period: str, metrics: list[WorkflowMetric] = None
    ) -> list[WorkflowMetric]:
        source = metrics or self._metrics
        now = datetime.now(timezone.utc)
        if period == "hourly":
            cutoff = now - timedelta(hours=1)
        elif period == "daily":
            cutoff = now - timedelta(days=1)
        elif period == "weekly":
            cutoff = now - timedelta(weeks=1)
        elif period == "monthly":
            cutoff = now - timedelta(days=30)
        else:
            cutoff = now - timedelta(days=1)
        return [m for m in source if m.started_at >= cutoff]

    async def get_cost_report(
        self, period: str = "daily", profile: str = None
    ) -> dict:
        filtered = self._filter_by_period(period)
        if profile:
            filtered = [m for m in filtered if m.profile_id == profile]

        total_cost = sum(m.estimated_cost_usd for m in filtered)
        by_profile: dict[str, float] = {}
        by_backend: dict[str, float] = {}
        by_workflow: dict[str, float] = {}

        for m in filtered:
            by_profile[m.profile_id] = by_profile.get(m.profile_id, 0) + m.estimated_cost_usd
            by_backend[m.backend] = by_backend.get(m.backend, 0) + m.estimated_cost_usd
            by_workflow[m.workflow_name] = by_workflow.get(m.workflow_name, 0) + m.estimated_cost_usd

        top_expensive = sorted(by_workflow.items(), key=lambda x: x[1], reverse=True)[:5]

        return {
            "period": period,
            "total_cost_usd": round(total_cost, 4),
            "total_workflows": len(filtered),
            "by_profile": {k: round(v, 4) for k, v in by_profile.items()},
            "by_backend": {k: round(v, 4) for k, v in by_backend.items()},
            "top_expensive": [{"name": k, "cost": round(v, 4)} for k, v in top_expensive],
        }

    async def get_performance_report(self, period: str = "daily") -> dict:
        filtered = self._filter_by_period(period)
        if not filtered:
            return {"period": period, "total": 0}

        durations = [m.duration_ms for m in filtered]
        durations.sort()
        total = len(durations)
        success_count = sum(1 for m in filtered if m.success)

        p95_idx = min(int(total * 0.95), total - 1)
        errors = [m for m in filtered if not m.success]

        return {
            "period": period,
            "total_workflows": total,
            "success_rate": round(success_count / total * 100, 1) if total else 0,
            "avg_duration_ms": round(sum(durations) / total) if total else 0,
            "p95_duration_ms": durations[p95_idx] if durations else 0,
            "error_count": len(errors),
            "error_rate": round(len(errors) / total * 100, 1) if total else 0,
            "recent_errors": [
                {"workflow": e.workflow_name, "error": e.error, "at": e.started_at.isoformat()}
                for e in errors[-5:]
            ],
        }

    async def get_health_report(self) -> dict:
        daily = self._filter_by_period("daily")
        total = len(daily)
        success = sum(1 for m in daily if m.success)

        backends_used = set(m.backend for m in daily)
        backend_health = {}
        for b in backends_used:
            b_metrics = [m for m in daily if m.backend == b]
            b_success = sum(1 for m in b_metrics if m.success)
            backend_health[b] = {
                "total": len(b_metrics),
                "success_rate": round(b_success / len(b_metrics) * 100, 1) if b_metrics else 0,
                "avg_duration_ms": round(
                    sum(m.duration_ms for m in b_metrics) / len(b_metrics)
                ) if b_metrics else 0,
            }

        return {
            "overall_health": "healthy" if (total == 0 or success / total > 0.9) else "degraded",
            "workflows_today": total,
            "success_rate": round(success / total * 100, 1) if total else 100,
            "total_cost_today_usd": round(sum(m.estimated_cost_usd for m in daily), 4),
            "backends": backend_health,
        }

    async def get_executive_summary(self, period: str = "weekly") -> str:
        filtered = self._filter_by_period(period)
        total = len(filtered)
        success = sum(1 for m in filtered if m.success)
        cost = sum(m.estimated_cost_usd for m in filtered)
        success_rate = round(success / total * 100) if total else 100

        period_ar = {"daily": "اليوم", "weekly": "هذا الأسبوع", "monthly": "هذا الشهر"}.get(period, period)

        summary = (
            f"📊 ملخص {period_ar}:\n"
            f"• {total} مهمة منفذة\n"
            f"• {success_rate}% نسبة النجاح\n"
            f"• ${cost:.2f} التكلفة الإجمالية\n"
        )

        errors = [m for m in filtered if not m.success]
        if errors:
            summary += f"• {len(errors)} خطأ يحتاج مراجعة\n"
        else:
            summary += "• لا أخطاء حرجة ✅\n"

        return summary

    async def detect_anomalies(self) -> list[dict]:
        anomalies = []
        hourly = self._filter_by_period("hourly")
        daily = self._filter_by_period("daily")

        if hourly:
            hourly_cost = sum(m.estimated_cost_usd for m in hourly)
            if hourly_cost > 5.0:
                anomalies.append({
                    "type": "cost_spike",
                    "severity": "high",
                    "message": f"تكلفة الساعة الأخيرة ${hourly_cost:.2f} — أعلى من الحد الطبيعي",
                    "value": hourly_cost,
                })

            hourly_errors = sum(1 for m in hourly if not m.success)
            if len(hourly) > 5 and hourly_errors / len(hourly) > 0.3:
                anomalies.append({
                    "type": "error_spike",
                    "severity": "critical",
                    "message": f"معدل أخطاء مرتفع: {hourly_errors}/{len(hourly)} في الساعة الأخيرة",
                    "value": hourly_errors,
                })

        return anomalies


observability = ObservabilityService()