system-prompts-and-models-o.../salesflow-saas/backend/app/services/sla_escalation_alerts.py

"""
Approval SLA: auto-escalation metadata on pending rows + breach alerts (webhook / Slack).

Persists escalation state under ApprovalRequest.payload["_dealix_sla"].
Aggregated breach notifications use a per-tenant cooldown to avoid spam.
"""

from __future__ import annotations

import logging
from collections import Counter
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
from uuid import UUID

import httpx
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm.attributes import flag_modified

from app.config import get_settings
from app.models.operations import ApprovalRequest
from app.services.operations_hub import emit_domain_event

logger = logging.getLogger(__name__)

SLA_KEY = "_dealix_sla"

# tenant_id -> last aggregate breach alert (UTC)
_last_aggregate_breach_alert: Dict[str, datetime] = {}


def _hours_between(now: datetime, then: Optional[datetime]) -> float:
    if not then:
        return 0.0
    return max(0.0, (now - then).total_seconds() / 3600.0)


def _escalation_level(age_h: float, warn_h: int, breach_h: int, l3_mult: float) -> int:
    if age_h < warn_h:
        return 0
    if age_h < breach_h:
        return 1
    breach_m = max(float(breach_h), float(warn_h))
    if age_h < breach_m * max(l3_mult, 1.01):
        return 2
    return 3


def _level_label_ar(level: int) -> str:
    return {
        0: "ضمن المهلة",
        1: "تحذير — يقترب من تجاوز SLA",
        2: "تجاوز SLA — يتطلب اهتماماً فورياً",
        3: "تصعيد حرج — تدخل المالك/الإدارة",
    }.get(level, "غير معروف")


async def refresh_pending_escalations(db: AsyncSession, tenant_id: UUID) -> Dict[str, Any]:
    """
    Recompute SLA escalation state for every pending approval of a tenant.

    Each pending row gets its ``payload["_dealix_sla"]`` block rewritten with
    the current level, label, age and thresholds. An
    ``approval.sla_escalated`` domain event is emitted for each row whose
    level went up compared to the stored one. This function does not commit.

    Returns a dict with ``pending_escalation_total`` (number of pending rows),
    ``by_level`` (row counts keyed by "0".."3") and ``events_emitted``.
    """
    settings = get_settings()
    now = datetime.now(timezone.utc)
    stamp = now.isoformat()

    # Clamp the configuration so that 1 <= warn <= breach and the L3 multiplier > 1.
    warn_h = max(1, int(settings.OPENCLAW_APPROVAL_SLA_HOURS_WARN))
    breach_h = max(warn_h, int(settings.OPENCLAW_APPROVAL_SLA_HOURS_BREACH))
    l3_mult = max(1.01, float(settings.OPENCLAW_APPROVAL_ESCALATION_L3_MULTIPLIER))

    stmt = select(ApprovalRequest).where(
        ApprovalRequest.tenant_id == tenant_id,
        ApprovalRequest.status == "pending",
    )
    result = await db.execute(stmt)
    pending: List[ApprovalRequest] = list(result.scalars().all())

    level_counts: Counter[int] = Counter()
    events_emitted = 0

    for approval in pending:
        age_h = _hours_between(now, approval.created_at)
        age_rounded = round(age_h, 2)
        level = _escalation_level(age_h, warn_h, breach_h, l3_mult)
        level_counts[level] += 1

        # Work on a copy so the attribute is reassigned rather than mutated in place.
        payload = dict(approval.payload) if isinstance(approval.payload, dict) else {}
        stored = payload.get(SLA_KEY)
        previous = stored if isinstance(stored, dict) else {}
        prev_level = int(previous.get("escalation_level", 0) or 0)

        # Start from the stored block so keys written elsewhere are preserved.
        sla_block = dict(previous)
        sla_block.update(
            {
                "escalation_level": level,
                "escalation_label_ar": _level_label_ar(level),
                "age_hours": age_rounded,
                "warn_threshold_hours": warn_h,
                "breach_threshold_hours": breach_h,
                "updated_at": stamp,
            }
        )
        if level != prev_level:
            sla_block["escalation_changed_at"] = stamp

        payload[SLA_KEY] = sla_block
        approval.payload = payload
        flag_modified(approval, "payload")

        # Only upward moves produce an event.
        if level <= prev_level:
            continue

        events_emitted += 1
        await emit_domain_event(
            db,
            tenant_id=tenant_id,
            event_type="approval.sla_escalated",
            payload={
                "approval_id": str(approval.id),
                "from_level": prev_level,
                "to_level": level,
                "age_hours": age_rounded,
            },
            source="sla_escalation",
        )

    return {
        "pending_escalation_total": len(pending),
        "by_level": {str(lvl): int(level_counts[lvl]) for lvl in range(4)},
        "events_emitted": events_emitted,
    }


async def _post_sla_alert(
    client: httpx.AsyncClient,
    url: str,
    body: Dict[str, Any],
    out: Dict[str, Any],
    *,
    key: str,
    failure_msg: str,
) -> None:
    """POST *body* as JSON to *url* and record the outcome in *out*.

    Sets ``out[f"{key}_ok"]``. On a non-2xx response it also sets
    ``out[f"{key}_status"]``; on an exception it logs *failure_msg* (a
    ``%s``-style format taking the exception) and sets ``out[f"{key}_error"]``
    to the exception text truncated to 200 characters.
    """
    try:
        resp = await client.post(url, json=body)
    except Exception as exc:
        # Deliberately broad: alert delivery is best-effort and must not
        # propagate transport or URL errors to the caller.
        logger.warning(failure_msg, exc)
        out[f"{key}_ok"] = False
        out[f"{key}_error"] = str(exc)[:200]
        return

    ok = 200 <= resp.status_code < 300
    out[f"{key}_ok"] = ok
    if not ok:
        out[f"{key}_status"] = resp.status_code


async def maybe_dispatch_sla_breach_alerts(
    db: AsyncSession,
    tenant_id: UUID,
    *,
    tenant_id_str: str,
    metrics: Dict[str, Any],
) -> Dict[str, Any]:
    """
    If breach count > 0 and alerts enabled, POST to webhook and/or Slack (respecting cooldown).

    Args:
        db: Session used to stamp breached rows and emit the domain event.
        tenant_id: Tenant whose pending approvals are inspected.
        tenant_id_str: String form of the tenant id; used as the cooldown key
            and in the outgoing messages.
        metrics: Pre-computed SLA metrics. The keys read here are
            ``pending_breach_count``, ``pending_warn_count``,
            ``breach_threshold_hours``, ``warn_threshold_hours`` and ``health``.

    Returns:
        A status dict. ``attempted`` says whether any HTTP call was made;
        ``skipped_reason`` is one of ``alerts_disabled``, ``no_breach``,
        ``no_webhook_configured``, ``cooldown``, ``delivery_failed`` or ``None``.
        ``webhook_ok`` / ``slack_ok`` stay ``None`` for channels that were not
        tried. ``dispatched_at`` is set when at least one channel succeeded.

    The cooldown is tracked in a module-level dict, i.e. in memory of the
    current process only. The effective cooldown is never shorter than
    5 minutes regardless of configuration.
    """
    s = get_settings()
    out: Dict[str, Any] = {
        "attempted": False,
        "skipped_reason": None,
        "webhook_ok": None,
        "slack_ok": None,
        "cooldown_minutes": int(s.OPENCLAW_SLA_ALERT_COOLDOWN_MINUTES),
    }

    if not s.OPENCLAW_SLA_ALERTS_ENABLED:
        out["skipped_reason"] = "alerts_disabled"
        return out

    breach_n = int(metrics.get("pending_breach_count") or 0)
    if breach_n <= 0:
        out["skipped_reason"] = "no_breach"
        return out

    webhook_url = (s.OPENCLAW_SLA_WEBHOOK_URL or "").strip()
    slack_url = (s.OPENCLAW_SLA_SLACK_WEBHOOK_URL or "").strip()
    if not webhook_url and not slack_url:
        out["skipped_reason"] = "no_webhook_configured"
        return out

    now = datetime.now(timezone.utc)
    now_iso = now.isoformat()
    cool = timedelta(minutes=max(5, int(s.OPENCLAW_SLA_ALERT_COOLDOWN_MINUTES)))
    last = _last_aggregate_breach_alert.get(tenant_id_str)
    if last and (now - last) < cool:
        out["skipped_reason"] = "cooldown"
        out["next_eligible_at"] = (last + cool).isoformat()
        return out

    # Normalise once so the webhook payload and the Slack text always agree
    # (a missing or None count is reported as 0 in both).
    warn_n = int(metrics.get("pending_warn_count") or 0)
    warn_threshold = metrics.get("warn_threshold_hours")
    breach_threshold = metrics.get("breach_threshold_hours")

    payload = {
        "event": "approval_sla.breach",
        "tenant_id": tenant_id_str,
        "pending_breach_count": breach_n,
        "pending_warn_count": warn_n,
        "breach_threshold_hours": breach_threshold,
        "warn_threshold_hours": warn_threshold,
        "health": metrics.get("health"),
        "timestamp": now_iso,
        "source": "dealix",
    }

    out["attempted"] = True
    timeout = httpx.Timeout(12.0)

    async with httpx.AsyncClient(timeout=timeout) as client:
        if webhook_url:
            await _post_sla_alert(
                client,
                webhook_url,
                payload,
                out,
                key="webhook",
                failure_msg="SLA webhook failed: %s",
            )

        if slack_url:
            text = (
                f":rotating_light: *Approval SLA breach* — tenant `{tenant_id_str[:8]}…`\n"
                f"*Pending breach:* {breach_n} (warn: {warn_n})\n"
                f"*Thresholds:* warn {warn_threshold}h / breach {breach_threshold}h\n"
                f"*Time:* {now_iso}"
            )
            await _post_sla_alert(
                client,
                slack_url,
                {"text": text},
                out,
                key="slack",
                failure_msg="SLA Slack webhook failed: %s",
            )

    delivered = bool(out.get("webhook_ok")) or bool(out.get("slack_ok"))
    if not delivered:
        # No cooldown is recorded, so the next call may retry immediately.
        out["skipped_reason"] = "delivery_failed"
        return out

    _last_aggregate_breach_alert[tenant_id_str] = now
    out["dispatched_at"] = now_iso

    # Mark pending breach rows with last aggregate notify (audit in payload)
    q = await db.execute(
        select(ApprovalRequest).where(
            ApprovalRequest.tenant_id == tenant_id,
            ApprovalRequest.status == "pending",
        )
    )
    # Same clamping as refresh_pending_escalations, so the rows stamped here
    # are exactly the ones that function classifies as level >= 2.
    warn_h = max(1, int(s.OPENCLAW_APPROVAL_SLA_HOURS_WARN))
    breach_h = max(warn_h, int(s.OPENCLAW_APPROVAL_SLA_HOURS_BREACH))
    for row in q.scalars().all():
        if _hours_between(now, row.created_at) < breach_h:
            continue
        base = dict(row.payload) if isinstance(row.payload, dict) else {}
        existing = base.get(SLA_KEY)
        # A non-dict value under the SLA key is discarded rather than passed
        # to dict(), which would raise.
        sla = dict(existing) if isinstance(existing, dict) else {}
        sla["last_aggregate_breach_alert_at"] = now_iso
        base[SLA_KEY] = sla
        row.payload = base
        flag_modified(row, "payload")

    await emit_domain_event(
        db,
        tenant_id=tenant_id,
        event_type="approval.sla_breach_notified",
        payload={
            "pending_breach_count": breach_n,
            "webhook_ok": out.get("webhook_ok"),
            "slack_ok": out.get("slack_ok"),
        },
        source="sla_alerts",
    )

    return out