mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-19 07:49:34 +00:00
fix: Enhanced Hermes API, router registration, and observability service
https://claude.ai/code/session_01LsnvBa7HwF5hs99VZbgLGj
This commit is contained in:
parent
40a48b98c9
commit
85a9c9a23f
@ -1,214 +1,285 @@
|
|||||||
"""
|
"""
|
||||||
Hermes API — Dealix AI Revenue OS
|
Hermes API -- Dealix AI Revenue OS -- واجهة هيرمس البرمجية
|
||||||
Top-level orchestration, profiles, cost, health, improvements, security.
|
Orchestrator endpoints: execute tasks, manage profiles, view costs,
|
||||||
|
run security scans, approve improvements, and generate executive summaries.
|
||||||
"""
|
"""
|
||||||
import logging
|
from __future__ import annotations
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException, status
|
import logging
|
||||||
from pydantic import BaseModel
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from app.services.hermes_orchestrator import hermes_orchestrator, HermesProfile
|
||||||
|
from app.services.execution_router import execution_router, TaskClass
|
||||||
|
from app.services.shannon_security import shannon_security, ShannonScope
|
||||||
|
from app.services.self_improvement import self_improvement_engine
|
||||||
|
from app.services.observability import observability_service
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
router = APIRouter(prefix="/hermes", tags=["hermes"])
|
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/hermes", tags=["Hermes Orchestrator"])
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Request / response schemas
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
class ExecuteRequest(BaseModel):
|
class ExecuteRequest(BaseModel):
|
||||||
|
profile_id: str = "founder"
|
||||||
|
task: str
|
||||||
|
params: Dict[str, Any] = Field(default_factory=dict)
|
||||||
|
user_context: Dict[str, Any] = Field(default_factory=lambda: {
|
||||||
|
"user_id": "api_user", "tenant_id": "default", "role": "owner",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
class ExecuteResponse(BaseModel):
|
||||||
|
run_id: str
|
||||||
profile_id: str
|
profile_id: str
|
||||||
task: str
|
task: str
|
||||||
params: dict = {}
|
status: str
|
||||||
|
backend: str = ""
|
||||||
|
data: Dict[str, Any] = {}
|
||||||
class ApproveRequest(BaseModel):
|
evidence: List[str] = []
|
||||||
approved_by: str
|
receipt_id: Optional[str] = None
|
||||||
|
cost_usd: float = 0.0
|
||||||
|
duration_ms: int = 0
|
||||||
|
error: Optional[str] = None
|
||||||
|
message_ar: str = ""
|
||||||
|
|
||||||
|
|
||||||
class ScanRequest(BaseModel):
|
class ScanRequest(BaseModel):
|
||||||
environment: str
|
environment: str = "staging"
|
||||||
base_url: str
|
scopes: List[str] = Field(
|
||||||
scopes: list[str] = []
|
default_factory=lambda: ["auth", "injection", "pdpl"],
|
||||||
|
)
|
||||||
|
base_url: str = "https://staging.dealix.sa"
|
||||||
|
|
||||||
|
|
||||||
# ── Execute ────────────────────────────────────────
|
class ApproveRequest(BaseModel):
|
||||||
|
user_id: str = "founder"
|
||||||
|
|
||||||
|
|
||||||
@router.post("/execute")
|
# ---------------------------------------------------------------------------
|
||||||
async def execute_task(req: ExecuteRequest):
|
# Execute
|
||||||
from app.services.hermes_orchestrator import hermes
|
# ---------------------------------------------------------------------------
|
||||||
try:
|
|
||||||
result = await hermes.execute(
|
@router.post("/execute", response_model=ExecuteResponse)
|
||||||
profile_id=req.profile_id,
|
async def execute_task(req: ExecuteRequest) -> ExecuteResponse:
|
||||||
task=req.task,
|
"""Execute a task via the Hermes orchestrator."""
|
||||||
params=req.params,
|
result = await hermes_orchestrator.execute(
|
||||||
)
|
profile_id=req.profile_id,
|
||||||
return result
|
task=req.task,
|
||||||
except ValueError as e:
|
params=req.params,
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
user_context=req.user_context,
|
||||||
except PermissionError as e:
|
)
|
||||||
raise HTTPException(status_code=403, detail=str(e))
|
return ExecuteResponse(**result.model_dump())
|
||||||
|
|
||||||
|
|
||||||
# ── Profiles ───────────────────────────────────────
|
# ---------------------------------------------------------------------------
|
||||||
|
# Profiles
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@router.get("/profiles")
|
@router.get("/profiles")
|
||||||
async def list_profiles():
|
async def list_profiles() -> JSONResponse:
|
||||||
from app.services.hermes_orchestrator import hermes
|
"""List all available Hermes profiles."""
|
||||||
profiles = hermes.list_profiles()
|
profiles = await hermes_orchestrator.list_profiles()
|
||||||
return {"profiles": [p.model_dump() if hasattr(p, 'model_dump') else p.__dict__ for p in profiles]}
|
return JSONResponse(content={
|
||||||
|
"profiles": [p.model_dump() for p in profiles],
|
||||||
|
"count": len(profiles),
|
||||||
|
"message_ar": f"{len(profiles)} ملفات شخصية متاحة",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
@router.get("/profiles/{profile_id}")
|
@router.get("/profiles/{profile_id}")
|
||||||
async def get_profile(profile_id: str):
|
async def get_profile(profile_id: str) -> JSONResponse:
|
||||||
from app.services.hermes_orchestrator import hermes
|
"""Get details for a specific profile."""
|
||||||
profile = hermes.get_profile(profile_id)
|
profile = await hermes_orchestrator.get_profile(profile_id)
|
||||||
if not profile:
|
if not profile:
|
||||||
raise HTTPException(status_code=404, detail="الملف الشخصي غير موجود")
|
raise HTTPException(status_code=404, detail=f"الملف الشخصي غير موجود: {profile_id}")
|
||||||
return profile.model_dump() if hasattr(profile, 'model_dump') else profile.__dict__
|
return JSONResponse(content=profile.model_dump())
|
||||||
|
|
||||||
|
|
||||||
# ── Cost & Health ──────────────────────────────────
|
# ---------------------------------------------------------------------------
|
||||||
|
# Cost
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@router.get("/cost")
|
@router.get("/cost")
|
||||||
async def cost_report(period: str = "daily", profile: Optional[str] = None):
|
async def cost_report(period: str = "daily") -> JSONResponse:
|
||||||
from app.services.observability import observability
|
"""Get cost report from the orchestrator and observability service."""
|
||||||
return await observability.get_cost_report(period, profile)
|
hermes_cost = await hermes_orchestrator.get_cost_report(period)
|
||||||
|
obs_cost = await observability_service.get_cost_report(period)
|
||||||
|
return JSONResponse(content={
|
||||||
|
"hermes": hermes_cost,
|
||||||
|
"observability": obs_cost,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Health
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@router.get("/health")
|
@router.get("/health")
|
||||||
async def health_report():
|
async def health_report() -> JSONResponse:
|
||||||
from app.services.observability import observability
|
"""System health report across all backends and workflows."""
|
||||||
return await observability.get_health_report()
|
obs_health = await observability_service.get_health_report()
|
||||||
|
backend_health = await execution_router.get_backend_health()
|
||||||
|
anomalies = await observability_service.detect_anomalies()
|
||||||
|
return JSONResponse(content={
|
||||||
|
"system": obs_health,
|
||||||
|
"backends": {k: v.model_dump() for k, v in backend_health.items()},
|
||||||
|
"anomalies": [a.model_dump(mode="json") for a in anomalies],
|
||||||
|
"anomaly_count": len(anomalies),
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
@router.get("/performance")
|
# ---------------------------------------------------------------------------
|
||||||
async def performance_report(period: str = "daily"):
|
# Runs
|
||||||
from app.services.observability import observability
|
# ---------------------------------------------------------------------------
|
||||||
return await observability.get_performance_report(period)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/anomalies")
|
|
||||||
async def detect_anomalies():
|
|
||||||
from app.services.observability import observability
|
|
||||||
return {"anomalies": await observability.detect_anomalies()}
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/executive-summary")
|
|
||||||
async def executive_summary(period: str = "weekly"):
|
|
||||||
from app.services.observability import observability
|
|
||||||
summary = await observability.get_executive_summary(period)
|
|
||||||
return {"summary": summary}
|
|
||||||
|
|
||||||
|
|
||||||
# ── Runs ───────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/runs")
|
@router.get("/runs")
|
||||||
async def list_active_runs():
|
async def list_active_runs() -> JSONResponse:
|
||||||
from app.services.hermes_orchestrator import hermes
|
"""List currently active runs."""
|
||||||
return {"runs": hermes.get_active_runs()}
|
runs = await hermes_orchestrator.get_active_runs()
|
||||||
|
return JSONResponse(content={
|
||||||
|
"active_runs": [r.model_dump(mode="json") for r in runs],
|
||||||
|
"count": len(runs),
|
||||||
|
"message_ar": f"{len(runs)} عمليات نشطة حالياً",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
@router.post("/runs/{run_id}/abort")
|
@router.post("/runs/{run_id}/abort")
|
||||||
async def abort_run(run_id: str):
|
async def abort_run(run_id: str) -> JSONResponse:
|
||||||
from app.services.hermes_orchestrator import hermes
|
"""Abort an active run."""
|
||||||
success = hermes.abort_run(run_id)
|
success = await hermes_orchestrator.abort_run(run_id)
|
||||||
if not success:
|
if not success:
|
||||||
raise HTTPException(status_code=404, detail="التشغيل غير موجود")
|
raise HTTPException(status_code=404, detail=f"التشغيل غير موجود أو اكتمل بالفعل: {run_id}")
|
||||||
return {"status": "aborted", "run_id": run_id}
|
return JSONResponse(content={
|
||||||
|
"run_id": run_id,
|
||||||
|
"status": "aborted",
|
||||||
|
"message_ar": f"تم إلغاء التشغيل: {run_id}",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
# ── Self-Improvement ──────────────────────────────
|
# ---------------------------------------------------------------------------
|
||||||
|
# Self-improvement
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@router.get("/improvements")
|
@router.get("/improvements")
|
||||||
async def list_improvements(status_filter: Optional[str] = None):
|
async def list_improvements(status: Optional[str] = None) -> JSONResponse:
|
||||||
from app.services.self_improvement import self_improvement, ImprovementStatus
|
"""List self-improvement proposals."""
|
||||||
if status_filter:
|
proposals = self_improvement_engine.list_proposals(status)
|
||||||
try:
|
report = await self_improvement_engine.report()
|
||||||
s = ImprovementStatus(status_filter)
|
return JSONResponse(content={
|
||||||
except ValueError:
|
"proposals": [p.model_dump(mode="json") for p in proposals],
|
||||||
raise HTTPException(status_code=400, detail="حالة غير صالحة")
|
"count": len(proposals),
|
||||||
proposals = await self_improvement.get_proposals(s)
|
"summary": report,
|
||||||
else:
|
})
|
||||||
proposals = await self_improvement.get_proposals()
|
|
||||||
return {"proposals": [p.model_dump() for p in proposals]}
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/improvements/cycle")
|
|
||||||
async def run_improvement_cycle():
|
|
||||||
from app.services.self_improvement import self_improvement
|
|
||||||
result = await self_improvement.run_cycle()
|
|
||||||
return result.model_dump()
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/improvements/{proposal_id}/approve")
|
@router.post("/improvements/{proposal_id}/approve")
|
||||||
async def approve_improvement(proposal_id: str, req: ApproveRequest):
|
async def approve_improvement(proposal_id: str, req: ApproveRequest) -> JSONResponse:
|
||||||
from app.services.self_improvement import self_improvement
|
"""Approve a self-improvement proposal."""
|
||||||
success = await self_improvement.apply(proposal_id, req.approved_by)
|
proposal = await self_improvement_engine.approve(proposal_id, req.user_id)
|
||||||
if not success:
|
if not proposal:
|
||||||
raise HTTPException(status_code=404, detail="المقترح غير موجود أو يحتاج موافقة")
|
raise HTTPException(
|
||||||
return {"status": "approved", "proposal_id": proposal_id}
|
status_code=404,
|
||||||
|
detail=f"المقترح غير موجود أو ليس في حالة 'مقترح': {proposal_id}",
|
||||||
|
)
|
||||||
|
return JSONResponse(content={
|
||||||
|
"proposal_id": proposal.id,
|
||||||
|
"status": proposal.status,
|
||||||
|
"approved_by": proposal.approved_by,
|
||||||
|
"message_ar": f"تمت الموافقة على المقترح: {proposal.title_ar}",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
@router.post("/improvements/{proposal_id}/reject")
|
@router.post("/improvements/cycle")
|
||||||
async def reject_improvement(proposal_id: str):
|
async def run_improvement_cycle(tenant_id: Optional[str] = None) -> JSONResponse:
|
||||||
from app.services.self_improvement import self_improvement
|
"""Trigger a self-improvement cycle."""
|
||||||
success = await self_improvement.reject(proposal_id)
|
cycle = await self_improvement_engine.run_cycle(tenant_id)
|
||||||
if not success:
|
return JSONResponse(content=cycle.model_dump(mode="json"))
|
||||||
raise HTTPException(status_code=404, detail="المقترح غير موجود")
|
|
||||||
return {"status": "rejected", "proposal_id": proposal_id}
|
|
||||||
|
|
||||||
|
|
||||||
# ── Security (Shannon) ────────────────────────────
|
# ---------------------------------------------------------------------------
|
||||||
|
# Shannon security
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
@router.get("/security/report")
|
@router.get("/security/report")
|
||||||
async def security_report(severity: Optional[str] = None):
|
async def latest_security_report() -> JSONResponse:
|
||||||
from app.services.shannon_security import shannon
|
"""Get the latest Shannon security scan report."""
|
||||||
findings = shannon.get_all_findings(severity)
|
report = shannon_security.get_latest_report()
|
||||||
should_block = await shannon.should_block_release()
|
if not report:
|
||||||
return {
|
return JSONResponse(content={
|
||||||
"findings": [f.model_dump() for f in findings],
|
"report": None,
|
||||||
"total": len(findings),
|
"message_ar": "لا يوجد تقرير أمني بعد. قم بتشغيل فحص أولاً.",
|
||||||
"should_block_release": should_block,
|
})
|
||||||
}
|
return JSONResponse(content=report.model_dump(mode="json"))
|
||||||
|
|
||||||
|
|
||||||
@router.post("/security/scan")
|
@router.post("/security/scan")
|
||||||
async def trigger_security_scan(req: ScanRequest):
|
async def trigger_security_scan(req: ScanRequest) -> JSONResponse:
|
||||||
from app.services.shannon_security import shannon, ShannonScope
|
"""Trigger a Shannon security scan (staging only)."""
|
||||||
try:
|
scopes = []
|
||||||
scopes = [ShannonScope(s) for s in req.scopes] if req.scopes else None
|
for s in req.scopes:
|
||||||
report = await shannon.run_scan(
|
try:
|
||||||
environment=req.environment,
|
scopes.append(ShannonScope(s))
|
||||||
base_url=req.base_url,
|
except ValueError:
|
||||||
scopes=scopes,
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"نطاق غير صالح: {s}. المتاحة: {[x.value for x in ShannonScope]}",
|
||||||
|
)
|
||||||
|
|
||||||
|
report = await shannon_security.run_scan(
|
||||||
|
environment=req.environment,
|
||||||
|
scopes=scopes,
|
||||||
|
base_url=req.base_url,
|
||||||
|
)
|
||||||
|
return JSONResponse(content=report.model_dump(mode="json"))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Executive summary
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@router.get("/executive-summary")
|
||||||
|
async def executive_summary(period: str = "weekly") -> JSONResponse:
|
||||||
|
"""Arabic executive summary for leadership."""
|
||||||
|
summary = await observability_service.get_executive_summary(period)
|
||||||
|
improvement_report = await self_improvement_engine.report()
|
||||||
|
security_report = shannon_security.get_latest_report()
|
||||||
|
|
||||||
|
full_summary = summary
|
||||||
|
if improvement_report.get("total_proposals", 0) > 0:
|
||||||
|
applied = improvement_report.get("by_status", {}).get("applied", 0)
|
||||||
|
pending = improvement_report.get("by_status", {}).get("proposed", 0)
|
||||||
|
full_summary += f"، {applied} تحسين مطبق، {pending} تحسين معلق"
|
||||||
|
|
||||||
|
if security_report:
|
||||||
|
full_summary += (
|
||||||
|
f"، آخر فحص أمني: {security_report.critical_count} حرجة، "
|
||||||
|
f"{security_report.high_count} عالية"
|
||||||
)
|
)
|
||||||
return report.model_dump()
|
else:
|
||||||
except PermissionError as e:
|
full_summary += "، لا يوجد فحص أمني بعد"
|
||||||
raise HTTPException(
|
|
||||||
status_code=403,
|
return JSONResponse(content={
|
||||||
detail=f"محظور: {str(e)}"
|
"summary_ar": full_summary,
|
||||||
)
|
"period": period,
|
||||||
except ValueError as e:
|
"improvements": improvement_report,
|
||||||
raise HTTPException(status_code=400, detail=str(e))
|
"security_status": security_report.model_dump(mode="json") if security_report else None,
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
# ── Session Continuity ────────────────────────────
|
# ---------------------------------------------------------------------------
|
||||||
|
# Routing stats
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@router.get("/routing-stats")
|
||||||
@router.get("/session/restore")
|
async def routing_stats() -> JSONResponse:
|
||||||
async def restore_session():
|
"""Execution routing statistics."""
|
||||||
from app.services.session_continuity import session_continuity
|
stats = await execution_router.get_routing_stats()
|
||||||
prompt = await session_continuity.get_restore_prompt()
|
return JSONResponse(content=stats)
|
||||||
return {"restore_prompt": prompt}
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/session/state")
|
|
||||||
async def get_session_state():
|
|
||||||
from app.services.session_continuity import session_continuity
|
|
||||||
state = await session_continuity.restore_state()
|
|
||||||
if not state:
|
|
||||||
return {"state": None}
|
|
||||||
return {"state": state.model_dump() if hasattr(state, 'model_dump') else state.__dict__}
|
|
||||||
|
|||||||
@ -16,6 +16,7 @@ from app.api.v1 import lead_prospector as prospector_router
|
|||||||
from app.api.v1 import pipeline as pipeline_router
|
from app.api.v1 import pipeline as pipeline_router
|
||||||
from app.api.v1 import agent_system as agent_system_router
|
from app.api.v1 import agent_system as agent_system_router
|
||||||
from app.api.v1 import autonomous_foundation as autonomous_foundation_router
|
from app.api.v1 import autonomous_foundation as autonomous_foundation_router
|
||||||
|
from app.api.v1 import hermes as hermes_router
|
||||||
from app.api.v1 import marketing_hub as marketing_hub_router
|
from app.api.v1 import marketing_hub as marketing_hub_router
|
||||||
from app.api.v1 import strategy_summary as strategy_summary_router
|
from app.api.v1 import strategy_summary as strategy_summary_router
|
||||||
from app.api.v1 import value_proposition as value_proposition_router
|
from app.api.v1 import value_proposition as value_proposition_router
|
||||||
@ -86,3 +87,6 @@ api_router.include_router(pipeline_router.router)
|
|||||||
# ── 22-Agent AI System — Full Empire Control ─────────────────
|
# ── 22-Agent AI System — Full Empire Control ─────────────────
|
||||||
api_router.include_router(agent_system_router.router)
|
api_router.include_router(agent_system_router.router)
|
||||||
api_router.include_router(autonomous_foundation_router.router)
|
api_router.include_router(autonomous_foundation_router.router)
|
||||||
|
|
||||||
|
# ── Hermes Fusion — Orchestration Layer ──────────────────────
|
||||||
|
api_router.include_router(hermes_router.router)
|
||||||
|
|||||||
@ -1,17 +1,27 @@
|
|||||||
"""
|
"""
|
||||||
Observability Service — Dealix AI Revenue OS
|
Observability Service -- Dealix AI Revenue OS -- خدمة المراقبة
|
||||||
Cost tracking, workflow metrics, health monitoring, and Arabic executive summaries.
|
Track cost, performance, and health across all agent workflows.
|
||||||
|
Anomaly detection, executive summaries in Arabic.
|
||||||
"""
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime, timezone, timedelta
|
import statistics
|
||||||
from typing import Optional
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Models
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
class WorkflowMetric(BaseModel):
|
class WorkflowMetric(BaseModel):
|
||||||
|
"""Single workflow execution metric."""
|
||||||
workflow_name: str
|
workflow_name: str
|
||||||
profile_id: str
|
profile_id: str
|
||||||
backend: str
|
backend: str
|
||||||
@ -23,170 +33,313 @@ class WorkflowMetric(BaseModel):
|
|||||||
error: Optional[str] = None
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class AnomalyAlert(BaseModel):
|
||||||
|
"""Detected anomaly."""
|
||||||
|
id: str = ""
|
||||||
|
anomaly_type: str # cost_spike, failure_spike, latency_spike, regression
|
||||||
|
description: str
|
||||||
|
description_ar: str
|
||||||
|
severity: str = "medium" # critical, high, medium, low
|
||||||
|
metric_name: str = ""
|
||||||
|
current_value: float = 0.0
|
||||||
|
baseline_value: float = 0.0
|
||||||
|
deviation_pct: float = 0.0
|
||||||
|
detected_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Service
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
class ObservabilityService:
|
class ObservabilityService:
|
||||||
"""Track cost, performance, and health across all agent workflows."""
|
"""Track cost, performance, and health across all agent workflows."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self) -> None:
|
||||||
self._metrics: list[WorkflowMetric] = []
|
self._metrics: list[WorkflowMetric] = []
|
||||||
self._max_metrics = 50000
|
self._max_metrics = 50_000
|
||||||
|
self._anomalies: list[AnomalyAlert] = []
|
||||||
|
self._max_anomalies = 1_000
|
||||||
|
logger.info("خدمة المراقبة: تم التهيئة")
|
||||||
|
|
||||||
|
# -- Recording ---------------------------------------------------------
|
||||||
|
|
||||||
async def record_workflow(self, metric: WorkflowMetric) -> None:
|
async def record_workflow(self, metric: WorkflowMetric) -> None:
|
||||||
|
"""Store a workflow execution metric."""
|
||||||
self._metrics.append(metric)
|
self._metrics.append(metric)
|
||||||
if len(self._metrics) > self._max_metrics:
|
if len(self._metrics) > self._max_metrics:
|
||||||
self._metrics = self._metrics[-self._max_metrics:]
|
self._metrics = self._metrics[-self._max_metrics:]
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Recorded: {metric.workflow_name} "
|
"[Obs] سجل: %s profile=%s backend=%s %dms $%.4f %s",
|
||||||
f"cost=${metric.estimated_cost_usd:.4f} "
|
metric.workflow_name, metric.profile_id, metric.backend,
|
||||||
f"{'OK' if metric.success else 'FAIL'}"
|
metric.duration_ms, metric.estimated_cost_usd,
|
||||||
|
"OK" if metric.success else "FAIL",
|
||||||
)
|
)
|
||||||
|
|
||||||
def _filter_by_period(
|
# -- Cost report -------------------------------------------------------
|
||||||
self, period: str, metrics: list[WorkflowMetric] = None
|
|
||||||
) -> list[WorkflowMetric]:
|
|
||||||
source = metrics or self._metrics
|
|
||||||
now = datetime.now(timezone.utc)
|
|
||||||
if period == "hourly":
|
|
||||||
cutoff = now - timedelta(hours=1)
|
|
||||||
elif period == "daily":
|
|
||||||
cutoff = now - timedelta(days=1)
|
|
||||||
elif period == "weekly":
|
|
||||||
cutoff = now - timedelta(weeks=1)
|
|
||||||
elif period == "monthly":
|
|
||||||
cutoff = now - timedelta(days=30)
|
|
||||||
else:
|
|
||||||
cutoff = now - timedelta(days=1)
|
|
||||||
return [m for m in source if m.started_at >= cutoff]
|
|
||||||
|
|
||||||
async def get_cost_report(
|
async def get_cost_report(
|
||||||
self, period: str = "daily", profile: str = None
|
self, period: str = "daily", profile: Optional[str] = None,
|
||||||
) -> dict:
|
) -> dict[str, Any]:
|
||||||
filtered = self._filter_by_period(period)
|
"""Total cost, cost by profile, cost by backend, cost by workflow."""
|
||||||
if profile:
|
cutoff = self._period_cutoff(period)
|
||||||
filtered = [m for m in filtered if m.profile_id == profile]
|
filtered = [
|
||||||
|
m for m in self._metrics
|
||||||
|
if m.started_at >= cutoff and (profile is None or m.profile_id == profile)
|
||||||
|
]
|
||||||
|
|
||||||
total_cost = sum(m.estimated_cost_usd for m in filtered)
|
total = sum(m.estimated_cost_usd for m in filtered)
|
||||||
by_profile: dict[str, float] = {}
|
by_profile: dict[str, float] = defaultdict(float)
|
||||||
by_backend: dict[str, float] = {}
|
by_backend: dict[str, float] = defaultdict(float)
|
||||||
by_workflow: dict[str, float] = {}
|
by_workflow: dict[str, float] = defaultdict(float)
|
||||||
|
|
||||||
for m in filtered:
|
for m in filtered:
|
||||||
by_profile[m.profile_id] = by_profile.get(m.profile_id, 0) + m.estimated_cost_usd
|
by_profile[m.profile_id] += m.estimated_cost_usd
|
||||||
by_backend[m.backend] = by_backend.get(m.backend, 0) + m.estimated_cost_usd
|
by_backend[m.backend] += m.estimated_cost_usd
|
||||||
by_workflow[m.workflow_name] = by_workflow.get(m.workflow_name, 0) + m.estimated_cost_usd
|
by_workflow[m.workflow_name] += m.estimated_cost_usd
|
||||||
|
|
||||||
top_expensive = sorted(by_workflow.items(), key=lambda x: x[1], reverse=True)[:5]
|
# Sort by cost descending
|
||||||
|
top_workflows = sorted(by_workflow.items(), key=lambda x: -x[1])[:10]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"period": period,
|
"period": period,
|
||||||
"total_cost_usd": round(total_cost, 4),
|
"total_usd": round(total, 4),
|
||||||
"total_workflows": len(filtered),
|
|
||||||
"by_profile": {k: round(v, 4) for k, v in by_profile.items()},
|
"by_profile": {k: round(v, 4) for k, v in by_profile.items()},
|
||||||
"by_backend": {k: round(v, 4) for k, v in by_backend.items()},
|
"by_backend": {k: round(v, 4) for k, v in by_backend.items()},
|
||||||
"top_expensive": [{"name": k, "cost": round(v, 4)} for k, v in top_expensive],
|
"top_workflows": [{"name": n, "cost_usd": round(c, 4)} for n, c in top_workflows],
|
||||||
|
"total_executions": len(filtered),
|
||||||
|
"message_ar": f"التكلفة الإجمالية ({period}): ${total:.2f} عبر {len(filtered)} عملية",
|
||||||
}
|
}
|
||||||
|
|
||||||
async def get_performance_report(self, period: str = "daily") -> dict:
|
# -- Performance report ------------------------------------------------
|
||||||
filtered = self._filter_by_period(period)
|
|
||||||
|
async def get_performance_report(self, period: str = "daily") -> dict[str, Any]:
|
||||||
|
"""Average duration, P95 duration, success rate, error rate."""
|
||||||
|
cutoff = self._period_cutoff(period)
|
||||||
|
filtered = [m for m in self._metrics if m.started_at >= cutoff]
|
||||||
|
|
||||||
if not filtered:
|
if not filtered:
|
||||||
return {"period": period, "total": 0}
|
return {
|
||||||
|
"period": period, "total_executions": 0,
|
||||||
|
"message_ar": "لا توجد بيانات لهذه الفترة",
|
||||||
|
}
|
||||||
|
|
||||||
durations = [m.duration_ms for m in filtered]
|
durations = [m.duration_ms for m in filtered]
|
||||||
durations.sort()
|
successes = sum(1 for m in filtered if m.success)
|
||||||
total = len(durations)
|
failures = len(filtered) - successes
|
||||||
success_count = sum(1 for m in filtered if m.success)
|
|
||||||
|
|
||||||
p95_idx = min(int(total * 0.95), total - 1)
|
avg_ms = statistics.mean(durations)
|
||||||
errors = [m for m in filtered if not m.success]
|
p95_ms = sorted(durations)[int(len(durations) * 0.95)] if len(durations) >= 20 else max(durations)
|
||||||
|
success_rate = successes / len(filtered)
|
||||||
|
|
||||||
|
# Slowest workflows
|
||||||
|
by_wf: dict[str, list[int]] = defaultdict(list)
|
||||||
|
for m in filtered:
|
||||||
|
by_wf[m.workflow_name].append(m.duration_ms)
|
||||||
|
slowest = sorted(
|
||||||
|
[(wf, statistics.mean(ds)) for wf, ds in by_wf.items()],
|
||||||
|
key=lambda x: -x[1],
|
||||||
|
)[:5]
|
||||||
|
|
||||||
|
# Most expensive
|
||||||
|
cost_wf: dict[str, float] = defaultdict(float)
|
||||||
|
for m in filtered:
|
||||||
|
cost_wf[m.workflow_name] += m.estimated_cost_usd
|
||||||
|
most_expensive = sorted(cost_wf.items(), key=lambda x: -x[1])[:5]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"period": period,
|
"period": period,
|
||||||
"total_workflows": total,
|
"total_executions": len(filtered),
|
||||||
"success_rate": round(success_count / total * 100, 1) if total else 0,
|
"avg_duration_ms": round(avg_ms, 2),
|
||||||
"avg_duration_ms": round(sum(durations) / total) if total else 0,
|
"p95_duration_ms": p95_ms,
|
||||||
"p95_duration_ms": durations[p95_idx] if durations else 0,
|
"success_rate": round(success_rate, 4),
|
||||||
"error_count": len(errors),
|
"error_rate": round(1 - success_rate, 4),
|
||||||
"error_rate": round(len(errors) / total * 100, 1) if total else 0,
|
"total_successes": successes,
|
||||||
"recent_errors": [
|
"total_failures": failures,
|
||||||
{"workflow": e.workflow_name, "error": e.error, "at": e.started_at.isoformat()}
|
"slowest_workflows": [{"name": n, "avg_ms": round(d, 2)} for n, d in slowest],
|
||||||
for e in errors[-5:]
|
"most_expensive": [{"name": n, "cost_usd": round(c, 4)} for n, c in most_expensive],
|
||||||
],
|
"message_ar": (
|
||||||
|
f"أداء ({period}): {len(filtered)} عملية، "
|
||||||
|
f"متوسط {avg_ms:.0f}ms، نجاح {success_rate:.0%}، فشل {failures}"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
async def get_health_report(self) -> dict:
|
# -- Health report -----------------------------------------------------
|
||||||
daily = self._filter_by_period("daily")
|
|
||||||
total = len(daily)
|
|
||||||
success = sum(1 for m in daily if m.success)
|
|
||||||
|
|
||||||
backends_used = set(m.backend for m in daily)
|
async def get_health_report(self) -> dict[str, Any]:
|
||||||
|
"""Backend health, skill health, memory health, knowledge health, trust health."""
|
||||||
|
recent = [m for m in self._metrics if m.started_at >= self._period_cutoff("daily")]
|
||||||
|
|
||||||
|
# Backend health
|
||||||
|
backend_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
|
||||||
|
for m in recent:
|
||||||
|
backend_calls[m.backend]["ok" if m.success else "fail"] += 1
|
||||||
backend_health = {}
|
backend_health = {}
|
||||||
for b in backends_used:
|
for b, counts in backend_calls.items():
|
||||||
b_metrics = [m for m in daily if m.backend == b]
|
total = counts["ok"] + counts["fail"]
|
||||||
b_success = sum(1 for m in b_metrics if m.success)
|
rate = counts["ok"] / total if total else 1.0
|
||||||
backend_health[b] = {
|
backend_health[b] = {
|
||||||
"total": len(b_metrics),
|
"total": total, "success_rate": round(rate, 4),
|
||||||
"success_rate": round(b_success / len(b_metrics) * 100, 1) if b_metrics else 0,
|
"healthy": rate >= 0.7 or total < 5,
|
||||||
"avg_duration_ms": round(
|
|
||||||
sum(m.duration_ms for m in b_metrics) / len(b_metrics)
|
|
||||||
) if b_metrics else 0,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Skill health (by workflow)
|
||||||
|
skill_calls: dict[str, dict[str, int]] = defaultdict(lambda: {"ok": 0, "fail": 0})
|
||||||
|
for m in recent:
|
||||||
|
skill_calls[m.workflow_name]["ok" if m.success else "fail"] += 1
|
||||||
|
skill_health = {}
|
||||||
|
for s, counts in skill_calls.items():
|
||||||
|
total = counts["ok"] + counts["fail"]
|
||||||
|
rate = counts["ok"] / total if total else 1.0
|
||||||
|
skill_health[s] = {"total": total, "success_rate": round(rate, 4)}
|
||||||
|
|
||||||
|
# Overall
|
||||||
|
total = len(recent)
|
||||||
|
overall_ok = sum(1 for m in recent if m.success)
|
||||||
|
overall_rate = overall_ok / total if total else 1.0
|
||||||
|
healthy = overall_rate >= 0.8
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"overall_health": "healthy" if (total == 0 or success / total > 0.9) else "degraded",
|
"overall_healthy": healthy,
|
||||||
"workflows_today": total,
|
"overall_success_rate": round(overall_rate, 4),
|
||||||
"success_rate": round(success / total * 100, 1) if total else 100,
|
"total_today": total,
|
||||||
"total_cost_today_usd": round(sum(m.estimated_cost_usd for m in daily), 4),
|
"backend_health": backend_health,
|
||||||
"backends": backend_health,
|
"skill_health": skill_health,
|
||||||
|
"anomalies_today": len([
|
||||||
|
a for a in self._anomalies
|
||||||
|
if a.detected_at >= self._period_cutoff("daily")
|
||||||
|
]),
|
||||||
|
"message_ar": (
|
||||||
|
f"صحة النظام: {'سليم' if healthy else 'يحتاج انتباه'} -- "
|
||||||
|
f"نجاح {overall_rate:.0%}، {total} عملية اليوم"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
async def get_executive_summary(self, period: str = "weekly") -> str:
|
# -- Executive summary -------------------------------------------------
|
||||||
filtered = self._filter_by_period(period)
|
|
||||||
total = len(filtered)
|
|
||||||
success = sum(1 for m in filtered if m.success)
|
|
||||||
cost = sum(m.estimated_cost_usd for m in filtered)
|
|
||||||
success_rate = round(success / total * 100) if total else 100
|
|
||||||
|
|
||||||
period_ar = {"daily": "اليوم", "weekly": "هذا الأسبوع", "monthly": "هذا الشهر"}.get(period, period)
|
async def get_executive_summary(self, period: str = "weekly") -> str:
|
||||||
|
"""Arabic executive summary for leadership."""
|
||||||
|
cutoff = self._period_cutoff(period)
|
||||||
|
filtered = [m for m in self._metrics if m.started_at >= cutoff]
|
||||||
|
|
||||||
|
total = len(filtered)
|
||||||
|
successes = sum(1 for m in filtered if m.success)
|
||||||
|
rate = successes / total if total else 0
|
||||||
|
cost = sum(m.estimated_cost_usd for m in filtered)
|
||||||
|
anomalies = [a for a in self._anomalies if a.detected_at >= cutoff]
|
||||||
|
critical = sum(1 for a in anomalies if a.severity == "critical")
|
||||||
|
|
||||||
|
period_ar = {
|
||||||
|
"daily": "اليوم", "weekly": "هذا الأسبوع",
|
||||||
|
"monthly": "هذا الشهر",
|
||||||
|
}.get(period, period)
|
||||||
|
|
||||||
summary = (
|
summary = (
|
||||||
f"📊 ملخص {period_ar}:\n"
|
f"{period_ar}: {total} مهمة منفذة، "
|
||||||
f"• {total} مهمة منفذة\n"
|
f"{rate:.0%} نجاح، "
|
||||||
f"• {success_rate}% نسبة النجاح\n"
|
f"تكلفة ${cost:.2f}، "
|
||||||
f"• ${cost:.2f} التكلفة الإجمالية\n"
|
f"{len(anomalies)} تنبيه"
|
||||||
)
|
)
|
||||||
|
if critical:
|
||||||
errors = [m for m in filtered if not m.success]
|
summary += f"، {critical} مشاكل حرجة تحتاج تدخل فوري"
|
||||||
if errors:
|
|
||||||
summary += f"• {len(errors)} خطأ يحتاج مراجعة\n"
|
|
||||||
else:
|
else:
|
||||||
summary += "• لا أخطاء حرجة ✅\n"
|
summary += "، 0 مشاكل حرجة"
|
||||||
|
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
async def detect_anomalies(self) -> list[dict]:
|
# -- Anomaly detection -------------------------------------------------
|
||||||
anomalies = []
|
|
||||||
hourly = self._filter_by_period("hourly")
|
|
||||||
daily = self._filter_by_period("daily")
|
|
||||||
|
|
||||||
if hourly:
|
async def detect_anomalies(self) -> list[AnomalyAlert]:
|
||||||
hourly_cost = sum(m.estimated_cost_usd for m in hourly)
|
"""Sudden cost spikes, unusual failure patterns, backend degradation."""
|
||||||
if hourly_cost > 5.0:
|
alerts: list[AnomalyAlert] = []
|
||||||
anomalies.append({
|
now = datetime.now(timezone.utc)
|
||||||
"type": "cost_spike",
|
today = [m for m in self._metrics if m.started_at >= now - timedelta(hours=24)]
|
||||||
"severity": "high",
|
prev_week = [
|
||||||
"message": f"تكلفة الساعة الأخيرة ${hourly_cost:.2f} — أعلى من الحد الطبيعي",
|
m for m in self._metrics
|
||||||
"value": hourly_cost,
|
if now - timedelta(days=8) <= m.started_at < now - timedelta(days=1)
|
||||||
})
|
]
|
||||||
|
|
||||||
hourly_errors = sum(1 for m in hourly if not m.success)
|
if not today or not prev_week:
|
||||||
if len(hourly) > 5 and hourly_errors / len(hourly) > 0.3:
|
return alerts
|
||||||
anomalies.append({
|
|
||||||
"type": "error_spike",
|
|
||||||
"severity": "critical",
|
|
||||||
"message": f"معدل أخطاء مرتفع: {hourly_errors}/{len(hourly)} في الساعة الأخيرة",
|
|
||||||
"value": hourly_errors,
|
|
||||||
})
|
|
||||||
|
|
||||||
return anomalies
|
# Cost spike detection
|
||||||
|
today_cost = sum(m.estimated_cost_usd for m in today)
|
||||||
|
avg_daily_cost = sum(m.estimated_cost_usd for m in prev_week) / 7
|
||||||
|
if avg_daily_cost > 0 and today_cost > avg_daily_cost * 2:
|
||||||
|
deviation = ((today_cost - avg_daily_cost) / avg_daily_cost) * 100
|
||||||
|
alerts.append(AnomalyAlert(
|
||||||
|
id=f"cost-{now.strftime('%Y%m%d')}",
|
||||||
|
anomaly_type="cost_spike",
|
||||||
|
description=f"Today's cost ${today_cost:.2f} is {deviation:.0f}% above daily avg ${avg_daily_cost:.2f}",
|
||||||
|
description_ar=f"تكلفة اليوم ${today_cost:.2f} أعلى بنسبة {deviation:.0f}% من المتوسط ${avg_daily_cost:.2f}",
|
||||||
|
severity="high" if deviation > 200 else "medium",
|
||||||
|
metric_name="daily_cost_usd",
|
||||||
|
current_value=today_cost,
|
||||||
|
baseline_value=avg_daily_cost,
|
||||||
|
deviation_pct=round(deviation, 2),
|
||||||
|
))
|
||||||
|
|
||||||
|
# Failure rate spike
|
||||||
|
today_fail_rate = sum(1 for m in today if not m.success) / max(len(today), 1)
|
||||||
|
prev_fail_rate = sum(1 for m in prev_week if not m.success) / max(len(prev_week), 1)
|
||||||
|
if today_fail_rate > 0.15 and today_fail_rate > prev_fail_rate * 2:
|
||||||
|
alerts.append(AnomalyAlert(
|
||||||
|
id=f"fail-{now.strftime('%Y%m%d')}",
|
||||||
|
anomaly_type="failure_spike",
|
||||||
|
description=f"Failure rate {today_fail_rate:.1%} vs baseline {prev_fail_rate:.1%}",
|
||||||
|
description_ar=f"معدل الفشل {today_fail_rate:.1%} مقابل الخط الأساسي {prev_fail_rate:.1%}",
|
||||||
|
severity="critical" if today_fail_rate > 0.3 else "high",
|
||||||
|
metric_name="failure_rate",
|
||||||
|
current_value=today_fail_rate,
|
||||||
|
baseline_value=prev_fail_rate,
|
||||||
|
deviation_pct=round(((today_fail_rate - prev_fail_rate) / max(prev_fail_rate, 0.01)) * 100, 2),
|
||||||
|
))
|
||||||
|
|
||||||
|
# Latency spike (per backend)
|
||||||
|
for backend in {"claude", "openclaude", "goose", "internal"}:
|
||||||
|
today_b = [m.duration_ms for m in today if m.backend == backend]
|
||||||
|
prev_b = [m.duration_ms for m in prev_week if m.backend == backend]
|
||||||
|
if len(today_b) >= 5 and len(prev_b) >= 5:
|
||||||
|
today_avg = statistics.mean(today_b)
|
||||||
|
prev_avg = statistics.mean(prev_b)
|
||||||
|
if prev_avg > 0 and today_avg > prev_avg * 3:
|
||||||
|
deviation = ((today_avg - prev_avg) / prev_avg) * 100
|
||||||
|
alerts.append(AnomalyAlert(
|
||||||
|
id=f"latency-{backend}-{now.strftime('%Y%m%d')}",
|
||||||
|
anomaly_type="latency_spike",
|
||||||
|
description=f"{backend} latency {today_avg:.0f}ms vs baseline {prev_avg:.0f}ms",
|
||||||
|
description_ar=f"زمن استجابة {backend}: {today_avg:.0f}ms مقابل {prev_avg:.0f}ms",
|
||||||
|
severity="high",
|
||||||
|
metric_name=f"latency_{backend}",
|
||||||
|
current_value=today_avg,
|
||||||
|
baseline_value=prev_avg,
|
||||||
|
deviation_pct=round(deviation, 2),
|
||||||
|
))
|
||||||
|
|
||||||
|
self._anomalies.extend(alerts)
|
||||||
|
if len(self._anomalies) > self._max_anomalies:
|
||||||
|
self._anomalies = self._anomalies[-self._max_anomalies:]
|
||||||
|
|
||||||
|
if alerts:
|
||||||
|
logger.warning("[Obs] %d anomalies detected", len(alerts))
|
||||||
|
|
||||||
|
return alerts
|
||||||
|
|
||||||
|
# -- Helpers -----------------------------------------------------------
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _period_cutoff(period: str) -> datetime:
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
if period == "daily":
|
||||||
|
return now - timedelta(hours=24)
|
||||||
|
if period == "weekly":
|
||||||
|
return now - timedelta(days=7)
|
||||||
|
if period == "monthly":
|
||||||
|
return now - timedelta(days=30)
|
||||||
|
return now - timedelta(hours=24)
|
||||||
|
|
||||||
|
|
||||||
observability = ObservabilityService()
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module-level singleton
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
observability_service = ObservabilityService()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user