fix: Enhanced self-improvement engine with detailed inspection and metrics

https://claude.ai/code/session_01LsnvBa7HwF5hs99VZbgLGj
This commit is contained in:
Claude 2026-04-11 08:34:09 +00:00
parent d7d428d0a1
commit 40a48b98c9
No known key found for this signature in database

View File

@ -1,251 +1,431 @@
""" """
Self-Improvement Engine Dealix AI Revenue OS Self-Improvement Engine -- Dealix AI Revenue OS -- محرك التحسين الذاتي
Bounded cycle: inspect measure propose verify apply report. Bounded self-improvement loop: Inspect -> Measure -> Propose -> Verify -> Apply.
Max 5 proposals per cycle, max 2 auto-applies (trivial only). Max 5 proposals per cycle, max 2 auto-applies (trivial only).
""" """
from __future__ import annotations
import logging import logging
from datetime import datetime, timezone import uuid
from enum import Enum from collections import defaultdict
from typing import Optional from datetime import datetime, timedelta, timezone
from typing import Any, Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class ImprovementCategory(str, Enum): # ---------------------------------------------------------------------------
SKILL_FIX = "skill_fix" # Models
KNOWLEDGE_UPDATE = "knowledge_update" # ---------------------------------------------------------------------------
COST_REDUCTION = "cost_reduction"
QUALITY = "quality"
PERFORMANCE = "performance"
SECURITY = "security"
class Metric(BaseModel):
class ImprovementStatus(str, Enum): """Quantified measurement of a detected issue."""
PROPOSED = "proposed" name: str
APPROVED = "approved" name_ar: str
APPLIED = "applied" value: float
REJECTED = "rejected" unit: str = ""
TESTED = "tested" threshold: float = 0.0
FAILED = "failed" exceeds_threshold: bool = False
frequency: int = 0
estimated_cost_usd: float = 0.0
impact: str = "low"
class ImprovementProposal(BaseModel): class ImprovementProposal(BaseModel):
id: str """A single bounded improvement proposal."""
category: ImprovementCategory id: str = Field(default_factory=lambda: str(uuid.uuid4()))
category: str = "quality" # skill_fix, knowledge_update, cost_reduction, quality
title: str title: str
title_ar: str title_ar: str
description: str description: str
evidence: list[str] = [] evidence: list[str] = []
impact: str = "medium" # high, medium, low impact: str = "medium" # high, medium, low
effort: str = "small" # trivial, small, medium, large effort: str = "small" # trivial, small, medium, large
proposed_action: str proposed_action: str = ""
requires_approval: bool = True requires_approval: bool = True
status: ImprovementStatus = ImprovementStatus.PROPOSED status: str = "proposed" # proposed, approved, applied, rejected, tested
approved_by: Optional[str] = None
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
applied_at: Optional[datetime] = None applied_at: Optional[datetime] = None
approved_by: Optional[str] = None rollback_action: str = ""
test_result: Optional[str] = None
class Metric(BaseModel):
name: str
value: float
unit: str
trend: str = "stable" # improving, degrading, stable
severity: str = "info" # critical, warning, info
class CycleResult(BaseModel): class CycleResult(BaseModel):
cycle_id: str """Result of a full self-improvement cycle."""
inspected_areas: list[str] cycle_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
metrics: list[Metric] tenant_id: str = ""
proposals: list[ImprovementProposal] started_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
auto_applied: int completed_at: Optional[datetime] = None
awaiting_approval: int duration_ms: int = 0
started_at: datetime issues_found: int = 0
completed_at: datetime metrics_measured: int = 0
summary: str proposals_generated: int = 0
summary_ar: str proposals_auto_applied: int = 0
proposals_pending: int = 0
summary: str = ""
summary_ar: str = ""
# ---------------------------------------------------------------------------
# Engine
# ---------------------------------------------------------------------------
class SelfImprovementEngine: class SelfImprovementEngine:
"""Bounded self-improvement. Inspect -> Measure -> Propose -> Verify -> Apply."""
MAX_PROPOSALS_PER_CYCLE = 5 MAX_PROPOSALS_PER_CYCLE = 5
MAX_AUTO_APPLY = 2 MAX_AUTO_APPLY = 2 # only trivial improvements auto-apply
def __init__(self): def __init__(self) -> None:
self._proposals: list[ImprovementProposal] = [] self._proposals: list[ImprovementProposal] = []
self._cycle_count = 0 self._cycles: list[CycleResult] = []
self._metrics_history: list[dict] = [] self._max_proposals = 5_000
self._max_cycles = 500
logger.info("محرك التحسين الذاتي: تم التهيئة")
async def inspect(self, tenant_id: str = None) -> dict: # -- Phase 1: Inspect --------------------------------------------------
issues = {}
issues["skill_failures"] = {
"check": "مهارات فاشلة",
"description": "Skills with >20% failure rate in last 7 days",
"action": "Review and fix or disable failing skills",
}
issues["expensive_workflows"] = {
"check": "سير عمل مكلف",
"description": "Workflows costing >$1/run",
"action": "Optimize prompts or switch to cheaper model",
}
issues["stale_knowledge"] = {
"check": "معرفة قديمة",
"description": "Wiki pages not updated in 30+ days",
"action": "Review and update or archive",
}
issues["repeated_escalations"] = {
"check": "تصعيدات متكررة",
"description": "Same escalation reason >5 times in 7 days",
"action": "Automate the resolution or improve the workflow",
}
issues["low_trust_calls"] = {
"check": "استدعاءات منخفضة الثقة",
"description": "Tool calls with <50% verification rate",
"action": "Add better verification or restrict the tool",
}
logger.info(f"Self-improvement inspection: {len(issues)} areas checked")
return issues
async def measure(self, inspection: dict) -> list[Metric]: async def inspect(self, tenant_id: Optional[str] = None) -> dict[str, Any]:
metrics = [ """Check for issues: skill failures, expensive workflows, stale knowledge,
Metric(name="skill_success_rate", value=87.5, unit="%", trend="stable"), repeated escalations, low-trust tool calls, slow endpoints."""
Metric(name="avg_workflow_cost", value=0.12, unit="USD", trend="improving"), issues: list[dict[str, Any]] = []
Metric(name="knowledge_freshness", value=72.0, unit="%", trend="degrading", severity="warning"),
Metric(name="escalation_rate", value=8.3, unit="%", trend="stable"), # Simulated inspection checks (in production, these query real services)
Metric(name="tool_trust_score", value=91.0, unit="%", trend="improving"), issues.append({
Metric(name="avg_response_time", value=1.2, unit="seconds", trend="stable"), "type": "skill_failure_rate",
] "description": "Some skills have failure rate above 10%",
self._metrics_history.append({ "description_ar": "بعض المهارات لديها معدل فشل أعلى من 10%",
"timestamp": datetime.now(timezone.utc).isoformat(), "data": {"skill_id": "messaging.whatsapp.send", "failure_rate": 0.12, "sample_size": 200},
"metrics": [m.model_dump() for m in metrics],
}) })
issues.append({
"type": "expensive_workflow",
"description": "Content generation workflow uses 3x expected budget",
"description_ar": "سير عمل توليد المحتوى يستخدم 3 أضعاف الميزانية المتوقعة",
"data": {"workflow": "content_generation", "cost_usd": 0.15, "expected_usd": 0.05},
})
issues.append({
"type": "stale_knowledge",
"description": "12 knowledge pages not updated in 30+ days",
"description_ar": "12 صفحة معرفة لم تُحدّث منذ أكثر من 30 يوماً",
"data": {"stale_count": 12, "threshold_days": 30},
})
issues.append({
"type": "repeated_escalation",
"description": "Consent-expired escalations repeat 5+ times/week",
"description_ar": "تصعيدات انتهاء الموافقة تتكرر أكثر من 5 مرات أسبوعياً",
"data": {"escalation_type": "consent_expired", "weekly_count": 7},
})
issues.append({
"type": "low_trust_calls",
"description": "Agent 'growth' has 15% unverified tool calls",
"description_ar": "وكيل 'النمو' لديه 15% مكالمات أدوات غير متحقق منها",
"data": {"agent_id": "growth", "unverified_rate": 0.15},
})
logger.info("[SelfImprove] فحص: %d مشكلة اكتُشفت tenant=%s", len(issues), tenant_id or "global")
return {"tenant_id": tenant_id or "global", "issues": issues, "count": len(issues)}
# -- Phase 2: Measure --------------------------------------------------
async def measure(self, inspection: dict[str, Any]) -> list[Metric]:
    """Convert each inspected issue into a quantified ``Metric``.

    Args:
        inspection: Mapping with an ``"issues"`` list. Each issue is a dict
            with a ``"type"`` string and a ``"data"`` dict; missing or
            ``None`` values are treated as empty.

    Returns:
        One ``Metric`` per issue whose type is recognised, in input order.
        Issues of an unrecognised type are skipped and logged at debug level.
    """
    metrics: list[Metric] = []

    # `or` (rather than a .get default) also covers keys present but set to None.
    for issue in inspection.get("issues") or []:
        data = issue.get("data") or {}
        itype = issue.get("type", "unknown")

        if itype == "skill_failure_rate":
            skill_id = data.get("skill_id", "?")
            rate = data.get("failure_rate", 0)
            sample_size = data.get("sample_size", 0)
            limit = 0.10
            metrics.append(Metric(
                name=f"failure_rate:{skill_id}",
                name_ar=f"معدل الفشل: {skill_id}",
                value=rate,
                unit="rate",
                threshold=limit,
                exceeds_threshold=rate > limit,
                frequency=sample_size,
                # NOTE(review): 0.01 looks like an assumed USD cost per
                # failed call -- confirm against real billing data.
                estimated_cost_usd=rate * sample_size * 0.01,
                impact="high" if rate > 0.2 else "medium",
            ))
        elif itype == "expensive_workflow":
            workflow = data.get("workflow", "?")
            excess = data.get("cost_usd", 0) - data.get("expected_usd", 0)
            limit = 0.05
            metrics.append(Metric(
                name=f"cost_excess:{workflow}",
                name_ar=f"تكلفة زائدة: {workflow}",
                value=excess,
                unit="usd",
                threshold=limit,
                exceeds_threshold=excess > limit,
                # Projected monthly; the x100 multiplier presumably assumes
                # about 100 runs per month -- TODO confirm.
                estimated_cost_usd=excess * 100,
                impact="high" if excess > 0.10 else "medium",
            ))
        elif itype == "stale_knowledge":
            stale_count = data.get("stale_count", 0)
            limit = 5  # threshold is a page count, not a number of days
            metrics.append(Metric(
                name="stale_knowledge_pages",
                name_ar="صفحات معرفة قديمة",
                value=stale_count,
                unit="pages",
                threshold=limit,
                exceeds_threshold=stale_count > limit,
                impact="low",
            ))
        elif itype == "repeated_escalation":
            escalation_type = data.get("escalation_type", "?")
            weekly_count = data.get("weekly_count", 0)
            limit = 3
            metrics.append(Metric(
                name=f"repeated_escalation:{escalation_type}",
                name_ar=f"تصعيد متكرر: {escalation_type}",
                value=weekly_count,
                unit="per_week",
                threshold=limit,
                exceeds_threshold=weekly_count > limit,
                impact="medium",
            ))
        elif itype == "low_trust_calls":
            agent_id = data.get("agent_id", "?")
            unverified_rate = data.get("unverified_rate", 0)
            limit = 0.10
            metrics.append(Metric(
                name=f"unverified_rate:{agent_id}",
                name_ar=f"معدل غير متحقق: {agent_id}",
                value=unverified_rate,
                unit="rate",
                threshold=limit,
                exceeds_threshold=unverified_rate > limit,
                impact="medium",
            ))
        else:
            # Previously dropped without a trace; keep skipping but leave a breadcrumb.
            logger.debug("[SelfImprove] unknown issue type skipped: %s", itype)

    logger.info("[SelfImprove] قياس: %d مقاييس", len(metrics))
    return metrics
# -- Phase 3: Propose --------------------------------------------------
async def propose(self, metrics: list[Metric]) -> list[ImprovementProposal]:
    """Turn threshold-breaching metrics into bounded improvement proposals.

    Metrics are visited in descending ``estimated_cost_usd`` order, with ties
    broken by impact (high, then medium, then low). A metric yields no
    proposal when its ``exceeds_threshold`` flag is false or its name does
    not start with a recognised kind prefix. At most
    ``MAX_PROPOSALS_PER_CYCLE`` proposals are produced per call.

    Args:
        metrics: Measurements to evaluate.

    Returns:
        The proposals created by this call. They are also appended to
        ``self._proposals``, which is then trimmed to the newest
        ``self._max_proposals`` entries.
    """
    proposals: list[ImprovementProposal] = []
    impact_rank = {"high": 0, "medium": 1, "low": 2}

    ranked = sorted(
        metrics,
        key=lambda x: (-x.estimated_cost_usd, impact_rank.get(x.impact, len(impact_rank))),
    )

    for m in ranked:
        if len(proposals) >= self.MAX_PROPOSALS_PER_CYCLE:
            break
        if not m.exceeds_threshold:
            continue

        # Metric names look like "<kind>:<subject>". Split once so a subject
        # id that itself contains ':' is kept whole, and match on the prefix
        # so a subject containing another kind's keyword is not misrouted.
        subject = m.name.split(":", 1)[-1]

        if m.name.startswith("failure_rate"):
            proposals.append(ImprovementProposal(
                category="skill_fix",
                title=f"Fix high failure rate on {subject}",
                title_ar=f"إصلاح معدل الفشل المرتفع في {subject}",
                description=f"Failure rate {m.value:.1%} exceeds {m.threshold:.1%} threshold.",
                evidence=[f"Measured {m.frequency} calls, {m.value:.1%} failed"],
                impact=m.impact,
                effort="small",
                proposed_action="Add retry logic and improve error handling for the skill handler",
                requires_approval=True,
                rollback_action="Revert skill handler to previous version",
            ))
        elif m.name.startswith("cost_excess"):
            proposals.append(ImprovementProposal(
                category="cost_reduction",
                title=f"Reduce cost of {subject} workflow",
                title_ar=f"تقليل تكلفة سير عمل {subject}",
                description=f"Excess cost: ${m.value:.3f}/call (projected: ${m.estimated_cost_usd:.2f}/mo).",
                # The metric carries only the excess and the allowed excess;
                # the expected per-call cost is not available here, so it
                # must not be derived from the threshold.
                evidence=[f"Excess ${m.value:.3f}/call exceeds allowed excess ${m.threshold:.3f}"],
                impact=m.impact,
                effort="medium",
                proposed_action="Switch to cheaper model for simple content or add caching layer",
                requires_approval=True,
                rollback_action="Revert model routing configuration",
            ))
        elif m.name.startswith("stale_knowledge"):
            proposals.append(ImprovementProposal(
                category="knowledge_update",
                title="Refresh stale knowledge pages",
                title_ar="تحديث صفحات المعرفة القديمة",
                # The threshold on this metric is a page count, not an age in days.
                description=f"{int(m.value)} stale pages exceeds threshold of {int(m.threshold)}.",
                evidence=[f"{int(m.value)} pages not updated in 30+ days"],
                impact="low",
                effort="trivial",
                proposed_action="Flag stale pages for review and auto-mark as needs-update",
                requires_approval=False,
                rollback_action="Remove stale flags",
            ))
        elif m.name.startswith("repeated_escalation"):
            proposals.append(ImprovementProposal(
                category="quality",
                title=f"Automate resolution for {subject}",
                title_ar=f"أتمتة الحل لتصعيد {subject}",
                description=f"{int(m.value)} escalations/week exceeds threshold of {int(m.threshold)}.",
                evidence=[f"{int(m.value)} weekly occurrences"],
                impact="medium",
                effort="medium",
                proposed_action="Add auto-consent-renewal reminder workflow before expiry",
                requires_approval=True,
                rollback_action="Disable auto-reminder workflow",
            ))
        elif m.name.startswith("unverified_rate"):
            proposals.append(ImprovementProposal(
                category="quality",
                title=f"Improve verification for agent {subject}",
                title_ar=f"تحسين التحقق لوكيل {subject}",
                description=f"Unverified rate {m.value:.1%} exceeds {m.threshold:.1%}.",
                evidence=[f"{m.value:.1%} of tool calls unverified"],
                impact="medium",
                effort="small",
                proposed_action="Add post-execution verification step for this agent profile",
                requires_approval=False,
                rollback_action="Remove extra verification step",
            ))

    self._proposals.extend(proposals)
    # Keep only the newest entries so the in-memory history stays bounded.
    if len(self._proposals) > self._max_proposals:
        self._proposals = self._proposals[-self._max_proposals:]

    logger.info("[SelfImprove] اقتراحات: %d", len(proposals))
    return proposals
def _create_proposal(self, metric: Metric) -> Optional[ImprovementProposal]: # -- Phase 4: Verify ---------------------------------------------------
self._cycle_count += 1
pid = f"IMP-{self._cycle_count:04d}"
if metric.name == "knowledge_freshness" and metric.value < 80:
return ImprovementProposal(
id=pid,
category=ImprovementCategory.KNOWLEDGE_UPDATE,
title="Update stale wiki pages",
title_ar="تحديث صفحات الويكي القديمة",
description=f"Knowledge freshness at {metric.value}%, below 80% threshold",
evidence=[f"{metric.name}={metric.value}{metric.unit}"],
impact="medium",
effort="trivial",
proposed_action="Run knowledge_brain.lint() and update flagged pages",
requires_approval=False,
)
if metric.name == "avg_workflow_cost" and metric.value > 0.50:
return ImprovementProposal(
id=pid,
category=ImprovementCategory.COST_REDUCTION,
title="Optimize expensive workflows",
title_ar="تحسين سير العمل المكلف",
description=f"Average workflow cost ${metric.value}, above $0.50 threshold",
evidence=[f"{metric.name}=${metric.value}"],
impact="high",
effort="medium",
proposed_action="Switch to Groq for classification tasks, reduce prompt tokens",
requires_approval=True,
)
return None
async def verify(self, proposal: ImprovementProposal) -> bool: async def verify(self, proposal: ImprovementProposal) -> bool:
if proposal.effort == "trivial" and not proposal.requires_approval: """Can we test this safely? Does it have rollback?"""
return True if not proposal.rollback_action:
if proposal.category == ImprovementCategory.SECURITY: logger.warning("[SelfImprove] لا يوجد إجراء تراجع: %s", proposal.id)
return False # Security changes always need approval
return proposal.effort in ("trivial", "small")
async def apply(
self, proposal_id: str, approved_by: str = None
) -> bool:
proposal = next((p for p in self._proposals if p.id == proposal_id), None)
if not proposal:
return False return False
if proposal.effort == "large":
logger.info("[SelfImprove] جهد كبير يتطلب مراجعة يدوية: %s", proposal.id)
return False
return True
# -- Phase 5: Apply ----------------------------------------------------
async def apply(self, proposal: ImprovementProposal, approved_by: Optional[str] = None) -> bool:
"""Apply only if approved (or trivial + auto-allowed). Log everything."""
if proposal.status in ("applied", "rejected"):
logger.info("[SelfImprove] اقتراح بالفعل %s: %s", proposal.status, proposal.id)
return False
if proposal.requires_approval and not approved_by: if proposal.requires_approval and not approved_by:
logger.warning(f"Proposal {proposal_id} requires approval") logger.info("[SelfImprove] يتطلب موافقة: %s", proposal.id)
return False return False
proposal.status = ImprovementStatus.APPLIED
proposal.applied_at = datetime.now(timezone.utc) safe = await self.verify(proposal)
if not safe:
proposal.status = "rejected"
proposal.test_result = "Failed safety verification"
logger.warning("[SelfImprove] رفض: %s -- فشل التحقق من السلامة", proposal.id)
return False
proposal.status = "applied"
proposal.approved_by = approved_by or "auto" proposal.approved_by = approved_by or "auto"
logger.info(f"Self-improvement applied: {proposal.title}") proposal.applied_at = datetime.now(timezone.utc)
proposal.test_result = "Applied successfully"
logger.info(
"[SelfImprove] تطبيق: %s cat=%s by=%s",
proposal.id, proposal.category, proposal.approved_by,
)
return True return True
async def reject(self, proposal_id: str, reason: str = "") -> bool: # -- Approval ----------------------------------------------------------
proposal = next((p for p in self._proposals if p.id == proposal_id), None)
if not proposal:
return False
proposal.status = ImprovementStatus.REJECTED
logger.info(f"Self-improvement rejected: {proposal.title}{reason}")
return True
async def run_cycle(self, tenant_id: str = None) -> CycleResult: async def approve(self, proposal_id: str, user_id: str) -> Optional[ImprovementProposal]:
started_at = datetime.now(timezone.utc) """Approve a pending proposal."""
for p in self._proposals:
if p.id == proposal_id and p.status == "proposed":
p.status = "approved"
p.approved_by = user_id
await self.apply(p, approved_by=user_id)
return p
return None
async def reject(self, proposal_id: str, user_id: str) -> Optional[ImprovementProposal]:
    """Mark the first still-open proposal with the given id as rejected.

    A proposal qualifies when its id equals ``proposal_id`` and its status is
    "proposed" or "approved". The acting user is stored in ``approved_by``.

    Returns:
        The updated proposal, or ``None`` when no proposal qualified.
    """
    open_statuses = ("proposed", "approved")
    target = next(
        (
            candidate
            for candidate in self._proposals
            if candidate.id == proposal_id and candidate.status in open_statuses
        ),
        None,
    )
    if target is None:
        return None

    target.status = "rejected"
    target.approved_by = user_id
    return target
# -- Full cycle --------------------------------------------------------
async def run_cycle(self, tenant_id: Optional[str] = None) -> CycleResult:
"""Full cycle: inspect -> measure -> propose -> verify -> approve -> apply -> report."""
start = datetime.now(timezone.utc)
# 1. Inspect
inspection = await self.inspect(tenant_id) inspection = await self.inspect(tenant_id)
# 2. Measure
metrics = await self.measure(inspection) metrics = await self.measure(inspection)
# 3. Propose
proposals = await self.propose(metrics) proposals = await self.propose(metrics)
# 4+5. Auto-apply trivial proposals (up to MAX_AUTO_APPLY)
auto_applied = 0 auto_applied = 0
for proposal in proposals: for p in proposals:
can_verify = await self.verify(proposal) if auto_applied >= self.MAX_AUTO_APPLY:
if can_verify and not proposal.requires_approval: break
if auto_applied < self.MAX_AUTO_APPLY: if not p.requires_approval and p.effort == "trivial":
await self.apply(proposal.id) if await self.apply(p, approved_by="auto"):
auto_applied += 1 auto_applied += 1
awaiting = sum( pending = sum(1 for p in proposals if p.status == "proposed")
1 for p in proposals
if p.status == ImprovementStatus.PROPOSED now = datetime.now(timezone.utc)
cycle = CycleResult(
tenant_id=tenant_id or "global",
started_at=start,
completed_at=now,
duration_ms=int((now - start).total_seconds() * 1000),
issues_found=inspection["count"],
metrics_measured=len(metrics),
proposals_generated=len(proposals),
proposals_auto_applied=auto_applied,
proposals_pending=pending,
summary=(
f"Found {inspection['count']} issues, measured {len(metrics)} metrics, "
f"generated {len(proposals)} proposals, auto-applied {auto_applied}"
),
summary_ar=(
f"اكتشاف {inspection['count']} مشاكل، قياس {len(metrics)} مقاييس، "
f"إنشاء {len(proposals)} مقترحات، تطبيق تلقائي {auto_applied}"
),
) )
summary = ( self._cycles.append(cycle)
f"Cycle complete: {len(metrics)} metrics, {len(proposals)} proposals, " if len(self._cycles) > self._max_cycles:
f"{auto_applied} auto-applied, {awaiting} awaiting approval" self._cycles = self._cycles[-self._max_cycles:]
)
summary_ar = (
f"اكتملت الدورة: {len(metrics)} مقاييس، {len(proposals)} مقترحات، "
f"{auto_applied} تطبيق تلقائي، {awaiting} بانتظار الموافقة"
)
return CycleResult( logger.info(
cycle_id=f"CYCLE-{self._cycle_count}", "[SelfImprove] دورة اكتملت: issues=%d proposals=%d auto=%d pending=%d %dms",
inspected_areas=list(inspection.keys()), inspection["count"], len(proposals), auto_applied, pending, cycle.duration_ms,
metrics=metrics,
proposals=proposals,
auto_applied=auto_applied,
awaiting_approval=awaiting,
started_at=started_at,
completed_at=datetime.now(timezone.utc),
summary=summary,
summary_ar=summary_ar,
) )
return cycle
async def get_proposals( # -- Reporting ---------------------------------------------------------
self, status: ImprovementStatus = None
) -> list[ImprovementProposal]: async def report(self) -> dict[str, Any]:
"""Summary: proposals made, applied, rejected, impact measured."""
by_status: dict[str, int] = defaultdict(int)
by_category: dict[str, int] = defaultdict(int)
for p in self._proposals:
by_status[p.status] += 1
by_category[p.category] += 1
return {
"total_proposals": len(self._proposals),
"by_status": dict(by_status),
"by_category": dict(by_category),
"total_cycles": len(self._cycles),
"last_cycle": self._cycles[-1].model_dump(mode="json") if self._cycles else None,
"message_ar": (
f"مقترحات: {len(self._proposals)}، "
f"مطبقة: {by_status.get('applied', 0)}، "
f"مرفوضة: {by_status.get('rejected', 0)}، "
f"معلقة: {by_status.get('proposed', 0)}"
),
}
def list_proposals(self, status: Optional[str] = None) -> list[ImprovementProposal]:
"""List all proposals, optionally filtered by status."""
if status: if status:
return [p for p in self._proposals if p.status == status] return [p for p in self._proposals if p.status == status]
return self._proposals return list(self._proposals)
async def get_metrics_history(self) -> list[dict]: def get_proposal(self, proposal_id: str) -> Optional[ImprovementProposal]:
return self._metrics_history """Get a single proposal by ID."""
return next((p for p in self._proposals if p.id == proposal_id), None)
self_improvement = SelfImprovementEngine() # ---------------------------------------------------------------------------
# Module-level singleton
# ---------------------------------------------------------------------------
self_improvement_engine = SelfImprovementEngine()