system-prompts-and-models-o.../dealix/autonomous_growth/agents/enrichment.py
2026-05-01 14:03:52 +03:00

129 lines
4.7 KiB
Python

"""
Enrichment Agent — augments lead data with public info.
وكيل الإثراء — يثري بيانات العميل من مصادر عامة.
Note: production enrichment typically uses providers like Clearbit, Apollo,
or company-domain lookups. This agent provides:
1. Domain-based inference (guess sector / region from the email domain)
2. LLM-based inference from company name (best effort)
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from auto_client_acquisition.agents.intake import Lead
from core.agents.base import BaseAgent
from core.config.models import Task
from core.llm.base import Message
@dataclass
class EnrichmentData:
inferred_sector: str | None = None
inferred_size: str | None = None
inferred_region: str | None = None
website: str | None = None
linkedin_handle: str | None = None
notes: list[str] = field(default_factory=list)
confidence: float = 0.0 # 0-1
def to_dict(self) -> dict[str, Any]:
return {
"inferred_sector": self.inferred_sector,
"inferred_size": self.inferred_size,
"inferred_region": self.inferred_region,
"website": self.website,
"linkedin_handle": self.linkedin_handle,
"notes": self.notes,
"confidence": round(self.confidence, 2),
}
# Domain → sector hints (extend as needed).
# Keys are fragments of an email domain; values are the sector label that
# EnrichmentAgent.run stores in ``inferred_sector``.
# Keys starting with "." read as TLD-style suffixes, the rest as specific
# company domains.
# NOTE: run() walks the entries in insertion order and stops at the first key
# that matches, so put more specific entries before more general ones.
DOMAIN_HINTS: dict[str, str] = {
    ".edu.sa": "education",
    ".gov.sa": "government",
    ".med.sa": "healthcare",
    "aramco.com": "oil_gas",
    "sabic.com": "manufacturing",
    "stc.com.sa": "technology",
    "alrajhibank.com.sa": "finance",
    "saudiairlines.com": "tourism",
}
class EnrichmentAgent(BaseAgent):
    """Enrich a lead using heuristics plus optional LLM inference.

    Three sources are consulted, in order:

    1. the email domain (sector via ``DOMAIN_HINTS``, region via Saudi TLDs),
    2. the phone number's international dialling prefix (region),
    3. an LLM guess from the company name (sector), only when no sector was
       found from the domain and ``use_llm`` is true.

    The reported confidence is the highest confidence of any signal that
    fired.
    """

    name = "enrichment"

    # Sectors offered to the LLM. "other" is offered as well but is treated
    # as "unknown" and never stored.
    _LLM_SECTORS: tuple[str, ...] = (
        "technology",
        "real_estate",
        "healthcare",
        "education",
        "logistics",
        "retail",
        "finance",
        "manufacturing",
        "consulting",
        "construction",
        "oil_gas",
        "tourism",
    )
    # International dialling prefix -> region label.
    _PHONE_REGIONS: dict[str, str] = {
        "+966": "Saudi Arabia",
        "+971": "UAE",
    }
    # Email-domain suffixes treated as Saudi.
    _SAUDI_DOMAIN_SUFFIXES: tuple[str, ...] = (".sa", ".ksa")
    # Confidence used when the LLM omits or garbles its own confidence.
    _DEFAULT_LLM_CONFIDENCE = 0.3

    async def run(
        self,
        *,
        lead: Lead,
        use_llm: bool = True,
        **_: Any,
    ) -> EnrichmentData:
        """Infer sector, region and website for ``lead``.

        Args:
            lead: The lead to enrich; ``contact_email``, ``contact_phone``,
                ``company_name`` and ``id`` are read.
            use_llm: When false, skip the LLM step entirely.
            **_: Extra keyword arguments are accepted and ignored.

        Returns:
            An ``EnrichmentData`` with whatever could be inferred. LLM
            failures are logged and swallowed, so this is best-effort.
        """
        data = EnrichmentData()
        confidence = 0.0

        # 1. Email-domain hints
        domain = self._email_domain(lead.contact_email)
        if domain:
            data.website = f"https://{domain}"
            sector = self._sector_from_domain(domain)
            if sector:
                data.inferred_sector = sector
                confidence = max(confidence, 0.8)
                data.notes.append(f"Sector from domain: {domain} -> {sector}")
            if domain.endswith(self._SAUDI_DOMAIN_SUFFIXES):
                data.inferred_region = "Saudi Arabia"
                confidence = max(confidence, 0.7)

        # 2. Phone country hint (never overrides a region from the domain)
        region = self._region_from_phone(lead.contact_phone)
        if region:
            data.inferred_region = data.inferred_region or region
            confidence = max(confidence, 0.7)

        # 3. LLM inference from company name
        if use_llm and lead.company_name and not data.inferred_sector:
            guess = await self._sector_from_llm(lead.company_name)
            if guess is not None:
                llm_sector, llm_confidence = guess
                data.inferred_sector = llm_sector
                confidence = max(confidence, llm_confidence)
                data.notes.append(
                    f"Sector from name: {lead.company_name} -> {llm_sector}"
                )

        data.confidence = confidence
        self.log.info(
            "enriched",
            lead_id=lead.id,
            sector=data.inferred_sector,
            region=data.inferred_region,
            confidence=confidence,
        )
        return data

    @staticmethod
    def _email_domain(email: str | None) -> str | None:
        """Return the lower-cased domain part of ``email``, or ``None``."""
        if not email or "@" not in email:
            return None
        # Take everything after the LAST "@" so a stray "@" in the local
        # part cannot leak into the domain; drop whitespace and a trailing
        # root dot.
        domain = email.rpartition("@")[2].strip().lower().rstrip(".")
        return domain or None

    @staticmethod
    def _sector_from_domain(domain: str) -> str | None:
        """Return the sector of the first ``DOMAIN_HINTS`` key matching ``domain``.

        Matching is done on label boundaries rather than by substring:
        a key starting with "." must be a suffix of the domain, and any
        other key must equal the domain or be its parent domain. This stops
        ``notaramco.com`` or ``x.edu.sa.example.com`` from matching.
        """
        for hint, sector in DOMAIN_HINTS.items():
            if hint.startswith("."):
                matched = domain.endswith(hint)
            else:
                matched = domain == hint or domain.endswith("." + hint)
            if matched:
                return sector
        return None

    @classmethod
    def _region_from_phone(cls, phone: str | None) -> str | None:
        """Return the region implied by the dialling prefix of ``phone``."""
        if not phone:
            return None
        # Drop common separators, then normalise the "00" international
        # prefix to "+", so "00966 5..." and "+966-5..." are both recognised.
        compact = "".join(ch for ch in phone if ch not in " -().")
        if compact.startswith("00"):
            compact = "+" + compact[2:]
        for prefix, region in cls._PHONE_REGIONS.items():
            if compact.startswith(prefix):
                return region
        return None

    @classmethod
    def _coerce_confidence(cls, raw: Any) -> float:
        """Convert an LLM-supplied confidence to a float clamped to [0, 1]."""
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return cls._DEFAULT_LLM_CONFIDENCE
        if value != value:  # NaN compares unequal to itself
            return cls._DEFAULT_LLM_CONFIDENCE
        return min(max(value, 0.0), 1.0)

    async def _sector_from_llm(self, company_name: str) -> tuple[str, float] | None:
        """Ask the LLM for a sector guess for ``company_name``.

        Returns ``(sector, confidence)`` when the model names one of the
        offered sectors, otherwise ``None``. Any failure (router error,
        unparseable reply) is logged as a warning and yields ``None``.
        """
        prompt = (
            f"Given the Saudi/GCC company name '{company_name}', "
            f"infer the most likely sector from this list: "
            f"{', '.join(self._LLM_SECTORS)}, other. "
            f'Respond with JSON: {{"sector": str, "confidence": 0-1, "note": str}}. '
            f"If you don't know, say 'other' with low confidence."
        )
        try:
            response = await self.router.run(
                task=Task.CLASSIFICATION,
                messages=[Message(role="user", content=prompt)],
                max_tokens=200,
                temperature=0.1,
            )
            parsed = self.parse_json_response(response.content)
        except Exception as e:
            # Best-effort step: enrichment must not fail because the LLM did.
            self.log.warning("enrichment_llm_failed", error=str(e))
            return None

        if not isinstance(parsed, dict):
            return None
        sector = str(parsed.get("sector") or "").strip().lower()
        # Reject "other", empty answers and anything outside the offered
        # list, so free-form model output never ends up in inferred_sector.
        if sector not in self._LLM_SECTORS:
            return None
        llm_confidence = self._coerce_confidence(
            parsed.get("confidence", self._DEFAULT_LLM_CONFIDENCE)
        )
        return sector, llm_confidence