system-prompts-and-models-o.../dealix/auto_client_acquisition/pipelines/enrichment.py
2026-05-01 14:03:52 +03:00

183 lines
7.2 KiB
Python

"""
Full enrichment pipeline — composes provider chains.
Steps (each step degrades gracefully if its provider is missing):
1. Domain normalization
2. Optional search (e.g. Google CSE) to discover the homepage when no domain is known
3. Crawler fetch (Firecrawl → requests_bs4)
4. Tech detection (internal → +Wappalyzer)
5. Public contact extraction
6. Email intel (Hunter/Abstract — only if domain + key)
7. Lead scoring + DQ scoring
8. Channel recommendation
Returns a dict (with nested `score` and `data_quality` sub-dicts) for storage in lead_scores / signals.
"""
from __future__ import annotations
import logging
from typing import Any
from auto_client_acquisition.connectors.tech_detect import extract_contact_info
from auto_client_acquisition.pipelines.normalize import normalize_domain
from auto_client_acquisition.pipelines.scoring import (
ScoreBreakdown,
compute_data_quality,
compute_lead_score,
)
from auto_client_acquisition.providers.crawler import fetch_with_chain
from auto_client_acquisition.providers.email_intel import find_emails_with_chain
from auto_client_acquisition.providers.search import search_with_chain
from auto_client_acquisition.providers.tech import detect_with_chain
# Module-level logger named after this module; enrich_account uses it to
# report contact-extraction failures without aborting the pipeline.
log = logging.getLogger(__name__)
def _coerce_confidence(value: Any, default: float = 0.7) -> float:
    """Return *value* as a float, or *default* when it is missing or non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # None, non-numeric strings or other junk from a provider payload
        # must not abort the whole enrichment run.
        return default


async def enrich_account(
    account: dict[str, Any],
    *,
    enrichment_level: str = "standard",  # basic / standard / deep
) -> dict[str, Any]:
    """
    Enrich a single normalized account.

    Args:
        account: dict from pipelines.normalize.normalize_row OR an existing
            AccountRecord-shaped dict. Must have at minimum company_name.
            NOTE: when a domain is discovered through search, this dict is
            updated in place (``domain``, ``website``, ``best_source``).
        enrichment_level:
            basic    → tech detect + extract_contact_info
            standard → +crawler text + search for homepage if domain missing
            deep     → +email intel domain search
            Any other value behaves like ``basic``.

    Returns: dict with keys:
        account, domain, title, summary, technologies, signals, contacts,
        score, data_quality, recommended_channel, providers_used, status
    """
    domain = account.get("domain") or normalize_domain(account.get("website"))
    company = account.get("company_name") or ""
    providers_used: list[str] = []
    technologies: list[dict[str, Any]] = []
    signals: list[dict[str, Any]] = []
    contacts: list[dict[str, Any]] = []
    crawl_text: str = ""
    crawl_title: str = ""
    beyond_basic = enrichment_level in ("standard", "deep")

    # Step 1: discover homepage if no domain
    if not domain and beyond_basic and company:
        srch = await search_with_chain(f"{company} الموقع الرسمي", num=3, lang="ar")
        providers_used.append(f"search:{srch.provider}")
        if srch.status == "ok" and srch.data:
            # `or []` tolerates an explicit None under "results".
            for r in srch.data.get("results") or []:
                if not isinstance(r, dict):
                    continue
                d = normalize_domain(r.get("link"))
                # Take the first hit that is not a Facebook/LinkedIn page.
                if d and "facebook" not in d and "linkedin" not in d:
                    domain = d
                    account["domain"] = d
                    account["website"] = f"https://{d}"
                    account["best_source"] = srch.provider
                    break

    # Step 2: crawl homepage (text-only)
    if domain and beyond_basic:
        crawl = await fetch_with_chain(f"https://{domain}", timeout=10.0)
        providers_used.append(f"crawler:{crawl.provider}")
        if crawl.status == "ok" and crawl.data:
            # Cap the stored page text at 6000 characters.
            crawl_text = (crawl.data.get("text") or "")[:6000]
            crawl_title = crawl.data.get("title") or ""

    # Step 3: tech detection
    if domain:
        tech = await detect_with_chain(f"https://{domain}")
        providers_used.append(f"tech:{tech.provider}")
        if tech.status == "ok" and tech.data:
            # The payload may expose the list under either key.
            tools = tech.data.get("tools") or tech.data.get("technologies") or []
            for t in tools:
                if isinstance(t, dict):
                    technologies.append({
                        "name": t.get("name") or t.get("tool"),
                        "category": t.get("category"),
                        "source": tech.provider,
                    })
            for s in tech.data.get("signals") or []:
                if isinstance(s, dict):
                    signals.append({
                        "signal_type": "tech",
                        "signal_value": s.get("name") or s.get("description") or str(s),
                        # Missing or malformed confidence falls back to 0.7.
                        "confidence": _coerce_confidence(s.get("confidence")),
                        "source_url": f"https://{domain}",
                    })

    # Step 4: public contact extraction
    if domain:
        try:
            contact_info = await extract_contact_info(domain)
            providers_used.append("contact_extract:internal")
            # (contact type, key in contact_info) — order matches output order.
            for kind, key in (
                ("email", "emails"),
                ("phone", "phones"),
                ("whatsapp", "whatsapp"),
                ("linkedin", "linkedin"),
            ):
                # `or []` so a None entry does not drop the other contact types.
                for value in contact_info.get(key) or []:
                    contacts.append(
                        {"type": kind, "value": value, "source": "public_pages"}
                    )
        except Exception as exc:  # noqa: BLE001
            # Deliberate best-effort: a failed extraction must not fail the run.
            log.warning("contact_extract_failed domain=%s err=%s", domain, exc)

    # Step 5: email intel (deep only)
    if domain and enrichment_level == "deep":
        ei = await find_emails_with_chain(domain, limit=5)
        providers_used.append(f"email_intel:{ei.provider}")
        if ei.status == "ok" and ei.data:
            for em in ei.data.get("emails") or []:
                if isinstance(em, dict) and em.get("value"):
                    contacts.append({
                        "type": "email",
                        "value": em["value"],
                        "role": em.get("position"),
                        "name": (
                            f"{em.get('first_name', '')} {em.get('last_name', '')}"
                        ).strip() or None,
                        "source": ei.provider,
                    })

    # Step 6: scoring
    score: ScoreBreakdown = compute_lead_score(
        {**account, "signals": signals},
        signals=signals,
        technologies=technologies,
    )
    # Prefer contact details already on the account; otherwise use the first
    # email / phone found during enrichment.
    first_email = next((c["value"] for c in contacts if c["type"] == "email"), None)
    first_phone = next((c["value"] for c in contacts if c["type"] == "phone"), None)
    dq_score, dq_reasons = compute_data_quality({
        **account,
        "signals": signals,
        "email": account.get("email") or first_email,
        "phone": account.get("phone") or first_phone,
    })

    return {
        "account": account,
        "domain": domain,
        "title": crawl_title,
        "summary": crawl_text[:600],
        "technologies": technologies,
        "signals": signals,
        "contacts": contacts,
        "score": {
            "fit": score.fit,
            "intent": score.intent,
            "urgency": score.urgency,
            "risk": score.risk,
            "total": score.total,
            "priority": score.priority,
            "recommended_channel": score.recommended_channel,
            "reason": score.reason,
        },
        "data_quality": {"score": dq_score, "reasons": dq_reasons},
        "recommended_channel": score.recommended_channel,
        "providers_used": providers_used,
        "status": "ok",
    }