system-prompts-and-models-o.../dealix/auto_client_acquisition/pipelines/scoring.py
2026-05-01 14:03:52 +03:00

210 lines
7.9 KiB
Python

"""
Lead scoring + Data Quality scoring.
Both deterministic — no LLM required. Used by /leads/enrich/* + the data
ingestion pipeline. LLM scoring (qualitative) lives in agents/qualification.py.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
@dataclass
class ScoreBreakdown:
    """Container for the result of a lead-scoring pass.

    Every field has a default, so an empty instance represents an unscored
    lead in the lowest priority bucket ("P3") with no channel chosen.
    """

    # ── Component scores ──
    fit: float = 0.0      # ICP fit component
    intent: float = 0.0   # buying-intent component
    urgency: float = 0.0  # urgency component
    risk: float = 0.0     # accumulated risk penalty points

    # ── Aggregate + routing ──
    total: float = 0.0    # overall score
    priority: str = "P3"  # priority bucket label, "P0" (best) .. "P3"
    recommended_channel: str | None = None  # outreach channel; None = none chosen
    reason: str = ""      # human-readable summary of how the score was reached
# ── Data Quality (0..100) ──────────────────────────────────────────
# Signed point values consumed by compute_data_quality(). Positive entries
# reward a populated field; penalties are stored already negated, so every
# weight is simply added to the running score.
DQ_WEIGHTS = {
    "has_domain": 12,
    "has_website": 6,
    "has_city": 8,
    "has_sector": 8,
    "has_source_url": 8,
    "has_email_or_phone": 12,
    "has_signal": 12,
    "has_place_id": 8,
    "low_dup_risk": 6,
    # Negatives (subtracted)
    "missing_source": -10,
    # NOTE(review): compute_data_quality() has no rule that applies this
    # weight — confirm whether a personal-vs-business contact check is missing.
    "personal_only_contact": -8,
    "no_allowed_use": -15,
    "opt_out": -100,
    "high_risk": -20,
}


def _dq_source_count(account: dict[str, Any]) -> int:
    """Return the account's ``source_count`` as an int; missing/malformed → 1."""
    try:
        return int(account.get("source_count") or 1)
    except (TypeError, ValueError, OverflowError):
        # Ingested rows can carry junk such as "n/a" or NaN; treat those as a
        # single source rather than aborting the whole scoring pass.
        return 1


def compute_data_quality(account: dict[str, Any]) -> tuple[float, list[str]]:
    """
    Compute the Data Quality score of a normalized account dict.

    Each rule that matches adds its (signed) weight from ``DQ_WEIGHTS`` and
    records a short label. The sum is clamped to the 0..100 range; with the
    current weights the positive entries add up to at most 80.

    Args:
        account: Account fields. Keys read: ``domain``, ``website``, ``city``,
            ``sector``, ``source_url``, ``best_source``, ``email``, ``phone``,
            ``google_place_id``, ``signals``, ``source_count``,
            ``source_type``, ``allowed_use``, ``opt_out``, ``risk_level``.
            Missing keys are treated as empty. A ``source_count`` that cannot
            be converted to an int counts as 1.

    Returns:
        ``(score, reasons)`` where ``score`` is a float in 0..100 and
        ``reasons`` lists the labels of the rules that fired, in evaluation
        order (``+`` prefix for bonuses, ``-`` prefix for penalties).
    """
    has_source_ref = account.get("source_url") or account.get("best_source")
    has_contact = bool(account.get("email")) or bool(account.get("phone"))
    lacks_source = not account.get("source_type") and not account.get("best_source")
    lacks_allowed_use = account.get("allowed_use") in (None, "", "unknown")
    # str() keeps a non-string risk_level (e.g. a number) from raising.
    is_high_risk = str(account.get("risk_level") or "").lower() == "high"

    # (condition, DQ_WEIGHTS key, reason label) — order defines `reasons` order.
    rules = (
        (account.get("domain"), "has_domain", "+domain"),
        (account.get("website"), "has_website", "+website"),
        (account.get("city"), "has_city", "+city"),
        (account.get("sector"), "has_sector", "+sector"),
        (has_source_ref, "has_source_url", "+source_url"),
        (has_contact, "has_email_or_phone", "+contact"),
        (account.get("google_place_id"), "has_place_id", "+place_id"),
        (account.get("signals"), "has_signal", "+signals"),
        # Seen in two or more sources → lower duplicate/ghost-record risk.
        (_dq_source_count(account) >= 2, "low_dup_risk", "+multi_source"),
        (lacks_source, "missing_source", "-no_source"),
        (lacks_allowed_use, "no_allowed_use", "-no_allowed_use"),
        (account.get("opt_out"), "opt_out", "-opt_out"),
        (is_high_risk, "high_risk", "-high_risk"),
    )

    score = 0.0
    reasons: list[str] = []
    for matched, weight_key, label in rules:
        if matched:
            score += DQ_WEIGHTS[weight_key]
            reasons.append(label)

    return max(0.0, min(100.0, score)), reasons
# ── Lead Score (0..100, P0..P3) ────────────────────────────────────
# Lookup sets live at module level so they are built once, not per call.
_LEAD_HIGH_VALUE_SECTORS = frozenset({
    "saas", "fintech", "ecommerce", "real_estate", "real_estate_developer",
    "marketing_agency", "training_center", "consulting_firm", "accounting_firm",
    "law_firm", "logistics", "education",
    # Lead-driven hospitality + events: every inquiry = booking-value
    "events", "hospitality", "hotel", "wedding_hall", "event_venue",
    "tourism_agency",
})
_LEAD_MEDIUM_VALUE_SECTORS = frozenset({
    "dental_clinic", "medical_clinic", "cosmetic_clinic",
    "restaurant", "retail_store", "fitness_gym", "salon_spa", "auto_dealer",
    "construction", "food_manufacturing", "retail",
})
# Stored case-folded; Arabic spellings are unaffected by case folding.
_LEAD_MAJOR_CITIES = frozenset({
    "الرياض", "riyadh", "جدة", "jeddah", "الدمام", "dammam",
})
_LEAD_INTENT_SIGNAL_TYPES = frozenset({"intent", "hire", "funding", "news"})
_LEAD_COMMERCE_TECH_CATEGORIES = frozenset({
    "booking", "crm", "ecom_mena", "payment_mena", "chat_mena",
})
_LEAD_HIGH_INTENT_TOOLS = frozenset({"calendly", "hubspot", "salla", "zid"})


def _lead_fit_points(account: dict[str, Any]) -> float:
    """Score ICP fit (0..40) from sector tier, country and city."""
    sector = (account.get("sector") or "").lower()
    if sector in _LEAD_HIGH_VALUE_SECTORS:
        points = 25.0
    elif sector in _LEAD_MEDIUM_VALUE_SECTORS:
        points = 18.0
    elif sector:
        points = 10.0
    else:
        points = 0.0

    # A missing country is assumed to be Saudi Arabia.
    if (account.get("country") or "SA").upper() == "SA":
        points += 10

    # Compare case- and whitespace-insensitively so "RIYADH" or " Jeddah "
    # earn the bonus too; non-string values simply do not match.
    city = account.get("city")
    if isinstance(city, str) and city.strip().casefold() in _LEAD_MAJOR_CITIES:
        points += 5

    return min(40.0, points)


def _lead_intent_points(
    signals: list[dict[str, Any]],
    technologies: list[dict[str, Any]],
) -> float:
    """Score buying intent (0..30) from intent signals and detected tech."""
    signal_points = 0.0
    for signal in signals:
        if signal.get("signal_type") in _LEAD_INTENT_SIGNAL_TYPES:
            confidence = signal.get("confidence")
            # Missing *or* explicitly-None confidence counts as 0.5.
            signal_points += (0.5 if confidence is None else confidence) * 5
    points = 0.0 + min(15, signal_points)

    # Tech signals (CRM/booking/payments) imply they're already commerce-active
    categories = {tech.get("category") for tech in technologies}
    if categories & _LEAD_COMMERCE_TECH_CATEGORIES:
        points += 10
    if any(
        str(tech.get("name") or "").lower() in _LEAD_HIGH_INTENT_TOOLS
        for tech in technologies
    ):
        points += 5

    return min(30.0, points)


def _lead_access_points(account: dict[str, Any]) -> float:
    """Score reachability (0..15) from the contact fields that are present."""
    points = 0.0
    if account.get("phone"):
        points += 5
    if account.get("email"):
        points += 5
    if account.get("website") or account.get("domain"):
        points += 3
    if account.get("google_place_id"):
        points += 2
    return min(15.0, points)


def _lead_revenue_points(account: dict[str, Any]) -> float:
    """Score revenue potential (0..15) from the free-text ``revenue_hint``."""
    hint = (account.get("revenue_hint") or "").lower()
    if "enterprise" in hint or "large" in hint:
        return 15.0
    if "mid" in hint or "growth" in hint:
        return 10.0
    if "smb" in hint or "small" in hint:
        return 6.0
    return 8.0  # default neutral


def _lead_risk_points(account: dict[str, Any]) -> float:
    """Accumulate risk penalty points (0..58) for the account."""
    points = 0.0
    if str(account.get("risk_level") or "").lower() == "high":
        points += 20
    if account.get("opt_out"):
        points += 30
    allowed_use = account.get("allowed_use")
    if not allowed_use or allowed_use == "unknown":
        points += 8
    return points


def _lead_priority_bucket(total: float) -> str:
    """Map a total score to its priority bucket, "P0" (best) .. "P3"."""
    if total >= 80:
        return "P0"
    if total >= 65:
        return "P1"
    if total >= 45:
        return "P2"
    return "P3"


def _lead_channel(account: dict[str, Any], intent: float, risk: float) -> str | None:
    """Pick the outreach channel, or None when the lead must not be contacted."""
    if account.get("opt_out") or risk >= 30:
        return None
    if account.get("email") and intent >= 15:
        return "email_warm"
    if account.get("website"):
        return "website_form_or_phone_task"
    if account.get("phone"):
        return "phone_task"
    if account.get("google_place_id"):
        return "in_person_or_phone"
    return "needs_enrichment"


def compute_lead_score(
    account: dict[str, Any],
    *,
    signals: list[dict[str, Any]] | None = None,
    technologies: list[dict[str, Any]] | None = None,
) -> ScoreBreakdown:
    """
    Deterministic ICP score. Mirrors the Saudi B2B 100-point spec:
    Fit 40 + Intent 30 + Access 15 + Revenue 15 → priority bucket.

    Args:
        account: Normalized account dict. Keys read: ``sector``, ``country``,
            ``city``, ``phone``, ``email``, ``website``, ``domain``,
            ``google_place_id``, ``revenue_hint``, ``risk_level``,
            ``opt_out``, ``allowed_use``. Missing keys are treated as empty.
        signals: Signal dicts; only ``signal_type`` and ``confidence`` are
            read. ``None`` means no signals.
        technologies: Detected-technology dicts; only ``category`` and
            ``name`` are read. ``None`` means none detected.

    Returns:
        A ``ScoreBreakdown`` with ``total = fit + intent + access + revenue``,
        the priority bucket derived from ``total`` (>=80 P0, >=65 P1,
        >=45 P2, else P3), the recommended channel (``None`` when the account
        opted out) and a one-line ``reason`` summary. ``urgency`` currently
        mirrors ``intent``.

    NOTE(review): ``risk`` is reported but NOT subtracted from ``total``, so
    it does not influence ``priority``; it only blocks the channel. Confirm
    this is intended — an opted-out account can still land in P0.
    """
    sig_list = signals or []
    tech_list = technologies or []

    fit = _lead_fit_points(account)
    intent = _lead_intent_points(sig_list, tech_list)
    access = _lead_access_points(account)
    revenue = _lead_revenue_points(account)
    total = fit + intent + access + revenue

    risk = _lead_risk_points(account)
    priority = _lead_priority_bucket(total)
    channel = _lead_channel(account, intent, risk)

    reason = (
        f"fit={fit:.0f} intent={intent:.0f} access={access:.0f} rev={revenue:.0f} "
        f"risk={risk:.0f} → {priority} via {channel or 'BLOCKED'}"
    )
    return ScoreBreakdown(
        fit=fit, intent=intent, urgency=intent,  # urgency mirrors intent for now
        risk=risk, total=total, priority=priority,
        recommended_channel=channel, reason=reason,
    )