# system-prompts-and-models-o.../dealix/auto_client_acquisition/pipelines/scoring.py

"""
Lead scoring + Data Quality scoring.

Both deterministic — no LLM required. Used by /leads/enrich/* + the data
ingestion pipeline. LLM scoring (qualitative) lives in agents/qualification.py.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any


@dataclass
class ScoreBreakdown:
    """
    Result of compute_lead_score: component scores, priority bucket,
    recommended outreach channel and a one-line explanation.
    """

    fit: float = 0.0  # ICP fit component (compute_lead_score caps it at 40)
    intent: float = 0.0  # intent component (compute_lead_score caps it at 30)
    urgency: float = 0.0  # currently filled with the same value as intent
    risk: float = 0.0  # risk points; reported next to total, not part of it
    total: float = 0.0  # fit + intent + accessibility + revenue potential
    priority: str = "P3"  # bucket "P0".."P3"; "P0" is the highest total
    recommended_channel: str | None = None  # None when outreach is blocked
    reason: str = ""  # human-readable summary of the components


# ── Data Quality (0..100) ──────────────────────────────────────────
# Points added when the matching evidence is present on an account.
# The bonuses sum to 80.
_DQ_BONUSES: dict[str, int] = {
    "has_domain": 12,
    "has_website": 6,
    "has_city": 8,
    "has_sector": 8,
    "has_source_url": 8,
    "has_email_or_phone": 12,
    "has_signal": 12,
    "has_place_id": 8,
    "low_dup_risk": 6,
}

# Negative weights: added to the score as-is, so they lower it.
# NOTE(review): "personal_only_contact" is not referenced by
# compute_data_quality in this module — confirm whether it is used elsewhere.
_DQ_PENALTIES: dict[str, int] = {
    "missing_source": -10,
    "personal_only_contact": -8,
    "no_allowed_use": -15,
    "opt_out": -100,
    "high_risk": -20,
}

# Public lookup table: bonuses first, then penalties (insertion order kept).
DQ_WEIGHTS: dict[str, int] = {**_DQ_BONUSES, **_DQ_PENALTIES}


def compute_data_quality(account: dict[str, Any]) -> tuple[float, list[str]]:
    """
    Score how complete and usable a normalized account dict is.

    Every satisfied check adds its DQ_WEIGHTS entry (positive or negative)
    to a running sum and records a short "+tag" / "-tag" in the reasons
    list, in the order the checks run. The sum is clamped to 0..100.

    Returns (score, reasons).

    NOTE(review): the "personal_only_contact" weight in DQ_WEIGHTS is not
    applied here; any email or phone counts as a contact.
    """
    tags: list[str] = []
    points = 0.0

    def apply(weight_key: str, tag: str) -> None:
        # Add one weight to the running sum and remember why.
        nonlocal points
        points += DQ_WEIGHTS[weight_key]
        tags.append(tag)

    # Plain "field is present and truthy" bonuses.
    identity_checks = (
        ("domain", "has_domain", "+domain"),
        ("website", "has_website", "+website"),
        ("city", "has_city", "+city"),
        ("sector", "has_sector", "+sector"),
    )
    for field, weight_key, tag in identity_checks:
        if account.get(field):
            apply(weight_key, tag)

    # Either a source URL or a named best source counts as provenance.
    if account.get("source_url") or account.get("best_source"):
        apply("has_source_url", "+source_url")

    reachable = bool(account.get("email")) or bool(account.get("phone"))
    if reachable:
        apply("has_email_or_phone", "+contact")

    enrichment_checks = (
        ("google_place_id", "has_place_id", "+place_id"),
        ("signals", "has_signal", "+signals"),
    )
    for field, weight_key, tag in enrichment_checks:
        if account.get(field):
            apply(weight_key, tag)

    # A missing or falsy source_count is treated as a single source.
    source_count = int(account.get("source_count") or 1)
    if source_count >= 2:
        apply("low_dup_risk", "+multi_source")

    # Penalties (the weights are negative).
    provenance = account.get("source_type") or account.get("best_source")
    if not provenance:
        apply("missing_source", "-no_source")
    if account.get("allowed_use") in (None, "", "unknown"):
        apply("no_allowed_use", "-no_allowed_use")
    if account.get("opt_out"):
        apply("opt_out", "-opt_out")
    risk_level = (account.get("risk_level") or "").lower()
    if risk_level == "high":
        apply("high_risk", "-high_risk")

    capped = min(100.0, points)
    return max(0.0, capped), tags


# ── Lead Score (0..100, P0..P3) ────────────────────────────────────
_HIGH_VALUE_SECTORS = frozenset({
    "saas", "fintech", "ecommerce", "real_estate", "real_estate_developer",
    "marketing_agency", "training_center", "consulting_firm", "accounting_firm",
    "law_firm", "logistics", "education",
    # Lead-driven hospitality + events: every inquiry = booking-value
    "events", "hospitality", "hotel", "wedding_hall", "event_venue",
    "tourism_agency",
})
_MEDIUM_VALUE_SECTORS = frozenset({
    "dental_clinic", "medical_clinic", "cosmetic_clinic",
    "restaurant", "retail_store", "fitness_gym", "salon_spa", "auto_dealer",
    "construction", "food_manufacturing", "retail",
})
# Stored case-folded; the lookup normalizes the account's city the same way.
_PRIORITY_CITIES = frozenset({
    "الرياض", "riyadh", "جدة", "jeddah", "الدمام", "dammam",
})
_INTENT_SIGNAL_TYPES = frozenset({"intent", "hire", "funding", "news"})
# Tech categories (CRM/booking/payments) that imply the account is already
# commerce-active.
_COMMERCE_TECH_CATEGORIES = frozenset({
    "booking", "crm", "ecom_mena", "payment_mena", "chat_mena",
})
_HIGH_INTENT_TOOLS = frozenset({"calendly", "hubspot", "salla", "zid"})


def _fit_points(account: dict[str, Any]) -> float:
    """Return the FIT component (0..40): sector tier + country + major city."""
    points = 0.0
    sector = (account.get("sector") or "").lower()
    if sector in _HIGH_VALUE_SECTORS:
        points += 25
    elif sector in _MEDIUM_VALUE_SECTORS:
        points += 18
    elif sector:
        points += 10
    # A missing country is treated as "SA".
    if (account.get("country") or "SA").upper() == "SA":
        points += 10
    # Normalize like sector/country so "RIYADH" or "Riyadh " still match.
    city = account.get("city")
    if isinstance(city, str) and city.strip().casefold() in _PRIORITY_CITIES:
        points += 5
    return min(40.0, points)


def _signal_confidence(signal: dict[str, Any]) -> Any:
    """Return a signal's confidence; a missing or None value counts as 0.5."""
    confidence = signal.get("confidence")
    return 0.5 if confidence is None else confidence


def _intent_points(
    signals: list[dict[str, Any]],
    technologies: list[dict[str, Any]],
) -> float:
    """Return the INTENT component (0..30) from signals and detected tech."""
    points = 0.0
    # Each intent-type signal is worth confidence * 5, capped at 15 overall.
    points += min(15, sum(
        _signal_confidence(s) * 5
        for s in signals
        if s.get("signal_type") in _INTENT_SIGNAL_TYPES
    ))
    categories = {t.get("category") for t in technologies}
    if categories & _COMMERCE_TECH_CATEGORIES:
        points += 10
    # `or ""` guards against a "name" key that is present but None.
    if any((t.get("name") or "").lower() in _HIGH_INTENT_TOOLS
           for t in technologies):
        points += 5
    return min(30.0, points)


def _access_points(account: dict[str, Any]) -> float:
    """Return the ACCESSIBILITY component (0..15) from available contact paths."""
    points = 0.0
    if account.get("phone"):
        points += 5
    if account.get("email"):
        points += 5
    if account.get("website") or account.get("domain"):
        points += 3
    if account.get("google_place_id"):
        points += 2
    return min(15.0, points)


def _revenue_points(account: dict[str, Any]) -> float:
    """Return the REVENUE POTENTIAL component (6..15) from `revenue_hint`."""
    hint = (account.get("revenue_hint") or "").lower()
    if "enterprise" in hint or "large" in hint:
        return 15.0
    if "mid" in hint or "growth" in hint:
        return 10.0
    if "smb" in hint or "small" in hint:
        return 6.0
    return 8.0  # default neutral


def _risk_points(account: dict[str, Any]) -> float:
    """Return risk points: high risk 20 + opt-out 30 + unknown allowed use 8."""
    points = 0.0
    if (account.get("risk_level") or "").lower() == "high":
        points += 20
    if account.get("opt_out"):
        points += 30
    allowed_use = account.get("allowed_use")
    if not allowed_use or allowed_use == "unknown":
        points += 8
    return points


def _priority_bucket(total: float) -> str:
    """Map a 0..100 total onto the P0..P3 priority buckets."""
    if total >= 80:
        return "P0"
    if total >= 65:
        return "P1"
    if total >= 45:
        return "P2"
    return "P3"


def _recommend_channel(
    account: dict[str, Any], intent: float, risk: float
) -> str | None:
    """Pick the outreach channel, or None when outreach is blocked."""
    if account.get("opt_out") or risk >= 30:
        return None
    if account.get("email") and intent >= 15:
        return "email_warm"
    if account.get("website"):
        return "website_form_or_phone_task"
    if account.get("phone"):
        return "phone_task"
    if account.get("google_place_id"):
        return "in_person_or_phone"
    return "needs_enrichment"


def compute_lead_score(
    account: dict[str, Any],
    *,
    signals: list[dict[str, Any]] | None = None,
    technologies: list[dict[str, Any]] | None = None,
) -> ScoreBreakdown:
    """
    Deterministic ICP score. Mirrors the Saudi B2B 100-point spec:
        Fit 40 + Intent 30 + Access 15 + Revenue 15 → priority bucket.

    Args:
        account: normalized account dict; every key is read with `.get`,
            so missing keys are allowed.
        signals: optional signal dicts; only `signal_type` and `confidence`
            are read.
        technologies: optional technology dicts; only `category` and `name`
            are read.

    Returns:
        ScoreBreakdown with fit, intent, urgency (same value as intent for
        now), risk, total, priority, recommended_channel and reason.

    Risk is reported separately: it is not subtracted from `total` and does
    not change `priority`. It only blocks the channel (None) on opt-out.
    """
    sig_list = signals or []
    tech_list = technologies or []

    fit = _fit_points(account)
    intent = _intent_points(sig_list, tech_list)
    access = _access_points(account)
    revenue = _revenue_points(account)
    total = fit + intent + access + revenue

    risk = _risk_points(account)
    priority = _priority_bucket(total)
    channel = _recommend_channel(account, intent, risk)

    reason = (
        f"fit={fit:.0f} intent={intent:.0f} access={access:.0f} rev={revenue:.0f} "
        f"risk={risk:.0f} → {priority} via {channel or 'BLOCKED'}"
    )

    return ScoreBreakdown(
        fit=fit, intent=intent, urgency=intent,  # urgency mirrors intent for now
        risk=risk, total=total, priority=priority,
        recommended_channel=channel, reason=reason,
    )