mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-18 15:29:36 +00:00
210 lines
7.9 KiB
Python
210 lines
7.9 KiB
Python
"""
|
|
Lead scoring + Data Quality scoring.
|
|
|
|
Both deterministic — no LLM required. Used by /leads/enrich/* + the data
|
|
ingestion pipeline. LLM scoring (qualitative) lives in agents/qualification.py.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
|
|
@dataclass
class ScoreBreakdown:
    """Outcome of a lead-scoring run: component scores plus the routing decision.

    Every field has a default, so a bare ``ScoreBreakdown()`` is an all-zero,
    lowest-priority ("P3") result with no channel and an empty reason.
    """

    # Component scores (compute_lead_score caps fit at 40 and intent at 30).
    fit: float = 0.0
    intent: float = 0.0
    # compute_lead_score currently fills this with the same value as `intent`.
    urgency: float = 0.0
    # Penalty points; compute_lead_score reports them here without
    # subtracting them from `total`.
    risk: float = 0.0
    # Sum of the fit, intent, accessibility and revenue components.
    total: float = 0.0

    # Priority bucket, "P0" (best) through "P3".
    priority: str = "P3"
    # Suggested outreach channel; None means outreach is blocked.
    recommended_channel: str | None = None
    # One-line, human-readable summary of how the score was reached.
    reason: str = ""
|
|
|
|
|
|
# ── Data Quality (0..100) ──────────────────────────────────────────
|
|
# Points awarded (or deducted) per data-quality rule, keyed by rule name.
DQ_WEIGHTS = {
    # Completeness bonuses — these add up to 80 when every one applies.
    "has_domain": 12,
    "has_website": 6,
    "has_city": 8,
    "has_sector": 8,
    "has_source_url": 8,
    "has_email_or_phone": 12,
    "has_signal": 12,
    "has_place_id": 8,
    "low_dup_risk": 6,
    # Penalties — stored as negative numbers so callers can simply add them.
    "missing_source": -10,
    # NOTE(review): no rule in the visible part of this module applies
    # "personal_only_contact" — confirm whether a check is missing.
    "personal_only_contact": -8,
    "no_allowed_use": -15,
    "opt_out": -100,
    "high_risk": -20,
}
|
|
|
|
|
|
def compute_data_quality(account: dict[str, Any]) -> tuple[float, list[str]]:
    """
    Rate how complete and usable a normalized account dict is.

    Every rule that holds contributes its DQ_WEIGHTS entry (penalties carry
    negative weights) and appends a short tag to the reasons list, in the
    order the rules are listed below.

    Returns (score, reasons); the score is clamped to the 0..100 range.
    """

    def has(field: str) -> bool:
        # A field counts as present only when its value is truthy.
        return bool(account.get(field))

    # (rule holds?, DQ_WEIGHTS key, reason tag) — order fixes the tag order.
    rules = (
        (has("domain"), "has_domain", "+domain"),
        (has("website"), "has_website", "+website"),
        (has("city"), "has_city", "+city"),
        (has("sector"), "has_sector", "+sector"),
        (has("source_url") or has("best_source"), "has_source_url", "+source_url"),
        (has("email") or has("phone"), "has_email_or_phone", "+contact"),
        (has("google_place_id"), "has_place_id", "+place_id"),
        (has("signals"), "has_signal", "+signals"),
        # A missing/zero source_count is treated as a single source.
        (int(account.get("source_count") or 1) >= 2, "low_dup_risk", "+multi_source"),
        # Penalties start here.
        (not (has("source_type") or has("best_source")), "missing_source", "-no_source"),
        (
            account.get("allowed_use") in (None, "", "unknown"),
            "no_allowed_use",
            "-no_allowed_use",
        ),
        (has("opt_out"), "opt_out", "-opt_out"),
        ((account.get("risk_level") or "").lower() == "high", "high_risk", "-high_risk"),
    )

    score = 0.0
    reasons: list[str] = []
    for holds, weight_key, tag in rules:
        if holds:
            score += DQ_WEIGHTS[weight_key]
            reasons.append(tag)

    return max(0.0, min(100.0, score)), reasons
|
|
|
|
|
|
# ── Lead Score (0..100, P0..P3) ────────────────────────────────────
|
|
# Sectors that earn the full 25 sector points.
_HIGH_VALUE_SECTORS = frozenset({
    "saas", "fintech", "ecommerce", "real_estate", "real_estate_developer",
    "marketing_agency", "training_center", "consulting_firm", "accounting_firm",
    "law_firm", "logistics", "education",
    # Lead-driven hospitality + events: every inquiry = booking-value
    "events", "hospitality", "hotel", "wedding_hall", "event_venue",
    "tourism_agency",
})
# Sectors that earn 18 sector points.
_MEDIUM_VALUE_SECTORS = frozenset({
    "dental_clinic", "medical_clinic", "cosmetic_clinic",
    "restaurant", "retail_store", "fitness_gym", "salon_spa", "auto_dealer",
    "construction", "food_manufacturing", "retail",
})
# Cities that earn the 5-point bonus; Latin spellings are stored case-folded.
_MAJOR_CITIES = frozenset({"الرياض", "riyadh", "جدة", "jeddah", "الدمام", "dammam"})
# Signal types that count towards intent.
_INTENT_SIGNAL_TYPES = frozenset({"intent", "hire", "funding", "news"})
# Tech categories (CRM/booking/payments) implying the account is commerce-active.
_COMMERCE_TECH_CATEGORIES = frozenset(
    {"booking", "crm", "ecom_mena", "payment_mena", "chat_mena"}
)
# Specific tools that earn an extra 5 intent points.
_HIGH_INTENT_TOOLS = frozenset({"calendly", "hubspot", "salla", "zid"})


def _fit_score(account: dict[str, Any]) -> float:
    """Return the FIT component (0..40): sector tier + country + major city."""
    fit = 0.0
    sector = (account.get("sector") or "").lower()
    if sector in _HIGH_VALUE_SECTORS:
        fit += 25
    elif sector in _MEDIUM_VALUE_SECTORS:
        fit += 18
    elif sector:
        # Any other non-empty sector still earns a baseline.
        fit += 10
    # A missing country is treated as "SA".
    if (account.get("country") or "SA").upper() == "SA":
        fit += 10
    # Compare case-insensitively and ignore surrounding whitespace so that
    # "RIYADH" or " Jeddah " earn the same bonus as "Riyadh" / "Jeddah".
    city = account.get("city")
    if isinstance(city, str) and city.strip().casefold() in _MAJOR_CITIES:
        fit += 5
    return min(40.0, fit)


def _signal_confidence(signal: dict[str, Any]) -> float:
    """Return a signal's confidence, using 0.5 when it is missing or None."""
    confidence = signal.get("confidence")
    return 0.5 if confidence is None else confidence


def _intent_score(
    signals: list[dict[str, Any]], technologies: list[dict[str, Any]]
) -> float:
    """Return the INTENT component (capped at 30) from signals and tech stack."""
    # Each qualifying signal adds confidence * 5; signals contribute at most 15.
    signal_points = sum(
        _signal_confidence(s) * 5
        for s in signals
        if s.get("signal_type") in _INTENT_SIGNAL_TYPES
    )
    intent = 0.0 + min(15, signal_points)
    categories = {t.get("category") for t in technologies}
    if categories & _COMMERCE_TECH_CATEGORIES:
        intent += 10
    # `or ""` guards against an explicit None name, which would otherwise
    # raise AttributeError on .lower().
    if any((t.get("name") or "").lower() in _HIGH_INTENT_TOOLS for t in technologies):
        intent += 5
    return min(30.0, intent)


def _access_score(account: dict[str, Any]) -> float:
    """Return the ACCESSIBILITY component (0..15) from available contact paths."""
    access = 0.0
    if account.get("phone"):
        access += 5
    if account.get("email"):
        access += 5
    if account.get("website") or account.get("domain"):
        access += 3
    if account.get("google_place_id"):
        access += 2
    return min(15.0, access)


def _revenue_score(account: dict[str, Any]) -> float:
    """Return the REVENUE POTENTIAL component (6..15) from the revenue hint."""
    hint = (account.get("revenue_hint") or "").lower()
    # Checked largest-first, so a hint mentioning several tiers gets the top one.
    if "enterprise" in hint or "large" in hint:
        return 15.0
    if "mid" in hint or "growth" in hint:
        return 10.0
    if "smb" in hint or "small" in hint:
        return 6.0
    return 8.0  # default neutral


def _risk_score(account: dict[str, Any]) -> float:
    """Return risk penalty points: high risk 20, opt-out 30, unknown use 8."""
    risk = 0.0
    if (account.get("risk_level") or "").lower() == "high":
        risk += 20
    if account.get("opt_out"):
        risk += 30
    allowed_use = account.get("allowed_use")
    if not allowed_use or allowed_use == "unknown":
        risk += 8
    return risk


def _priority_for(total: float) -> str:
    """Map a total score onto its priority bucket, "P0" (best) to "P3"."""
    if total >= 80:
        return "P0"
    if total >= 65:
        return "P1"
    if total >= 45:
        return "P2"
    return "P3"


def _recommend_channel(
    account: dict[str, Any], *, intent: float, risk: float
) -> str | None:
    """Pick the outreach channel, or None when outreach is blocked."""
    if account.get("opt_out") or risk >= 30:
        return None
    if account.get("email") and intent >= 15:
        return "email_warm"
    if account.get("website"):
        return "website_form_or_phone_task"
    if account.get("phone"):
        return "phone_task"
    if account.get("google_place_id"):
        return "in_person_or_phone"
    return "needs_enrichment"


def compute_lead_score(
    account: dict[str, Any],
    *,
    signals: list[dict[str, Any]] | None = None,
    technologies: list[dict[str, Any]] | None = None,
) -> ScoreBreakdown:
    """
    Deterministic ICP score. Mirrors the Saudi B2B 100-point spec:
    Fit 40 + Intent 30 + Access 15 + Revenue 15 → priority bucket.

    Args:
        account: Account dict; the keys read are sector, country, city, phone,
            email, website, domain, google_place_id, revenue_hint, risk_level,
            opt_out and allowed_use. Missing keys are tolerated.
        signals: Optional signal dicts; those whose ``signal_type`` is one of
            intent/hire/funding/news add ``confidence * 5`` intent points
            (confidence defaults to 0.5 when absent or None).
        technologies: Optional technology dicts, read via ``category`` and
            ``name``.

    Returns:
        A ScoreBreakdown. ``total`` is the sum of the four components;
        ``risk`` is reported alongside it and only influences the channel
        (an opted-out account gets ``recommended_channel=None``).
    """
    fit = _fit_score(account)
    intent = _intent_score(signals or [], technologies or [])
    access = _access_score(account)
    revenue = _revenue_score(account)
    total = fit + intent + access + revenue

    # Risk points can reach 58 (20 + 30 + 8).
    # NOTE(review): risk is not subtracted from `total`, so the priority
    # bucket ignores it — confirm this is intended.
    risk = _risk_score(account)

    priority = _priority_for(total)
    channel = _recommend_channel(account, intent=intent, risk=risk)

    reason = (
        f"fit={fit:.0f} intent={intent:.0f} access={access:.0f} rev={revenue:.0f} "
        f"risk={risk:.0f} → {priority} via {channel or 'BLOCKED'}"
    )

    return ScoreBreakdown(
        fit=fit, intent=intent, urgency=intent,  # urgency mirrors intent for now
        risk=risk, total=total, priority=priority,
        recommended_channel=channel, reason=reason,
    )
|