""" Saudi B2B Revenue Graph — pure data structures + similarity + propagation. The Revenue Graph is the company's defensive moat. Every interaction (signal detected, message sent, reply received, deal won/lost) updates the graph. New leads borrow probability estimates from similar past outcomes, so the system's accuracy compounds with usage. This module is pure-Python — persistence is layered on top via a Repository adapter (SQLAlchemy in production, dict in tests). """ from __future__ import annotations import math from dataclasses import dataclass, field from datetime import datetime from typing import Any # ── Node types ───────────────────────────────────────────────────── NODE_TYPES: tuple[str, ...] = ( "company", "contact", "signal", "channel", "message", "objection", "outcome", "sector", "city", "campaign", "playbook", ) @dataclass class GraphNode: """A node in the Saudi Revenue Graph.""" node_id: str node_type: str label: str properties: dict[str, Any] = field(default_factory=dict) last_updated: datetime | None = None # ── Edge types — directional, typed relationships ───────────────── EDGE_TYPES: tuple[str, ...] = ( "operates_in", # company -> sector / city "decides_at", # contact -> company "shows_signal", # company -> signal "received", # company -> message "responded_with", # company -> objection / outcome "engaged_via", # company -> channel "matches_playbook", # company -> playbook "similar_to", # company -> company "originated", # campaign -> message "led_to", # message -> outcome ) @dataclass class GraphEdge: """A typed edge between two nodes with a confidence weight.""" src_id: str dst_id: str edge_type: str weight: float = 1.0 properties: dict[str, Any] = field(default_factory=dict) last_updated: datetime | None = None # ── Outcome types — what we learn from each interaction ────────── OUTCOME_TYPES: tuple[str, ...] = ( "no_response", "negative_reply", "neutral_reply", "positive_reply", "meeting_booked", "demo_held", "proposal_sent", "deal_won", "deal_lost", "expansion", "churn", ) # ── Similarity scoring ──────────────────────────────────────────── @dataclass class CompanyVector: """Numeric representation of a company for similarity search.""" company_id: str sector: str | None = None city: str | None = None size_bucket: str | None = None # micro / small / mid / large has_website: bool = False has_booking_page: bool = False has_whatsapp_business: bool = False is_hiring: bool = False runs_ads: bool = False has_government_clients: bool = False arabic_first: bool = True multi_branch: bool = False revenue_estimate_sar: float = 0.0 def _categorical_match(a: str | None, b: str | None) -> float: if a is None or b is None: return 0.0 return 1.0 if a == b else 0.0 def _bool_match(a: bool, b: bool) -> float: return 1.0 if a == b else 0.0 def cosine_similarity(a: CompanyVector, b: CompanyVector) -> float: """ Hybrid similarity: - Sector + city + size match → 0.5 weight - Capability flags match → 0.4 weight - Revenue tier proximity → 0.1 weight Returns [0, 1] score. """ cat_score = ( _categorical_match(a.sector, b.sector) * 0.25 + _categorical_match(a.city, b.city) * 0.15 + _categorical_match(a.size_bucket, b.size_bucket) * 0.10 ) flag_pairs = [ (a.has_website, b.has_website), (a.has_booking_page, b.has_booking_page), (a.has_whatsapp_business, b.has_whatsapp_business), (a.is_hiring, b.is_hiring), (a.runs_ads, b.runs_ads), (a.has_government_clients, b.has_government_clients), (a.arabic_first, b.arabic_first), (a.multi_branch, b.multi_branch), ] flag_match = sum(_bool_match(x, y) for x, y in flag_pairs) / len(flag_pairs) flag_score = flag_match * 0.4 # Revenue proximity: closer => higher rev_score = 0.0 if a.revenue_estimate_sar > 0 and b.revenue_estimate_sar > 0: ratio = min(a.revenue_estimate_sar, b.revenue_estimate_sar) / max( a.revenue_estimate_sar, b.revenue_estimate_sar ) rev_score = ratio * 0.10 return round(cat_score + flag_score + rev_score, 4) def find_similar_companies( *, target: CompanyVector, candidates: list[CompanyVector], top_k: int = 5 ) -> list[tuple[CompanyVector, float]]: """Top-k similar companies by hybrid similarity.""" scored = [(c, cosine_similarity(target, c)) for c in candidates if c.company_id != target.company_id] scored.sort(key=lambda x: x[1], reverse=True) return scored[:top_k] # ── Outcome propagation — borrow stats from similar past wins ──── @dataclass class OutcomeStats: """Aggregated outcome distribution for a cohort.""" cohort_size: int reply_rate: float booking_rate: float win_rate: float avg_deal_size_sar: float median_cycle_days: float confidence: float # 0..1, scales with cohort size def aggregate_outcomes( outcomes: list[dict[str, Any]], min_cohort: int = 5 ) -> OutcomeStats | None: """ Aggregate outcome dicts into stats. Returns None if cohort too small — enforces statistical & privacy minimum. Each outcome dict expected: {responded: bool, booked: bool, won: bool, deal_size_sar: float, cycle_days: int} """ n = len(outcomes) if n < min_cohort: return None replied = sum(1 for o in outcomes if o.get("responded")) booked = sum(1 for o in outcomes if o.get("booked")) won = sum(1 for o in outcomes if o.get("won")) deal_sizes = [o.get("deal_size_sar", 0) for o in outcomes if o.get("won")] cycles = sorted(o.get("cycle_days", 0) for o in outcomes if o.get("cycle_days")) median = cycles[len(cycles) // 2] if cycles else 0.0 avg_deal = sum(deal_sizes) / len(deal_sizes) if deal_sizes else 0.0 # Confidence climbs with cohort size; logarithmic, plateaus around 100 confidence = min(1.0, math.log10(n + 1) / 2.0) return OutcomeStats( cohort_size=n, reply_rate=round(replied / n, 4), booking_rate=round(booked / n, 4), win_rate=round(won / n, 4), avg_deal_size_sar=round(avg_deal, 2), median_cycle_days=round(median, 1), confidence=round(confidence, 4), ) # ── Probability borrowing — predict for new lead from similar past ──── def predict_outcome_probabilities( *, target: CompanyVector, historical: list[tuple[CompanyVector, dict[str, Any]]], top_k: int = 10, min_cohort: int = 5, ) -> dict[str, float] | None: """ Predict reply/booking/win probabilities for a new lead by borrowing from the top-k most similar historical outcomes. historical: list of (vector, outcome_dict) tuples. """ similar = find_similar_companies( target=target, candidates=[v for v, _ in historical], top_k=top_k ) if not similar: return None # Build cohort of outcomes from the top-k similar companies sim_ids = {v.company_id for v, _ in similar} cohort_outcomes = [o for v, o in historical if v.company_id in sim_ids] stats = aggregate_outcomes(cohort_outcomes, min_cohort=min_cohort) if stats is None: return None return { "reply_probability": stats.reply_rate, "booking_probability": stats.booking_rate, "win_probability": stats.win_rate, "expected_deal_size_sar": stats.avg_deal_size_sar, "expected_cycle_days": stats.median_cycle_days, "cohort_size": float(stats.cohort_size), "confidence": stats.confidence, } # ── Next-best-action — graph-based recommendation ──────────────── @dataclass class NextBestAction: """A single recommended action with rationale.""" action: str # e.g. "send_whatsapp_template_v3" channel: str # whatsapp / email / linkedin / call rationale: str # human-readable explanation in Arabic expected_reply_lift: float # delta vs baseline confidence: float # 0..1 playbook_id: str | None = None def recommend_next_action( *, target: CompanyVector, last_outcome: str | None, days_since_last_touch: int, win_history: dict[str, OutcomeStats] | None = None, ) -> NextBestAction: """ Recommend next best action using simple decision tree on graph state. Production version would consult the full graph; this is the initial heuristic encoding of Saudi B2B best practice. """ # No prior touch yet if last_outcome is None: if target.has_whatsapp_business: return NextBestAction( action="open_whatsapp_with_arabic_personalization", channel="whatsapp", rationale=( "الشركة تستخدم WhatsApp Business — فتح المحادثة برسالة " "عربية مخصصة يرفع معدل الرد بنسبة 3× مقارنة بالإيميل البارد." ), expected_reply_lift=2.4, confidence=0.7, ) return NextBestAction( action="send_email_with_value_first_intro", channel="email", rationale="نبدأ بإيميل قصير يقدم قيمة محددة قبل أي طلب اجتماع.", expected_reply_lift=1.0, confidence=0.55, ) # Stalled — no response in >5 days if last_outcome == "no_response" and days_since_last_touch > 5: return NextBestAction( action="multi_channel_followup", channel="whatsapp", rationale=( "5+ أيام بدون رد — التحول لـ WhatsApp برسالة قصيرة + إعادة " "صياغة العرض بزاوية مختلفة (مثلاً ROI بدلاً من ميزات)." ), expected_reply_lift=1.6, confidence=0.62, ) # Negative reply — extract objection, route to library if last_outcome == "negative_reply": return NextBestAction( action="objection_handling_response", channel="whatsapp", rationale="رد سلبي — استخراج الاعتراض من المحتوى وتطبيق المسار المناسب من Objection Library.", expected_reply_lift=0.8, confidence=0.7, ) # Positive reply — accelerate to demo if last_outcome == "positive_reply": return NextBestAction( action="propose_demo_within_24h", channel="whatsapp", rationale="رد إيجابي — السرعة (≤24 ساعة) ترفع نسبة الحجز إلى 3.2× في B2B السعودي.", expected_reply_lift=3.2, confidence=0.85, ) # Fallback return NextBestAction( action="hold_and_review", channel="manual", rationale="حالة غير اعتيادية — يحتاج مراجعة بشرية قبل الإجراء التالي.", expected_reply_lift=0.0, confidence=0.4, ) # ── Public summary — what powers the in-product Insights panel ──── def graph_health_summary( *, n_companies: int, n_signals: int, n_messages: int, n_outcomes: int, n_won_deals: int, ) -> dict[str, Any]: """High-level health metrics for the Revenue Graph dashboard tile.""" learning_density = round(n_outcomes / n_companies, 2) if n_companies else 0 moat_score = min(100, int((n_outcomes * 0.4 + n_signals * 0.3 + n_won_deals * 5) / max(1, n_companies / 100))) return { "nodes": { "companies": n_companies, "signals": n_signals, "messages": n_messages, "outcomes": n_outcomes, "won_deals": n_won_deals, }, "learning_density": learning_density, "moat_score": moat_score, # higher = stronger competitive moat "ready_for_predictions": n_outcomes >= 50, }