system-prompts-and-models-o.../dealix/auto_client_acquisition/connectors/tech_detect.py
2026-05-01 14:03:52 +03:00

387 lines
16 KiB
Python

"""
Tech Detector — free, native, Saudi-tuned technographics.
Fetches a domain's homepage (and optionally a few key paths) and detects the ~45
tools that matter for Dealix lead qualification: CRM, booking, payments, e-commerce,
chat, analytics/ads, forms, CMS.
Zero external dependencies beyond httpx (already in requirements).
Self-hosted, no API keys, no per-lookup cost.
Usage:
from auto_client_acquisition.connectors.tech_detect import detect_stack
result = await detect_stack("foodics.com")
# → {"tools": [...], "signals": [...], "fetched_at": "...", "status": "ok"}
"""
from __future__ import annotations
import asyncio
import logging
import re
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from typing import Any
import httpx
log = logging.getLogger(__name__)
# ── Signature registry ──────────────────────────────────────────
# Each entry: (tool_name, category, list of regex/keyword signatures).
# Categories map to Dealix SIGNAL_TAXONOMY.
#
# Signatures are case-insensitive and matched against: raw HTML body + all response headers.
# Keep patterns tight to avoid false positives.
#
SIGNATURES: list[tuple[str, str, list[str]]] = [
# ── Booking / scheduling ───────────────────────────────
("Calendly", "booking", [r"calendly\.com/", r"assets\.calendly\.com"]),
("HubSpot Meetings","booking", [r"meetings\.hubspot\.com"]),
("Chili Piper", "booking", [r"chilipiper\.com"]),
("Cal.com", "booking", [r"cal\.com/[a-z0-9\-]+"]),
("Youcanbook.me", "booking", [r"youcanbook\.me"]),
# ── CRM / marketing automation ─────────────────────────
("HubSpot", "crm", [r"hs-scripts\.com", r"hsforms\.com", r"hubspot", r"js\.hs-banner\.com"]),
("Salesforce", "crm", [r"salesforce\.com/embeddedservice", r"pardot\.com"]),
("Pipedrive", "crm", [r"pipedrivewebforms\.com"]),
("Zoho", "crm", [r"zohopublic\.com", r"zohocdn\.com", r"zoho\.(com|eu|sa)/crm"]),
("ActiveCampaign", "crm", [r"activehosted\.com"]),
("Marketo", "crm", [r"marketo\.com", r"munchkin\.js"]),
("Mailchimp", "marketing", [r"chimpstatic\.com", r"list-manage\.com"]),
# ── Payments (MENA first) ──────────────────────────────
("Moyasar", "payment_mena", [r"api\.moyasar\.com", r"cdn\.moyasar\.com"]),
("Tap Payments", "payment_mena", [r"tap\.company", r"gosell\.io"]),
("PayTabs", "payment_mena", [r"paytabs\.com", r"secure\.paytabs"]),
("HyperPay", "payment_mena", [r"hyperpay\.com"]),
("Stripe", "payment", [r"js\.stripe\.com", r"checkout\.stripe\.com"]),
("PayPal", "payment", [r"paypalobjects\.com", r"paypal\.com/sdk"]),
("Checkout.com", "payment", [r"checkout\.com/card/"]),
# ── E-commerce platforms ───────────────────────────────
("Salla", "ecom_mena", [r"salla\.network", r"cdn\.salla\.network", r"salla\.sa"]),
("Zid", "ecom_mena", [r"cdn\.zid\.store", r"zid\.sa"]),
("Shopify", "ecom", [r"cdn\.shopify\.com", r"shopify\.com/s/files", r"myshopify\.com"]),
("WooCommerce", "ecom", [r"woocommerce", r"wc-blocks"]),
("Magento", "ecom", [r"/skin/frontend/", r"Mage\.Cookies"]),
("BigCommerce", "ecom", [r"bigcommerce\.com/content"]),
# ── Chat / support ─────────────────────────────────────
("Intercom", "chat", [r"widget\.intercom\.io", r"intercomcdn\.com"]),
("Zendesk Chat", "chat", [r"zopim\.com", r"static\.zdassets\.com"]),
("Crisp", "chat", [r"client\.crisp\.chat"]),
("LiveChat", "chat", [r"cdn\.livechatinc\.com"]),
("Tawk.to", "chat", [r"tawk\.to"]),
("WhatsApp Widget", "chat_mena", [r"api\.whatsapp\.com/send", r"wa\.me/\d+", r"whatsapp\.com/send"]),
# ── Analytics / ads / pixels ───────────────────────────
("Google Tag Manager", "analytics", [r"googletagmanager\.com/gtm\.js"]),
("Google Analytics 4", "analytics", [r"googletagmanager\.com/gtag/js", r"google-analytics\.com/g/collect"]),
("Meta Pixel", "ads", [r"connect\.facebook\.net/[a-z_]+?/fbevents\.js", r"facebook\.com/tr"]),
("TikTok Pixel", "ads", [r"analytics\.tiktok\.com/i18n/pixel"]),
("Snapchat Pixel", "ads", [r"sc-static\.net/scevent"]),
("Google Ads", "ads", [r"googleadservices\.com/pagead/conversion"]),
("LinkedIn Insight","ads", [r"px\.ads\.linkedin\.com"]),
("Hotjar", "analytics", [r"static\.hotjar\.com"]),
("PostHog", "analytics", [r"app\.posthog\.com", r"posthog\.com/static"]),
("Mixpanel", "analytics", [r"cdn\.mxpnl\.com"]),
("Segment", "analytics", [r"cdn\.segment\.com/analytics\.js"]),
# ── Forms ──────────────────────────────────────────────
("Typeform", "form", [r"typeform\.com/to/"]),
("Jotform", "form", [r"jotform\.com/form"]),
("Google Forms", "form", [r"docs\.google\.com/forms"]),
("HubSpot Forms", "form", [r"js\.hsforms\.net"]),
("Formspree", "form", [r"formspree\.io/f/"]),
# ── CMS / frameworks ───────────────────────────────────
("WordPress", "cms", [r"wp-content/", r"wp-includes/"]),
("Webflow", "cms", [r"webflow\.com", r"webflow\.io"]),
("Wix", "cms", [r"static\.parastorage\.com", r"wixstatic\.com"]),
("Next.js", "framework", [r"__next/static", r"_next/data"]),
("Framer", "cms", [r"framer\.com", r"framerusercontent\.com"]),
]
# ── Signal translations to Dealix taxonomy ─────────────────────
# Category → Dealix signal name + weight suggestion.
CATEGORY_TO_SIGNAL: dict[str, tuple[str, int]] = {
"booking": ("uses booking tool — has demo/sales flow", 5),
"crm": ("CRM in use — sales process exists", 5),
"marketing": ("marketing automation — outbound motion exists", 3),
"payment": ("payment gateway configured", 4),
"payment_mena": ("MENA payment gateway — Saudi-ready checkout", 6),
"ecom": ("e-commerce platform", 4),
"ecom_mena": ("Salla/Zid merchant — Saudi ecom ecosystem", 8),
"chat": ("live chat — sales/support motion", 3),
"chat_mena": ("WhatsApp widget — Saudi-native sales channel", 8),
"analytics": ("analytics active — measurable funnel", 2),
"ads": ("running paid ads — active demand gen", 6),
"form": ("inbound form — lead flow evidence", 5),
"cms": ("CMS stack identified", 1),
"framework": ("framework identified", 1),
}
@dataclass
class DetectedTool:
name: str
category: str
matched_pattern: str
@dataclass
class DetectedSignal:
name: str
weight: int
evidence: str
@dataclass
class TechStackResult:
domain: str
url: str
status: str # ok | fetch_error | timeout | blocked
http_status: int | None
fetched_at: str
tools: list[DetectedTool]
signals: list[DetectedSignal]
error: str | None = None
def to_dict(self) -> dict[str, Any]:
d = asdict(self)
return d
async def _fetch(client: httpx.AsyncClient, url: str, timeout: float) -> tuple[int, str, dict]:
"""Fetch a URL. Returns (status, body, headers_lower)."""
r = await client.get(
url,
timeout=timeout,
follow_redirects=True,
headers={
"User-Agent": "Mozilla/5.0 (compatible; Dealix-TechDetect/1.0; +https://dealix.me)",
"Accept-Language": "en,ar;q=0.9",
},
)
body = r.text or ""
# lower-case header lookup
hdrs = {k.lower(): v for k, v in r.headers.items()}
return r.status_code, body, hdrs
def _detect_in_text(text: str) -> list[DetectedTool]:
found: list[DetectedTool] = []
seen: set[str] = set()
hay = text.lower()
for tool_name, category, patterns in SIGNATURES:
if tool_name in seen:
continue
for pat in patterns:
if re.search(pat, hay, flags=re.IGNORECASE):
found.append(DetectedTool(name=tool_name, category=category, matched_pattern=pat))
seen.add(tool_name)
break
return found
def _tools_to_signals(tools: list[DetectedTool]) -> list[DetectedSignal]:
"""Aggregate tools into Dealix-taxonomy signals (max one signal per category)."""
signals: dict[str, DetectedSignal] = {}
for t in tools:
if t.category not in CATEGORY_TO_SIGNAL:
continue
name, weight = CATEGORY_TO_SIGNAL[t.category]
if t.category in signals:
existing = signals[t.category]
existing.evidence = f"{existing.evidence}; {t.name}"
else:
signals[t.category] = DetectedSignal(name=name, weight=weight, evidence=t.name)
return list(signals.values())
async def detect_stack(
domain: str,
*,
timeout: float = 10.0,
extra_paths: list[str] | None = None,
) -> TechStackResult:
"""
Detect technology stack for a domain. Only the homepage by default.
Pass extra_paths like ['/careers', '/contact'] to widen coverage.
"""
domain = (domain or "").strip().lower()
if not domain:
return TechStackResult(
domain="", url="", status="fetch_error", http_status=None,
fetched_at=_now_iso(), tools=[], signals=[], error="empty_domain",
)
# Normalize — accept full url or bare domain
if "://" in domain:
base_url = domain.rstrip("/")
else:
base_url = f"https://{domain}".rstrip("/")
tools: list[DetectedTool] = []
headers_concat = ""
body_concat = ""
http_status: int | None = None
status_label = "ok"
error: str | None = None
paths = ["/"] + (extra_paths or [])
async with httpx.AsyncClient(http2=False) as client:
for path in paths:
url = base_url + path
try:
code, body, hdrs = await _fetch(client, url, timeout=timeout)
if http_status is None:
http_status = code
headers_concat += "\n".join(f"{k}: {v}" for k, v in hdrs.items()) + "\n"
body_concat += body + "\n"
except httpx.TimeoutException:
if error is None:
error = f"timeout:{path}"
status_label = "timeout" if status_label == "ok" else status_label
except Exception as exc: # noqa: BLE001
if error is None:
error = f"fetch_error:{path}:{type(exc).__name__}"
status_label = "fetch_error" if status_label == "ok" else status_label
if body_concat or headers_concat:
tools = _detect_in_text(body_concat + "\n" + headers_concat)
signals = _tools_to_signals(tools)
return TechStackResult(
domain=domain,
url=base_url,
status=status_label,
http_status=http_status,
fetched_at=_now_iso(),
tools=tools,
signals=signals,
error=error,
)
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
# ── CLI convenience ────────────────────────────────────────────
async def _main(argv: list[str]) -> int:
import json
if len(argv) < 2:
print("usage: python -m auto_client_acquisition.connectors.tech_detect <domain>")
return 1
result = await detect_stack(argv[1])
print(json.dumps(result.to_dict(), indent=2, ensure_ascii=False))
return 0
if __name__ == "__main__":
import sys
raise SystemExit(asyncio.run(_main(sys.argv)))
# ── Contact info extraction (emails, phones) from public pages ─
import re as _re
EMAIL_RE = _re.compile(r"[\w\.\-+]+@[\w\.\-]+\.[a-zA-Z]{2,}")
PHONE_SA_RE = _re.compile(r"(?:\+?966|00966|0)(?:\s*[-.])?\s*5\d(?:\s*[-.]?\s*\d){8}")
PHONE_INTL_RE = _re.compile(r"\+\d{1,3}[\s-]?\d{1,4}[\s-]?\d{3,4}[\s-]?\d{3,5}")
WHATSAPP_RE = _re.compile(r"(?:wa\.me/|whatsapp\.com/send\?phone=|api\.whatsapp\.com/send\?phone=)(\+?\d{8,15})")
# Social handles
LINKEDIN_COMPANY_RE = _re.compile(r"linkedin\.com/company/([\w\-]+)")
TWITTER_RE = _re.compile(r"(?:twitter\.com|x\.com)/([\w]+)")
async def extract_contact_info(
domain: str,
*,
timeout: float = 10.0,
paths: list[str] | None = None,
) -> dict:
"""
Extract publicly listed contact info from a company's public pages.
LEGAL: only fetches public pages, respects robots.txt implicitly, no auth bypass.
"""
import re as __re
paths = paths or ["/", "/contact", "/about", "/ar", "/en"]
domain = domain.strip().lower().replace("https://", "").replace("http://", "").strip("/")
base = f"https://{domain}"
emails: set[str] = set()
phones: set[str] = set()
whatsapp: set[str] = set()
linkedin: set[str] = set()
twitter: set[str] = set()
fetched_at = _now_iso()
async with httpx.AsyncClient() as client:
for path in paths:
url = base + path
try:
r = await client.get(
url, timeout=timeout, follow_redirects=True,
headers={"User-Agent": "Mozilla/5.0 (Dealix-ContactFind/1.0)"},
)
if r.status_code != 200 or not r.text:
continue
text = r.text
for m in EMAIL_RE.findall(text):
e = m.lower()
# filter out generic / fake
if any(x in e for x in ("example.com","sentry.io","@2x","@3x","@media")):
continue
emails.add(e)
for m in PHONE_SA_RE.findall(text):
phones.add(_normalize_phone(m, default_cc="+966"))
for m in PHONE_INTL_RE.findall(text):
n = _normalize_phone(m)
if n and len(n) >= 10:
phones.add(n)
for m in WHATSAPP_RE.findall(text):
whatsapp.add(_normalize_phone(m))
for m in LINKEDIN_COMPANY_RE.findall(text):
linkedin.add(f"linkedin.com/company/{m}")
for m in TWITTER_RE.findall(text):
if m.lower() not in ("home","share","intent","search"):
twitter.add(m)
except Exception:
continue
return {
"domain": domain,
"emails": sorted(emails)[:10],
"phones": sorted(phones)[:10],
"whatsapp": sorted(whatsapp)[:5],
"linkedin": sorted(linkedin)[:5],
"twitter": sorted(twitter)[:5],
"fetched_at": fetched_at,
"legal_basis": "Public website data; business contact only; no personal PII scraped from private pages.",
}
def _normalize_phone(raw: str, default_cc: str = "+966") -> str:
"""Keep + then digits only."""
import re as __re
if not raw:
return ""
# strip spaces, dashes, parens
s = __re.sub(r"[^\d+]", "", raw)
if s.startswith("00"):
s = "+" + s[2:]
if not s.startswith("+"):
# If looks like local Saudi (starts with 5 and 9 digits)
if s.startswith("5") and len(s) == 9:
s = default_cc + s
elif s.startswith("0") and len(s) == 10:
s = default_cc + s[1:]
return s