mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-18 15:29:36 +00:00
387 lines
16 KiB
Python
387 lines
16 KiB
Python
"""
|
|
Tech Detector — free, native, Saudi-tuned technographics.

Fetches a domain's homepage (and optionally a few key paths) and detects the ~45
tools that matter for Dealix lead qualification: CRM, booking, payments, e-commerce,
chat, analytics/ads, forms, CMS.

Zero external dependencies beyond httpx (already in requirements).
Self-hosted, no API keys, no per-lookup cost.

Usage:
    from auto_client_acquisition.connectors.tech_detect import detect_stack
    result = await detect_stack("foodics.com")
    # → {"tools": [...], "signals": [...], "fetched_at": "...", "status": "ok"}
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, asdict
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
# ── Signature registry ──────────────────────────────────────────
|
|
# Each entry: (tool_name, category, list of regex/keyword signatures).
|
|
# Categories map to Dealix SIGNAL_TAXONOMY.
|
|
#
|
|
# Signatures are case-insensitive and matched against: raw HTML body + all response headers.
|
|
# Keep patterns tight to avoid false positives.
|
|
#
|
|
# Registry of (tool_name, category, regex signatures).  A tool is reported as
# soon as any one of its patterns is found in the fetched body/headers; the
# matcher applies them case-insensitively.
SIGNATURES: list[tuple[str, str, list[str]]] = [
    # ── Booking / scheduling ───────────────────────────────
    ("Calendly", "booking", [r"calendly\.com/", r"assets\.calendly\.com"]),
    ("HubSpot Meetings", "booking", [r"meetings\.hubspot\.com"]),
    ("Chili Piper", "booking", [r"chilipiper\.com"]),
    # The look-behind stops hosts that merely END in "cal.com"
    # (medical.com, local.com, ...) from being reported as Cal.com, while
    # still accepting sub-domains such as app.cal.com.
    ("Cal.com", "booking", [r"(?<![a-z0-9\-])cal\.com/[a-z0-9\-]+"]),
    ("Youcanbook.me", "booking", [r"youcanbook\.me"]),

    # ── CRM / marketing automation ─────────────────────────
    ("HubSpot", "crm", [r"hs-scripts\.com", r"hsforms\.com", r"hubspot", r"js\.hs-banner\.com"]),
    ("Salesforce", "crm", [r"salesforce\.com/embeddedservice", r"pardot\.com"]),
    ("Pipedrive", "crm", [r"pipedrivewebforms\.com"]),
    ("Zoho", "crm", [r"zohopublic\.com", r"zohocdn\.com", r"zoho\.(com|eu|sa)/crm"]),
    ("ActiveCampaign", "crm", [r"activehosted\.com"]),
    ("Marketo", "crm", [r"marketo\.com", r"munchkin\.js"]),
    ("Mailchimp", "marketing", [r"chimpstatic\.com", r"list-manage\.com"]),

    # ── Payments (MENA first) ──────────────────────────────
    ("Moyasar", "payment_mena", [r"api\.moyasar\.com", r"cdn\.moyasar\.com"]),
    ("Tap Payments", "payment_mena", [r"tap\.company", r"gosell\.io"]),
    ("PayTabs", "payment_mena", [r"paytabs\.com", r"secure\.paytabs"]),
    ("HyperPay", "payment_mena", [r"hyperpay\.com"]),
    ("Stripe", "payment", [r"js\.stripe\.com", r"checkout\.stripe\.com"]),
    ("PayPal", "payment", [r"paypalobjects\.com", r"paypal\.com/sdk"]),
    ("Checkout.com", "payment", [r"checkout\.com/card/"]),

    # ── E-commerce platforms ───────────────────────────────
    ("Salla", "ecom_mena", [r"salla\.network", r"cdn\.salla\.network", r"salla\.sa"]),
    ("Zid", "ecom_mena", [r"cdn\.zid\.store", r"zid\.sa"]),
    ("Shopify", "ecom", [r"cdn\.shopify\.com", r"shopify\.com/s/files", r"myshopify\.com"]),
    ("WooCommerce", "ecom", [r"woocommerce", r"wc-blocks"]),
    ("Magento", "ecom", [r"/skin/frontend/", r"Mage\.Cookies"]),
    ("BigCommerce", "ecom", [r"bigcommerce\.com/content"]),

    # ── Chat / support ─────────────────────────────────────
    ("Intercom", "chat", [r"widget\.intercom\.io", r"intercomcdn\.com"]),
    ("Zendesk Chat", "chat", [r"zopim\.com", r"static\.zdassets\.com"]),
    ("Crisp", "chat", [r"client\.crisp\.chat"]),
    ("LiveChat", "chat", [r"cdn\.livechatinc\.com"]),
    ("Tawk.to", "chat", [r"tawk\.to"]),
    (
        "WhatsApp Widget",
        "chat_mena",
        [r"api\.whatsapp\.com/send", r"wa\.me/\d+", r"whatsapp\.com/send"],
    ),

    # ── Analytics / ads / pixels ───────────────────────────
    ("Google Tag Manager", "analytics", [r"googletagmanager\.com/gtm\.js"]),
    (
        "Google Analytics 4",
        "analytics",
        [r"googletagmanager\.com/gtag/js", r"google-analytics\.com/g/collect"],
    ),
    # The look-ahead keeps the pixel endpoint (/tr?id=..., /tr/) but rejects
    # ordinary page links whose slug merely starts with "tr" (/travelco).
    (
        "Meta Pixel",
        "ads",
        [r"connect\.facebook\.net/[a-z_]+?/fbevents\.js", r"facebook\.com/tr(?![a-z0-9_.\-])"],
    ),
    ("TikTok Pixel", "ads", [r"analytics\.tiktok\.com/i18n/pixel"]),
    ("Snapchat Pixel", "ads", [r"sc-static\.net/scevent"]),
    ("Google Ads", "ads", [r"googleadservices\.com/pagead/conversion"]),
    ("LinkedIn Insight", "ads", [r"px\.ads\.linkedin\.com"]),
    ("Hotjar", "analytics", [r"static\.hotjar\.com"]),
    ("PostHog", "analytics", [r"app\.posthog\.com", r"posthog\.com/static"]),
    ("Mixpanel", "analytics", [r"cdn\.mxpnl\.com"]),
    ("Segment", "analytics", [r"cdn\.segment\.com/analytics\.js"]),

    # ── Forms ──────────────────────────────────────────────
    ("Typeform", "form", [r"typeform\.com/to/"]),
    ("Jotform", "form", [r"jotform\.com/form"]),
    ("Google Forms", "form", [r"docs\.google\.com/forms"]),
    ("HubSpot Forms", "form", [r"js\.hsforms\.net"]),
    ("Formspree", "form", [r"formspree\.io/f/"]),

    # ── CMS / frameworks ───────────────────────────────────
    ("WordPress", "cms", [r"wp-content/", r"wp-includes/"]),
    ("Webflow", "cms", [r"webflow\.com", r"webflow\.io"]),
    ("Wix", "cms", [r"static\.parastorage\.com", r"wixstatic\.com"]),
    # Next.js serves its bundles from "/_next/static/" (ONE underscore); the
    # previous "__next/static" pattern could not match those URLs.  The
    # single-underscore form still matches the old double-underscore text.
    ("Next.js", "framework", [r"_next/static", r"_next/data"]),
    ("Framer", "cms", [r"framer\.com", r"framerusercontent\.com"]),
]
|
|
|
|
# ── Signal translations to Dealix taxonomy ─────────────────────
|
|
# Category → Dealix signal name + weight suggestion.
|
|
# Maps a signature category to the (signal label, suggested weight) pair that
# is reported once per category, however many tools of that category matched.
CATEGORY_TO_SIGNAL: dict[str, tuple[str, int]] = {
    # sales / marketing motion
    "booking": ("uses booking tool — has demo/sales flow", 5),
    "crm": ("CRM in use — sales process exists", 5),
    "marketing": ("marketing automation — outbound motion exists", 3),
    # checkout
    "payment": ("payment gateway configured", 4),
    "payment_mena": ("MENA payment gateway — Saudi-ready checkout", 6),
    # storefront
    "ecom": ("e-commerce platform", 4),
    "ecom_mena": ("Salla/Zid merchant — Saudi ecom ecosystem", 8),
    # conversation channels
    "chat": ("live chat — sales/support motion", 3),
    "chat_mena": ("WhatsApp widget — Saudi-native sales channel", 8),
    # measurement and demand generation
    "analytics": ("analytics active — measurable funnel", 2),
    "ads": ("running paid ads — active demand gen", 6),
    "form": ("inbound form — lead flow evidence", 5),
    # site plumbing
    "cms": ("CMS stack identified", 1),
    "framework": ("framework identified", 1),
}
|
|
|
|
|
|
@dataclass
class DetectedTool:
    """A single tool recognised in the fetched page text."""

    # Tool name as listed in the signature registry.
    name: str
    # Registry category the tool belongs to.
    category: str
    # Source of the regex that produced the hit.
    matched_pattern: str
|
|
|
|
|
|
@dataclass
class DetectedSignal:
    """A category-level signal derived from one or more detected tools.

    Instances are intentionally mutable: ``evidence`` is extended in place
    when several tools of the same category are found.
    """

    # Human-readable signal label.
    name: str
    # Suggested scoring weight for the signal.
    weight: int
    # Names of the tools backing the signal, joined with "; ".
    evidence: str
|
|
|
|
|
|
@dataclass
class TechStackResult:
    """Outcome of one technology-detection run for a domain."""

    # Domain string as normalised by the detector.
    domain: str
    # Base URL the paths were fetched from.
    url: str
    # One of: ok | fetch_error | timeout | blocked
    status: str
    # HTTP status code of the first response received, if any.
    http_status: int | None
    # ISO-8601 timestamp of when the result was assembled.
    fetched_at: str
    # Tools matched in the fetched content.
    tools: list[DetectedTool]
    # Category-level signals aggregated from ``tools``.
    signals: list[DetectedSignal]
    # Short machine-readable description of the first failure, if any.
    error: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the result, including nested tools and signals, as plain dicts."""
        return asdict(self)
|
|
|
|
|
|
async def _fetch(client: httpx.AsyncClient, url: str, timeout: float) -> tuple[int, str, dict]:
    """GET ``url`` with the detector's request headers, following redirects.

    Returns:
        ``(status_code, body_text, headers)`` where the body is ``""`` when
        the response has no text and header names are lower-cased.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; Dealix-TechDetect/1.0; +https://dealix.me)",
        "Accept-Language": "en,ar;q=0.9",
    }
    response = await client.get(
        url,
        timeout=timeout,
        follow_redirects=True,
        headers=request_headers,
    )
    page_text = response.text or ""
    # Lower-case the names so callers can look headers up without caring
    # about the casing the server used.
    lowered: dict = {}
    for header_name, header_value in response.headers.items():
        lowered[header_name.lower()] = header_value
    return response.status_code, page_text, lowered
|
|
|
|
|
|
def _detect_in_text(text: str) -> list[DetectedTool]:
    """Scan ``text`` against the signature registry.

    Each tool is reported at most once, together with the first of its
    patterns that matched.  Results follow registry order.
    """
    haystack = text.lower()
    hits: list[DetectedTool] = []
    reported: set[str] = set()
    for tool_name, category, patterns in SIGNATURES:
        if tool_name in reported:
            continue
        # First matching pattern wins; later ones are not evaluated.
        winner = next(
            (pat for pat in patterns if re.search(pat, haystack, flags=re.IGNORECASE)),
            None,
        )
        if winner is None:
            continue
        hits.append(DetectedTool(name=tool_name, category=category, matched_pattern=winner))
        reported.add(tool_name)
    return hits
|
|
|
|
|
|
def _tools_to_signals(tools: list[DetectedTool]) -> list[DetectedSignal]:
    """Collapse detected tools into at most one signal per category.

    Tools whose category has no entry in ``CATEGORY_TO_SIGNAL`` are ignored.
    When several tools share a category, their names are appended to the
    existing signal's evidence, separated by "; ".
    """
    by_category: dict[str, DetectedSignal] = {}
    for tool in tools:
        mapping = CATEGORY_TO_SIGNAL.get(tool.category)
        if mapping is None:
            continue
        current = by_category.get(tool.category)
        if current is None:
            label, weight = mapping
            by_category[tool.category] = DetectedSignal(
                name=label, weight=weight, evidence=tool.name
            )
        else:
            current.evidence = f"{current.evidence}; {tool.name}"
    return list(by_category.values())
|
|
|
|
|
|
async def detect_stack(
    domain: str,
    *,
    timeout: float = 10.0,
    extra_paths: list[str] | None = None,
) -> TechStackResult:
    """Detect the technology stack of a domain.

    Only the homepage is fetched by default.  Pass ``extra_paths`` such as
    ``['/careers', '/contact']`` to widen coverage; entries without a leading
    slash are accepted and normalised.

    Args:
        domain: Bare domain (``"foodics.com"``) or full URL.  A bare domain
            is fetched over https.
        timeout: Per-request timeout in seconds.
        extra_paths: Additional paths to fetch after the homepage.

    Returns:
        A ``TechStackResult``.  ``status`` is ``"ok"`` unless a request timed
        out (``"timeout"``), failed otherwise (``"fetch_error"``), or the
        first response carried an access-denied style status code
        (``"blocked"``).  ``error`` describes the first failure only.
        Detection still runs on whatever content was retrieved.
    """
    # Status codes treated as "the site refused us" rather than a usable page.
    blocked_codes = (401, 403, 429)

    domain = (domain or "").strip().lower()
    if not domain:
        return TechStackResult(
            domain="", url="", status="fetch_error", http_status=None,
            fetched_at=_now_iso(), tools=[], signals=[], error="empty_domain",
        )

    # Normalize — accept full url or bare domain
    base_url = (domain if "://" in domain else f"https://{domain}").rstrip("/")

    # Every path must start with "/": gluing "careers" straight onto
    # "https://foo.com" would yield "https://foo.comcareers", i.e. a request
    # to a different host.
    paths = ["/"]
    for raw_path in extra_paths or []:
        path = (raw_path or "").strip()
        if not path:
            continue
        if not path.startswith("/"):
            path = "/" + path
        if path not in paths:
            paths.append(path)

    bodies: list[str] = []
    header_blobs: list[str] = []
    http_status: int | None = None
    status_label = "ok"
    error: str | None = None

    async with httpx.AsyncClient(http2=False) as client:
        for path in paths:
            url = base_url + path
            try:
                code, body, hdrs = await _fetch(client, url, timeout=timeout)
            except httpx.TimeoutException:
                log.debug("tech_detect: timeout fetching %s", url)
                if error is None:
                    error = f"timeout:{path}"
                if status_label == "ok":
                    status_label = "timeout"
                continue
            except Exception as exc:  # noqa: BLE001 — best effort, keep going
                log.debug("tech_detect: %s fetching %s", type(exc).__name__, url)
                if error is None:
                    error = f"fetch_error:{path}:{type(exc).__name__}"
                if status_label == "ok":
                    status_label = "fetch_error"
                continue

            # Report the status code of the first response we actually got.
            if http_status is None:
                http_status = code
            header_blobs.append("\n".join(f"{k}: {v}" for k, v in hdrs.items()) + "\n")
            bodies.append(body + "\n")

    if status_label == "ok" and http_status in blocked_codes:
        status_label = "blocked"
        if error is None:
            error = f"blocked:http_{http_status}"

    tools: list[DetectedTool] = []
    if bodies:
        # Signatures are matched against page bodies followed by headers.
        tools = _detect_in_text("".join(bodies) + "\n" + "".join(header_blobs))

    return TechStackResult(
        domain=domain,
        url=base_url,
        status=status_label,
        http_status=http_status,
        fetched_at=_now_iso(),
        tools=tools,
        signals=_tools_to_signals(tools),
        error=error,
    )
|
|
|
|
|
|
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
|
|
|
|
# ── CLI convenience ────────────────────────────────────────────
|
|
async def _main(argv: list[str]) -> int:
    """Command-line entry point: detect the stack for ``argv[1]`` and print it as JSON.

    Returns the process exit code: 0 after printing a result, 1 when no
    domain argument was supplied (a usage line is printed instead).
    """
    import json

    if len(argv) >= 2:
        outcome = await detect_stack(argv[1])
        rendered = json.dumps(outcome.to_dict(), indent=2, ensure_ascii=False)
        print(rendered)
        return 0

    print("usage: python -m auto_client_acquisition.connectors.tech_detect <domain>")
    return 1
|
|
|
|
|
|
# Script entry point.  NOTE: this guard sits above the contact-extraction
# helpers defined later in the file; that is harmless because the CLI only
# calls detect_stack, whose dependencies are all defined above this point.
if __name__ == "__main__":
    import sys

    sys.exit(asyncio.run(_main(sys.argv)))
|
|
|
|
|
|
# ── Contact info extraction (emails, phones) from public pages ─
|
|
import re as _re
|
|
|
|
# Loose e-mail matcher.  Asset names such as "logo@2x.png" also match; the
# caller filters those out.
EMAIL_RE = _re.compile(r"[\w\.\-+]+@[\w\.\-]+\.[a-zA-Z]{2,}")

# Saudi mobile numbers: optional +966 / 00966 / trunk-0 prefix followed by the
# 9-digit national number 5XXXXXXXX ("5", one digit, then 7 more), with
# optional spaces, dots or dashes between digits.  The previous pattern asked
# for 10 national digits and therefore never matched a valid number.  The
# look-arounds stop the pattern from matching inside longer digit runs
# (ids, timestamps).
PHONE_SA_RE = _re.compile(
    r"(?<!\d)(?:\+?966|00966|0)(?:\s*[-.])?\s*5\d(?:\s*[-.]?\s*\d){7}(?!\d)"
)

# Generic international number introduced by "+" and a 1-3 digit country code.
PHONE_INTL_RE = _re.compile(r"\+\d{1,3}[\s-]?\d{1,4}[\s-]?\d{3,4}[\s-]?\d{3,5}")

# WhatsApp click-to-chat links; group 1 is the phone number.
WHATSAPP_RE = _re.compile(
    r"(?:wa\.me/|whatsapp\.com/send\?phone=|api\.whatsapp\.com/send\?phone=)(\+?\d{8,15})"
)

# Social handles
LINKEDIN_COMPANY_RE = _re.compile(r"linkedin\.com/company/([\w\-]+)")

# The look-behind keeps hosts that merely END in "x.com" (wix.com,
# netflix.com, dropbox.com, ...) from being read as X/Twitter profile links.
TWITTER_RE = _re.compile(r"(?<![\w\-])(?:twitter\.com|x\.com)/([\w]+)")
|
|
|
|
|
|
async def extract_contact_info(
    domain: str,
    *,
    timeout: float = 10.0,
    paths: list[str] | None = None,
) -> dict:
    """Extract publicly listed contact info from a company's public pages.

    Fetches each path over https (no authentication) and collects e-mail
    addresses, phone numbers, WhatsApp numbers, LinkedIn company pages and
    Twitter/X handles found in the returned text.  Pages that fail to load or
    do not answer with HTTP 200 are skipped.

    NOTE: robots.txt is NOT consulted by this function; callers that must
    honour it have to check before calling.

    Args:
        domain: Bare domain or URL; an ``http(s)://`` prefix and surrounding
            slashes are stripped.
        timeout: Per-request timeout in seconds.
        paths: Paths to fetch; defaults to ``/``, ``/contact``, ``/about``,
            ``/ar`` and ``/en``.  A missing leading slash is added.

    Returns:
        A dict with the keys ``domain``, ``emails``, ``phones``, ``whatsapp``,
        ``linkedin``, ``twitter``, ``fetched_at`` and ``legal_basis``.  Each
        list is sorted and capped (10 for emails/phones, 5 for the rest).
    """
    paths = paths or ["/", "/contact", "/about", "/ar", "/en"]
    domain = (
        (domain or "").strip().lower()
        .replace("https://", "").replace("http://", "").strip("/")
    )
    base = f"https://{domain}"

    emails: set[str] = set()
    phones: set[str] = set()
    whatsapp: set[str] = set()
    linkedin: set[str] = set()
    twitter: set[str] = set()
    fetched_at = _now_iso()

    # Substrings marking e-mail-shaped matches that are not real addresses
    # (placeholders, retina asset names, CSS at-rules).
    junk_email_markers = ("example.com", "sentry.io", "@2x", "@3x", "@media")
    # Twitter/X paths that are site features, not account handles.
    non_handles = ("home", "share", "intent", "search")

    # With no domain there is nothing to fetch; fall through to an empty result.
    if domain:
        async with httpx.AsyncClient() as client:
            for path in paths:
                # A path without a leading "/" would otherwise be glued onto
                # the host name and hit a different domain.
                url = base + (path if path.startswith("/") else f"/{path}")
                try:
                    r = await client.get(
                        url, timeout=timeout, follow_redirects=True,
                        headers={"User-Agent": "Mozilla/5.0 (Dealix-ContactFind/1.0)"},
                    )
                    text = r.text if r.status_code == 200 else ""
                except Exception as exc:  # noqa: BLE001 — best effort per page
                    log.debug("contact_info: %s fetching %s", type(exc).__name__, url)
                    continue
                if not text:
                    continue

                for m in EMAIL_RE.findall(text):
                    e = m.lower()
                    # filter out generic / fake
                    if any(marker in e for marker in junk_email_markers):
                        continue
                    emails.add(e)
                for m in PHONE_SA_RE.findall(text):
                    phones.add(_normalize_phone(m, default_cc="+966"))
                for m in PHONE_INTL_RE.findall(text):
                    n = _normalize_phone(m)
                    if n and len(n) >= 10:
                        phones.add(n)
                for m in WHATSAPP_RE.findall(text):
                    whatsapp.add(_normalize_phone(m))
                for m in LINKEDIN_COMPANY_RE.findall(text):
                    linkedin.add(f"linkedin.com/company/{m}")
                for m in TWITTER_RE.findall(text):
                    if m.lower() not in non_handles:
                        twitter.add(m)

    return {
        "domain": domain,
        "emails": sorted(emails)[:10],
        "phones": sorted(phones)[:10],
        "whatsapp": sorted(whatsapp)[:5],
        "linkedin": sorted(linkedin)[:5],
        "twitter": sorted(twitter)[:5],
        "fetched_at": fetched_at,
        "legal_basis": "Public website data; business contact only; no personal PII scraped from private pages.",
    }
|
|
|
|
|
|
def _normalize_phone(raw: str, default_cc: str = "+966") -> str:
    """Normalise a phone number to an optional leading "+" followed by digits.

    Rules, applied in order:
      * a number written with a leading "+" keeps it; any other "+" is dropped;
      * a leading "00" (international dialling prefix) becomes "+";
      * a 9-digit number starting with "5" gets ``default_cc`` prepended;
      * a 10-digit number starting with "0" has the trunk "0" replaced by
        ``default_cc``;
      * a number that already starts with the digits of ``default_cc``
        followed by 9 more digits gets the missing "+", so "966501234567"
        and "+966501234567" normalise to the same value;
      * anything else is returned as bare digits.

    Args:
        raw: Phone number as found in page text or a link.
        default_cc: Country code (with "+") assumed for local-format numbers.

    Returns:
        The normalised number, or ``""`` when ``raw`` is empty or holds no
        digits.
    """
    if not raw:
        return ""

    # strip spaces, dashes, parens
    compact = re.sub(r"[^\d+]", "", raw)
    digits = compact.replace("+", "")
    if not digits:
        return ""
    if compact.startswith("+"):
        return "+" + digits

    if digits.startswith("00"):
        return "+" + digits[2:]
    # Local Saudi mobile without the trunk zero (5XXXXXXXX).
    if digits.startswith("5") and len(digits) == 9:
        return default_cc + digits
    # Local format with the trunk zero (05XXXXXXXX).
    if digits.startswith("0") and len(digits) == 10:
        return default_cc + digits[1:]
    # Country code present but written without "+", as in wa.me links.
    cc_digits = default_cc.lstrip("+")
    if cc_digits and digits.startswith(cc_digits) and len(digits) == len(cc_digits) + 9:
        return "+" + digits
    return digits
|