system-prompts-and-models-o.../dealix/scripts/dealix_prospect.py
2026-05-01 14:03:52 +03:00

180 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
Dealix Prospect CLI — run tech detection + lead scoring on any domain or CSV.
Usage:
# Single domain
python scripts/dealix_prospect.py foodics.com
# Bulk from CSV (CSV must have a 'domain' column)
python scripts/dealix_prospect.py --csv leads.csv --out enriched.csv
# Bulk via live Dealix API
python scripts/dealix_prospect.py --api https://web-dealix.up.railway.app foodics.com salla.sa
Works offline (no API keys) — uses the Dealix native tech detector.
Outputs: JSON to stdout, or CSV with added columns if --out given.
"""
from __future__ import annotations
import argparse
import asyncio
import csv
import json
import sys
from pathlib import Path
# Project root on path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
try:
from auto_client_acquisition.connectors.tech_detect import detect_stack
except ImportError as exc:
print(f"ERROR: {exc}. Run from project root or install project.", file=sys.stderr)
sys.exit(2)
async def detect_one(domain: str, timeout: float = 10.0) -> dict:
    """Run the local tech detector against one domain.

    Args:
        domain: Domain to inspect, passed to ``detect_stack`` unchanged.
        timeout: Timeout forwarded to ``detect_stack`` (presumably seconds —
            confirm against the detector).

    Returns:
        Whatever ``to_dict()`` of the detector's result produces.
    """
    # These two paths are handed to the detector in addition to the domain itself.
    additional_paths = ["/careers", "/about"]
    detection = await detect_stack(
        domain,
        timeout=timeout,
        extra_paths=additional_paths,
    )
    return detection.to_dict()
async def detect_api(domain: str, api_base: str) -> dict:
    """Enrich one domain through the Dealix HTTP API.

    POSTs ``{"domain": domain}`` to ``<api_base>/api/v1/prospect/enrich-tech``
    and returns the decoded JSON body.

    Args:
        domain: Domain to enrich.
        api_base: Base URL of the API; a trailing slash is tolerated.

    Returns:
        The parsed JSON response.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        httpx.HTTPError: On transport failures or timeouts.
    """
    # Imported here so httpx is only needed when this function is actually called.
    import httpx

    endpoint = f"{api_base.rstrip('/')}/api/v1/prospect/enrich-tech"
    async with httpx.AsyncClient() as client:
        response = await client.post(
            endpoint,
            json={"domain": domain},
            timeout=30.0,
        )
        # Without this check an error page is either reported as a successful
        # detection or blows up in .json() with an unhelpful decode error.
        response.raise_for_status()
        return response.json()
async def run_bulk(domains: list[str], concurrency: int, api_base: str | None) -> dict[str, dict]:
sem = asyncio.Semaphore(concurrency)
async def _one(d: str) -> tuple[str, dict]:
async with sem:
try:
if api_base:
res = await detect_api(d, api_base)
else:
res = await detect_one(d)
return d, res
except Exception as exc: # noqa: BLE001
return d, {"status": "error", "error": str(exc), "domain": d}
pairs = await asyncio.gather(*(_one(d) for d in domains))
return dict(pairs)
def _read_csv_domains(path: Path) -> list[tuple[int, dict]]:
with open(path, newline="") as f:
reader = csv.DictReader(f)
rows = list(reader)
if not rows:
return []
# Find domain column: exact 'domain', or first col containing 'domain'
fieldnames = list(rows[0].keys())
col = next(
(c for c in fieldnames if c.lower() == "domain"),
next((c for c in fieldnames if "domain" in c.lower()), None),
)
if not col:
print(f"ERROR: CSV has no 'domain' column. Columns: {fieldnames}", file=sys.stderr)
sys.exit(3)
out: list[tuple[int, dict]] = []
for i, row in enumerate(rows):
d = (row.get(col) or "").strip()
if d and "." in d:
out.append((i, row))
return out
def _flatten(detection: dict) -> dict:
tools = [t.get("name", "") for t in detection.get("tools", []) or []]
signals = [s.get("evidence", "") for s in detection.get("signals", []) or []]
return {
"tech_status": detection.get("status", ""),
"tech_tools": "; ".join(tools[:8]),
"tech_signals": "; ".join(signals[:5]),
"tech_tools_count": len(tools),
}
async def _main(args: argparse.Namespace) -> int:
api_base = args.api or None
# Source: CSV or positional args
if args.csv:
csv_path = Path(args.csv)
if not csv_path.exists():
print(f"ERROR: {csv_path} not found", file=sys.stderr)
return 3
pairs = _read_csv_domains(csv_path)
print(f"found {len(pairs)} domains in {csv_path}", file=sys.stderr)
domains = [p[1]["domain"] if "domain" in p[1] else next((v for k, v in p[1].items() if "domain" in k.lower()), "") for p in pairs]
domains = [d for d in domains if d]
else:
domains = args.domains
if not domains:
print("ERROR: no domains provided. Pass args or use --csv", file=sys.stderr)
return 1
print(f"enriching {len(domains)} domain(s) using {'Dealix API' if api_base else 'local detector'}...", file=sys.stderr)
results = await run_bulk(domains, concurrency=args.concurrency, api_base=api_base)
if args.csv and args.out:
# Re-read CSV and write augmented copy
with open(args.csv, newline="") as f:
reader = csv.DictReader(f)
original_rows = list(reader)
fieldnames = list(original_rows[0].keys())
new_fields = ["tech_status", "tech_tools", "tech_signals", "tech_tools_count"]
for nf in new_fields:
if nf not in fieldnames:
fieldnames.append(nf)
for row in original_rows:
d = row.get("domain") or next((v for k, v in row.items() if "domain" in k.lower()), None)
d = (d or "").strip()
res = results.get(d, {})
row.update(_flatten(res))
out_path = Path(args.out)
with open(out_path, "w", newline="") as f:
w = csv.DictWriter(f, fieldnames=fieldnames)
w.writeheader()
w.writerows(original_rows)
print(f"✓ wrote {out_path} ({len(original_rows)} rows)", file=sys.stderr)
return 0
# Default: JSON to stdout
if len(results) == 1:
_, v = next(iter(results.items()))
print(json.dumps(v, indent=2, ensure_ascii=False))
else:
print(json.dumps(results, indent=2, ensure_ascii=False))
return 0
def main():
ap = argparse.ArgumentParser(
prog="dealix-prospect",
description="Dealix Prospector CLI — tech detection + lead scoring for Saudi B2B",
)
ap.add_argument("domains", nargs="*", help="one or more domains (e.g. foodics.com salla.sa)")
ap.add_argument("--csv", help="CSV file with a 'domain' column to enrich")
ap.add_argument("--out", help="output CSV path (with --csv) — adds tech_status/tech_tools/tech_signals/tech_tools_count")
ap.add_argument("--api", help="use live Dealix API instead of local (e.g. https://web-dealix.up.railway.app)")
ap.add_argument("--concurrency", type=int, default=5, help="parallelism (1-10)")
args = ap.parse_args()
sys.exit(asyncio.run(_main(args)))
if __name__ == "__main__":
main()