# system-prompts-and-models-o.../dealix/scripts/dealix_prospect.py

#!/usr/bin/env python3
"""
Dealix Prospect CLI — run tech detection + lead scoring on any domain or CSV.

Usage:
  # Single domain
  python scripts/dealix_prospect.py foodics.com

  # Bulk from CSV (needs a 'domain' column, or a column whose name contains 'domain')
  python scripts/dealix_prospect.py --csv leads.csv --out enriched.csv

  # Bulk via live Dealix API
  python scripts/dealix_prospect.py --api https://web-dealix.up.railway.app foodics.com salla.sa

Works offline (no API keys) — uses the Dealix native tech detector.
Outputs: JSON to stdout, or an enriched copy of the CSV (extra tech_* columns) when both --csv and --out are given.
"""
from __future__ import annotations

import argparse
import asyncio
import csv
import json
import sys
from pathlib import Path

# Put the project root on sys.path. ROOT is the directory two levels above this
# file's resolved path, i.e. the parent of the directory containing this script.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

# Import the local detector eagerly. If the project package cannot be imported,
# print the reason to stderr and exit with status 2 instead of a traceback.
try:
    from auto_client_acquisition.connectors.tech_detect import detect_stack
except ImportError as exc:
    print(f"ERROR: {exc}. Run from project root or install project.", file=sys.stderr)
    sys.exit(2)


async def detect_one(domain: str, timeout: float = 10.0) -> dict:
    """Run the local ``detect_stack`` detector on one domain.

    Args:
        domain: Domain to inspect, passed through to ``detect_stack`` unchanged.
        timeout: Timeout value forwarded to ``detect_stack``.

    Returns:
        Whatever ``to_dict()`` of the detector's result object produces.
    """
    # Besides the detector's own defaults, also ask it to look at these paths.
    probe_paths = ["/careers", "/about"]
    detection = await detect_stack(domain, timeout=timeout, extra_paths=probe_paths)
    return detection.to_dict()


async def detect_api(domain: str, api_base: str) -> dict:
    """Ask a remote Dealix API to run tech enrichment for one domain.

    Sends ``POST {api_base}/api/v1/prospect/enrich-tech`` with the JSON body
    ``{"domain": domain}`` and a 30 second timeout.

    Args:
        domain: Domain to enrich.
        api_base: Base URL of the API; a trailing slash is tolerated.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
        httpx.HTTPError: Any other transport-level failure (timeout, DNS, ...).
    """
    # Imported lazily so the local (non-API) mode does not require httpx.
    import httpx

    url = f"{api_base.rstrip('/')}/api/v1/prospect/enrich-tech"
    async with httpx.AsyncClient() as client:
        response = await client.post(url, json={"domain": domain}, timeout=30.0)
        # Without this check an error payload (e.g. a 422/500 JSON body) would be
        # returned to the caller as if it were a detection result.
        response.raise_for_status()
        return response.json()


async def run_bulk(domains: list[str], concurrency: int, api_base: str | None) -> dict[str, dict]:
    """Enrich many domains concurrently and collect the results by domain.

    Args:
        domains: Domains to process. Repeated entries are processed once.
        concurrency: Maximum number of detections in flight; values below 1
            are treated as 1.
        api_base: When truthy, each domain is sent to ``detect_api`` with this
            base URL; otherwise ``detect_one`` runs the local detector.

    Returns:
        A dict mapping each distinct domain (in first-seen order) to its
        detection dict. A domain whose detection raised is mapped to
        ``{"status": "error", "error": <message>, "domain": <domain>}``
        instead, so one failure never aborts the whole batch.
    """
    # Semaphore(0) would park every worker forever and a negative value raises
    # ValueError, so never let the limit drop below one.
    sem = asyncio.Semaphore(max(1, concurrency))

    async def _one(d: str) -> tuple[str, dict]:
        async with sem:
            try:
                if api_base:
                    res = await detect_api(d, api_base)
                else:
                    res = await detect_one(d)
                return d, res
            except Exception as exc:  # noqa: BLE001 - per-domain best effort
                return d, {"status": "error", "error": str(exc), "domain": d}

    # The result is keyed by domain, so probing a repeated domain again would
    # only waste a request; dict.fromkeys keeps first-seen order.
    unique_domains = list(dict.fromkeys(domains))
    pairs = await asyncio.gather(*(_one(d) for d in unique_domains))
    return dict(pairs)


def _read_csv_domains(path: Path) -> list[tuple[int, dict]]:
    """Load a CSV and return the rows that carry a plausible domain.

    The domain column is the one named exactly 'domain' (case-insensitive),
    or failing that the first column whose name contains 'domain'.

    Args:
        path: CSV file to read; the first line is treated as the header.

    Returns:
        ``(row_index, row_dict)`` pairs, where ``row_index`` is the 0-based
        position among the data rows. Only rows whose domain cell, after
        stripping whitespace, is non-empty and contains a dot are included.
        An empty list is returned when the file has no data rows.

    Raises:
        SystemExit: With status 3 when no domain column can be found.
    """
    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        # Take the names from the header itself. A row's keys may contain the
        # DictReader overflow key (None) when that row has more cells than the
        # header, and calling .lower() on it would crash the column search.
        fieldnames = [c for c in (reader.fieldnames or []) if isinstance(c, str)]
    if not rows:
        return []
    # Find domain column: exact 'domain', or first col containing 'domain'
    col = next((c for c in fieldnames if c.lower() == "domain"), None)
    if col is None:
        col = next((c for c in fieldnames if "domain" in c.lower()), None)
    if not col:
        print(f"ERROR: CSV has no 'domain' column. Columns: {fieldnames}", file=sys.stderr)
        sys.exit(3)
    out: list[tuple[int, dict]] = []
    for i, row in enumerate(rows):
        # Short rows yield None for missing cells, hence the `or ""`.
        d = (row.get(col) or "").strip()
        if d and "." in d:
            out.append((i, row))
    return out


def _flatten(detection: dict) -> dict:
    """Reduce a detection dict to the flat columns appended to the output CSV.

    Args:
        detection: Detection result; may be empty, and its ``tools`` /
            ``signals`` entries may be missing or ``None``.

    Returns:
        A dict with ``tech_status``, ``tech_tools`` (up to 8 tool names),
        ``tech_signals`` (up to 5 evidence strings) and ``tech_tools_count``
        (the total number of tools, not just the listed ones).
    """

    def _pluck(section: str, field: str) -> list:
        # A missing or None section counts as an empty list.
        entries = detection.get(section) or []
        return [entry.get(field, "") for entry in entries]

    tool_names = _pluck("tools", "name")
    evidence = _pluck("signals", "evidence")

    flat: dict = {}
    flat["tech_status"] = detection.get("status", "")
    flat["tech_tools"] = "; ".join(tool_names[:8])
    flat["tech_signals"] = "; ".join(evidence[:5])
    flat["tech_tools_count"] = len(tool_names)
    return flat


def _pick_domain_column(fieldnames) -> str | None:
    """Return the name of the CSV column that holds domains, or None.

    Uses the same preference as ``_read_csv_domains``: a column named exactly
    'domain' (case-insensitive) wins, otherwise the first column whose name
    contains 'domain'. Non-string keys (the DictReader overflow key ``None``)
    are ignored.
    """
    names = [c for c in fieldnames if isinstance(c, str)]
    for name in names:
        if name.lower() == "domain":
            return name
    for name in names:
        if "domain" in name.lower():
            return name
    return None


async def _main(args: argparse.Namespace) -> int:
    """Run the CLI for already-parsed arguments and return the exit status.

    Domains come from ``args.csv`` when given, otherwise from ``args.domains``.
    With both ``args.csv`` and ``args.out`` the input CSV is copied to
    ``args.out`` with the ``tech_*`` columns filled in; in every other case
    the results are printed to stdout as JSON (a bare object when there is
    exactly one result, otherwise a domain-keyed object).

    Returns:
        0 on success, 1 when there are no domains to process, 3 when the CSV
        path does not exist.
    """
    api_base = args.api or None

    # Source: CSV or positional args
    if args.csv:
        csv_path = Path(args.csv)
        if not csv_path.exists():
            print(f"ERROR: {csv_path} not found", file=sys.stderr)
            return 3
        pairs = _read_csv_domains(csv_path)
        print(f"found {len(pairs)} domains in {csv_path}", file=sys.stderr)
        # Read the domain from the same column the CSV reader selected and
        # strip it, so the keys of `results` match the stripped lookup used
        # when the rows are written back below.
        source_col = _pick_domain_column(pairs[0][1]) if pairs else None
        domains = []
        if source_col:
            domains = [(row.get(source_col) or "").strip() for _, row in pairs]
        domains = [d for d in domains if d]
    else:
        domains = args.domains

    if not domains:
        print("ERROR: no domains provided. Pass args or use --csv", file=sys.stderr)
        return 1

    print(f"enriching {len(domains)} domain(s) using {'Dealix API' if api_base else 'local detector'}...", file=sys.stderr)
    results = await run_bulk(domains, concurrency=args.concurrency, api_base=api_base)

    if args.csv and args.out:
        # Re-read CSV and write augmented copy (all rows, not only valid ones)
        with open(args.csv, newline="") as f:
            reader = csv.DictReader(f)
            original_rows = list(reader)
            fieldnames = list(original_rows[0].keys())

        domain_col = _pick_domain_column(fieldnames)

        new_fields = ["tech_status", "tech_tools", "tech_signals", "tech_tools_count"]
        for nf in new_fields:
            if nf not in fieldnames:
                fieldnames.append(nf)

        for row in original_rows:
            # Always look in the one chosen column; an empty cell must not
            # fall through to some other '*domain*' column of the same row.
            d = (row.get(domain_col) or "").strip() if domain_col else ""
            # Rows without a result (blank/invalid domain) get empty columns.
            row.update(_flatten(results.get(d, {})))

        out_path = Path(args.out)
        with open(out_path, "w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            w.writerows(original_rows)
        print(f"✓ wrote {out_path}  ({len(original_rows)} rows)", file=sys.stderr)
        return 0

    # Default: JSON to stdout
    if len(results) == 1:
        _, v = next(iter(results.items()))
        print(json.dumps(v, indent=2, ensure_ascii=False))
    else:
        print(json.dumps(results, indent=2, ensure_ascii=False))
    return 0


def main():
    """Parse the command line, run the async CLI body and exit with its status."""
    parser = argparse.ArgumentParser(
        prog="dealix-prospect",
        description="Dealix Prospector CLI — tech detection + lead scoring for Saudi B2B",
    )
    parser.add_argument("domains", nargs="*", help="one or more domains (e.g. foodics.com salla.sa)")

    # Plain string options, registered in the order they appear in --help.
    string_options = (
        ("--csv", "CSV file with a 'domain' column to enrich"),
        ("--out", "output CSV path (with --csv) — adds tech_status/tech_tools/tech_signals/tech_tools_count"),
        ("--api", "use live Dealix API instead of local (e.g. https://web-dealix.up.railway.app)"),
    )
    for flag, help_text in string_options:
        parser.add_argument(flag, help=help_text)

    parser.add_argument("--concurrency", type=int, default=5, help="parallelism (1-10)")

    parsed = parser.parse_args()
    exit_code = asyncio.run(_main(parsed))
    sys.exit(exit_code)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()