#!/usr/bin/env python3
"""
audit_lead_file.py — Local audit of a CSV/JSON lead file BEFORE you ingest it.

Reports:
  - row count
  - per-field fill rate
  - normalized phone success rate (Saudi)
  - email validity rate
  - domain extractability
  - estimated dedup risk inside the file
  - flags for risky fields (personal-only contacts, no source URL)

Use this to evaluate purchased datasets BEFORE you pay or upload.
"""
from __future__ import annotations

import argparse
import csv
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any

# Make local imports work without installing the package
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from auto_client_acquisition.pipelines.normalize import (  # noqa: E402
    fuzzy_company_key,
    is_acceptable,
    normalize_email,
    normalize_domain,
    normalize_row,
    normalize_saudi_phone,
)


def _require_dict_rows(rows: Any) -> list[dict]:
    """Return *rows* unchanged after checking it is a list of JSON objects.

    Raises:
        SystemExit: if *rows* is not a list, or any element is not a dict.
            Failing here gives a readable message instead of an
            ``AttributeError`` later when the audit calls ``row.items()``.
    """
    if not isinstance(rows, list):
        raise SystemExit("JSON must be list or {rows: [...]}")
    for idx, row in enumerate(rows, start=1):
        if not isinstance(row, dict):
            raise SystemExit(
                f"record {idx} is not a JSON object (got {type(row).__name__})"
            )
    return rows


def parse_file(path: Path) -> list[dict]:
    """Load a lead file into a list of row dicts.

    Supported suffixes (case-insensitive):
      - ``.json``  : a top-level list, or an object with a ``rows`` list
      - ``.jsonl`` : one JSON object per non-blank line
      - ``.csv`` / ``.tsv`` : header row + records, comma / tab delimited

    Raises:
        SystemExit: for an unsupported suffix or a JSON payload that is not
            a list of objects.
    """
    suffix = path.suffix.lower()

    if suffix in {".json", ".jsonl"}:
        # utf-8-sig (same codec as the CSV branch) strips an optional BOM;
        # json.loads rejects text that still starts with one.
        text = path.read_text(encoding="utf-8-sig")
        if suffix == ".jsonl":
            return _require_dict_rows(
                [json.loads(line) for line in text.splitlines() if line.strip()]
            )
        data = json.loads(text)
        if isinstance(data, dict) and "rows" in data:
            data = data["rows"]
        return _require_dict_rows(data)

    if suffix in {".csv", ".tsv"}:
        delim = "\t" if suffix == ".tsv" else ","
        with path.open(encoding="utf-8-sig", newline="") as f:
            return [dict(r) for r in csv.DictReader(f, delimiter=delim)]

    raise SystemExit(f"unsupported: {path.suffix}")


def pct(n: int, d: int) -> str:
    """Format ``n / d`` as a percentage with one decimal; "0.0%" when d is 0."""
    return f"{(n / d * 100):.1f}%" if d else "0.0%"


def _dup_count(c: Counter[str]) -> int:
    """Count surplus occurrences: for every key seen k > 1 times, add k - 1."""
    return sum(v - 1 for v in c.values() if v > 1)


def main() -> int:
    """Audit the lead file named on the command line and print a report.

    Returns:
        Process exit code: 0 after a report was printed, 2 when the file is
        missing or contains no rows.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("file")
    ap.add_argument(
        "--show-rejects",
        type=int,
        default=5,
        help="Show first N rejected rows (default 5)",
    )
    args = ap.parse_args()

    p = Path(args.file)
    if not p.exists():
        print(f"file not found: {p}", file=sys.stderr)
        return 2

    rows = parse_file(p)
    n = len(rows)
    if not n:
        print("empty file", file=sys.stderr)
        return 2

    # Per-field fill count on the RAW rows (before normalization).
    field_present: Counter[Any] = Counter()
    for r in rows:
        for k, v in r.items():
            if v not in (None, ""):
                field_present[k] += 1

    accepted = 0
    rejected: list[tuple[dict, str]] = []
    phones_normalized = 0
    emails_valid = 0
    domains_extractable = 0

    # Match server-side dedupe: collide on ANY of domain/phone/email/place_id/name+city
    by_domain: Counter[str] = Counter()
    by_phone: Counter[str] = Counter()
    by_email: Counter[str] = Counter()
    by_place: Counter[str] = Counter()
    by_name_city: Counter[str] = Counter()

    for r in rows:
        nr = normalize_row(r)
        # These three rates are counted over ALL rows, accepted or not.
        if nr.get("phone"):
            phones_normalized += 1
        if nr.get("email"):
            emails_valid += 1
        if nr.get("domain"):
            domains_extractable += 1

        ok, why = is_acceptable(nr)
        if not ok:
            rejected.append((r, why or ""))
            continue

        # Only accepted rows contribute to the in-file duplicate estimate.
        accepted += 1
        d_val = nr.get("domain")
        if d_val:
            by_domain[d_val] += 1
        p_val = nr.get("phone")
        if p_val:
            by_phone[p_val] += 1
        e_val = nr.get("email")
        if e_val:
            by_email[e_val] += 1
        pid_val = nr.get("google_place_id")
        if pid_val:
            by_place[pid_val] += 1
        nk_val = nr.get("normalized_name")
        if nk_val:
            city = (nr.get("city") or "").strip().lower()
            by_name_city[f"{nk_val}|{city}"] += 1

    dup_by_kind = {
        "domain": _dup_count(by_domain),
        "phone": _dup_count(by_phone),
        "email": _dup_count(by_email),
        "place_id": _dup_count(by_place),
        "name+city": _dup_count(by_name_city),
    }
    dup_keys = sum(dup_by_kind.values())
    unique_keys = (
        len(by_domain)
        + len(by_phone)
        + len(by_email)
        + len(by_place)
        + len(by_name_city)
    )

    print(f"\n📂 {p.name}")
    print(f" rows: {n}")
    print(f" acceptable (has company + 1+ identifier): {accepted} ({pct(accepted, n)})")
    print(f" rejected: {len(rejected)} ({pct(len(rejected), n)})")
    print(f" phones normalized to +966: {phones_normalized} ({pct(phones_normalized, n)})")
    print(f" valid emails: {emails_valid} ({pct(emails_valid, n)})")
    print(f" extractable domains: {domains_extractable} ({pct(domains_extractable, n)})")
    print(f" dedup risk: {dup_keys} duplicate-key collisions across {unique_keys} unique keys")
    for kind, count in dup_by_kind.items():
        if count:
            print(f" · {kind}: {count} collision(s)")

    print("\n📊 field fill rate:")
    for k, c in field_present.most_common(20):
        # csv.DictReader files surplus cells under the key None; str() keeps
        # the "30s" format spec from raising TypeError on that key.
        print(f" {str(k):30s} {pct(c, n)} ({c})")

    if rejected and args.show_rejects > 0:
        print(f"\n❌ first {min(args.show_rejects, len(rejected))} rejected rows:")
        for r, why in rejected[: args.show_rejects]:
            print(f" - reason={why} row={json.dumps(r, ensure_ascii=False)[:160]}")

    print("\n💡 recommendation:")
    accept_rate = accepted / n  # n > 0 is guaranteed by the empty-file check
    if accept_rate < 0.5:
        print(" ⚠️ acceptance rate < 50% — file is low quality. Re-request from vendor.")
    elif accept_rate < 0.8:
        print(" ⚠️ acceptance rate 50-80% — usable but expect rejection on import.")
    else:
        print(" ✅ acceptance rate ≥ 80% — file is healthy to import.")

    # max(..., 1) avoids dividing by zero when no accepted row had any key.
    if dup_keys / max(unique_keys, 1) > 0.2:
        print(" ⚠️ high in-file duplicate ratio — run dedupe before any outreach.")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())