system-prompts-and-models-o.../dealix/scripts/audit_lead_file.py
2026-05-01 14:03:52 +03:00

180 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""
audit_lead_file.py — Local audit of a CSV/JSON lead file BEFORE you ingest it.
Reports:
- row count
- per-field fill rate
- normalized phone success rate (Saudi)
- email validity rate
- domain extractability
- estimated dedup risk inside the file
- flags for risky fields (personal-only contacts, no source URL)
Use this to evaluate purchased datasets BEFORE you pay or upload.
"""
from __future__ import annotations
import argparse
import csv
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any
# Make local imports work without installing the package
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from auto_client_acquisition.pipelines.normalize import ( # noqa: E402
fuzzy_company_key,
is_acceptable,
normalize_email,
normalize_domain,
normalize_row,
normalize_saudi_phone,
)
def parse_file(path: Path) -> list[dict]:
    """Load lead rows from a JSON, JSONL, CSV or TSV file.

    The format is chosen from the file suffix (case-insensitive):

    - ``.jsonl``: one JSON value per non-blank line.
    - ``.json``: either a top-level list, or an object whose ``"rows"`` key
      holds the list.
    - ``.csv`` / ``.tsv``: read with ``csv.DictReader`` using ``,`` or a tab
      as the delimiter, so the first line is taken as the header.

    Every format is decoded as ``utf-8-sig`` so that an optional leading
    byte-order mark is dropped instead of breaking the parser.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed rows, in file order. Elements of JSON/JSONL input are
        returned exactly as parsed; they are not checked to be objects.

    Raises:
        SystemExit: If the suffix is not supported, or a ``.json`` file is
            neither a list nor an object with a list under ``"rows"``.
    """
    suffix = path.suffix.lower()

    if suffix == ".jsonl":
        # Iterate the file instead of using str.splitlines(): splitlines()
        # also breaks on U+2028/U+2029/NEL, which JSON permits unescaped
        # inside string values, so a valid record could be cut in half.
        with path.open(encoding="utf-8-sig") as f:
            return [json.loads(line) for line in f if line.strip()]

    if suffix == ".json":
        data = json.loads(path.read_text(encoding="utf-8-sig"))
        if isinstance(data, dict):
            # Accept the wrapped form {"rows": [...]}; a missing key yields
            # None and falls through to the error below.
            data = data.get("rows")
        if isinstance(data, list):
            return data
        raise SystemExit("JSON must be list or {rows: [...]}")

    if suffix in {".csv", ".tsv"}:
        delim = "\t" if suffix == ".tsv" else ","
        # newline="" lets the csv module handle line endings itself.
        with path.open(encoding="utf-8-sig", newline="") as f:
            return [dict(r) for r in csv.DictReader(f, delimiter=delim)]

    raise SystemExit(f"unsupported: {path.suffix}")
def pct(n: int, d: int) -> str:
    """Format ``n / d`` as a percentage string with one decimal place.

    A zero denominator yields ``"0.0%"`` rather than dividing.
    """
    if not d:
        return "0.0%"
    percentage = n / d * 100
    return f"{percentage:.1f}%"
def main() -> int:
ap = argparse.ArgumentParser()
ap.add_argument("file")
ap.add_argument("--show-rejects", type=int, default=5,
help="Show first N rejected rows (default 5)")
args = ap.parse_args()
p = Path(args.file)
if not p.exists():
print(f"file not found: {p}", file=sys.stderr)
return 2
rows = parse_file(p)
n = len(rows)
if not n:
print("empty file", file=sys.stderr)
return 2
field_present: Counter[str] = Counter()
for r in rows:
for k, v in r.items():
if v not in (None, ""):
field_present[k] += 1
accepted = 0
rejected: list[tuple[dict, str]] = []
phones_normalized = 0
emails_valid = 0
domains_extractable = 0
# Match server-side dedupe: collide on ANY of domain/phone/email/place_id/name+city
by_domain: Counter[str] = Counter()
by_phone: Counter[str] = Counter()
by_email: Counter[str] = Counter()
by_place: Counter[str] = Counter()
by_name_city: Counter[str] = Counter()
for r in rows:
nr = normalize_row(r)
if nr.get("phone"):
phones_normalized += 1
if nr.get("email"):
emails_valid += 1
if nr.get("domain"):
domains_extractable += 1
ok, why = is_acceptable(nr)
if ok:
accepted += 1
d_val = nr.get("domain")
if d_val:
by_domain[d_val] += 1
p_val = nr.get("phone")
if p_val:
by_phone[p_val] += 1
e_val = nr.get("email")
if e_val:
by_email[e_val] += 1
pid_val = nr.get("google_place_id")
if pid_val:
by_place[pid_val] += 1
nk_val = nr.get("normalized_name")
if nk_val:
city = (nr.get("city") or "").strip().lower()
by_name_city[f"{nk_val}|{city}"] += 1
else:
rejected.append((r, why or ""))
def _dup_count(c: Counter[str]) -> int:
return sum(v - 1 for v in c.values() if v > 1)
dup_by_kind = {
"domain": _dup_count(by_domain),
"phone": _dup_count(by_phone),
"email": _dup_count(by_email),
"place_id": _dup_count(by_place),
"name+city": _dup_count(by_name_city),
}
dup_keys = sum(dup_by_kind.values())
unique_keys = (
len(by_domain) + len(by_phone) + len(by_email)
+ len(by_place) + len(by_name_city)
)
print(f"\n📂 {p.name}")
print(f" rows: {n}")
print(f" acceptable (has company + 1+ identifier): {accepted} ({pct(accepted, n)})")
print(f" rejected: {len(rejected)} ({pct(len(rejected), n)})")
print(f" phones normalized to +966: {phones_normalized} ({pct(phones_normalized, n)})")
print(f" valid emails: {emails_valid} ({pct(emails_valid, n)})")
print(f" extractable domains: {domains_extractable} ({pct(domains_extractable, n)})")
print(f" dedup risk: {dup_keys} duplicate-key collisions across {unique_keys} unique keys")
for kind, count in dup_by_kind.items():
if count:
print(f" · {kind}: {count} collision(s)")
print("\n📊 field fill rate:")
for k, c in field_present.most_common(20):
print(f" {k:30s} {pct(c, n)} ({c})")
if rejected and args.show_rejects > 0:
print(f"\n❌ first {min(args.show_rejects, len(rejected))} rejected rows:")
for r, why in rejected[: args.show_rejects]:
print(f" - reason={why} row={json.dumps(r, ensure_ascii=False)[:160]}")
print("\n💡 recommendation:")
if accepted / n < 0.5:
print(" ⚠️ acceptance rate < 50% — file is low quality. Re-request from vendor.")
elif accepted / n < 0.8:
print(" ⚠️ acceptance rate 50-80% — usable but expect rejection on import.")
else:
print(" ✅ acceptance rate ≥ 80% — file is healthy to import.")
if dup_keys / max(unique_keys, 1) > 0.2:
print(" ⚠️ high in-file duplicate ratio — run dedupe before any outreach.")
return 0
# Script entry point: run the audit and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())