mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-18 15:29:36 +00:00
180 lines
5.9 KiB
Python
180 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
audit_lead_file.py — Local audit of a CSV/JSON lead file BEFORE you ingest it.
|
|
|
|
Reports:
|
|
- row count
|
|
- per-field fill rate
|
|
- normalized phone success rate (Saudi)
|
|
- email validity rate
|
|
- domain extractability
|
|
- estimated dedup risk inside the file
|
|
- flags for risky fields (personal-only contacts, no source URL)
|
|
|
|
Use this to evaluate purchased datasets BEFORE you pay or upload.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Make local imports work without installing the package
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
from auto_client_acquisition.pipelines.normalize import ( # noqa: E402
|
|
fuzzy_company_key,
|
|
is_acceptable,
|
|
normalize_email,
|
|
normalize_domain,
|
|
normalize_row,
|
|
normalize_saudi_phone,
|
|
)
|
|
|
|
|
|
def parse_file(path: Path) -> list[dict]:
    """Load lead rows from a JSON, JSONL, CSV or TSV file.

    The format is chosen from the file suffix (case-insensitive):

    - ``.json``  -- either a top-level list of objects, or an object with a
      ``"rows"`` key holding that list.
    - ``.jsonl`` -- one JSON object per non-blank line.
    - ``.csv`` / ``.tsv`` -- parsed with ``csv.DictReader`` (comma / tab).

    All formats are decoded as ``utf-8-sig`` so a leading byte-order mark is
    tolerated; files without a BOM decode exactly as plain UTF-8.

    Args:
        path: Location of the lead file.

    Returns:
        One dict per row.

    Raises:
        SystemExit: If the suffix is unsupported, the JSON document is not a
            list (directly or under ``"rows"``), or a JSON/JSONL row is not
            an object.
        json.JSONDecodeError: If the JSON/JSONL content is malformed.
    """
    suffix = path.suffix.lower()

    if suffix in {".json", ".jsonl"}:
        # utf-8-sig: json.loads rejects text that still starts with a BOM.
        text = path.read_text(encoding="utf-8-sig")
        if suffix == ".jsonl":
            rows = [json.loads(line) for line in text.splitlines() if line.strip()]
        else:
            data = json.loads(text)
            if isinstance(data, dict) and "rows" in data:
                data = data["rows"]
            if not isinstance(data, list):
                raise SystemExit("JSON must be list or {rows: [...]}")
            rows = data
        # Fail here with a clear message instead of letting a later
        # ``row.items()`` call blow up on a scalar or list row.
        for position, row in enumerate(rows, start=1):
            if not isinstance(row, dict):
                raise SystemExit(
                    f"row {position} is not a JSON object: {type(row).__name__}"
                )
        return rows

    if suffix in {".csv", ".tsv"}:
        delim = "\t" if suffix == ".tsv" else ","
        with path.open(encoding="utf-8-sig", newline="") as f:
            return [dict(r) for r in csv.DictReader(f, delimiter=delim)]

    raise SystemExit(f"unsupported: {path.suffix}")
|
|
|
|
|
|
def pct(n: int, d: int) -> str:
    """Format ``n / d`` as a percentage string with one decimal place.

    A zero (falsy) denominator yields ``"0.0%"`` rather than raising.
    """
    if not d:
        return "0.0%"
    share = n / d * 100
    return f"{share:.1f}%"
|
|
|
|
|
|
def main() -> int:
    """Audit a lead file given on the command line and print a report.

    The report covers: row count, how many rows ``is_acceptable`` accepts
    after ``normalize_row``, how many normalized rows carry a phone / email /
    domain, in-file duplicate-key collisions among accepted rows, per-field
    fill rate (top 20 fields), the first rejected rows, and a short
    recommendation.

    Returns:
        ``0`` after printing the report, ``2`` if the file does not exist or
        contains no rows.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("file")
    ap.add_argument("--show-rejects", type=int, default=5,
                    help="Show first N rejected rows (default 5)")
    args = ap.parse_args()

    p = Path(args.file)
    if not p.exists():
        print(f"file not found: {p}", file=sys.stderr)
        return 2

    rows = parse_file(p)
    n = len(rows)
    if not n:
        print("empty file", file=sys.stderr)
        return 2

    # Fill rate: count rows where a field holds something other than None/"".
    field_present: Counter[str] = Counter()
    for r in rows:
        for k, v in r.items():
            if v not in (None, ""):
                field_present[k] += 1

    accepted = 0
    rejected: list[tuple[dict, str]] = []
    phones_normalized = 0
    emails_valid = 0
    domains_extractable = 0
    # Match server-side dedupe: collide on ANY of domain/phone/email/place_id/name+city
    by_domain: Counter[str] = Counter()
    by_phone: Counter[str] = Counter()
    by_email: Counter[str] = Counter()
    by_place: Counter[str] = Counter()
    by_name_city: Counter[str] = Counter()

    for r in rows:
        nr = normalize_row(r)
        # Identifier rates are measured over ALL rows, accepted or not.
        if nr.get("phone"):
            phones_normalized += 1
        if nr.get("email"):
            emails_valid += 1
        if nr.get("domain"):
            domains_extractable += 1

        ok, why = is_acceptable(nr)
        if not ok:
            rejected.append((r, why or ""))
            continue

        accepted += 1
        # Dedup keys are only collected for accepted rows.
        d_val = nr.get("domain")
        if d_val:
            by_domain[d_val] += 1
        p_val = nr.get("phone")
        if p_val:
            by_phone[p_val] += 1
        e_val = nr.get("email")
        if e_val:
            by_email[e_val] += 1
        pid_val = nr.get("google_place_id")
        if pid_val:
            by_place[pid_val] += 1
        nk_val = nr.get("normalized_name")
        if nk_val:
            city = (nr.get("city") or "").strip().lower()
            by_name_city[f"{nk_val}|{city}"] += 1

    def _dup_count(c: Counter[str]) -> int:
        # Every occurrence of a key beyond the first counts as one collision.
        return sum(v - 1 for v in c.values() if v > 1)

    dup_by_kind = {
        "domain": _dup_count(by_domain),
        "phone": _dup_count(by_phone),
        "email": _dup_count(by_email),
        "place_id": _dup_count(by_place),
        "name+city": _dup_count(by_name_city),
    }
    dup_keys = sum(dup_by_kind.values())
    unique_keys = (
        len(by_domain) + len(by_phone) + len(by_email)
        + len(by_place) + len(by_name_city)
    )

    print(f"\n📂 {p.name}")
    print(f" rows: {n}")
    print(f" acceptable (has company + 1+ identifier): {accepted} ({pct(accepted, n)})")
    print(f" rejected: {len(rejected)} ({pct(len(rejected), n)})")
    print(f" phones normalized to +966: {phones_normalized} ({pct(phones_normalized, n)})")
    print(f" valid emails: {emails_valid} ({pct(emails_valid, n)})")
    print(f" extractable domains: {domains_extractable} ({pct(domains_extractable, n)})")
    print(f" dedup risk: {dup_keys} duplicate-key collisions across {unique_keys} unique keys")
    for kind, count in dup_by_kind.items():
        if count:
            print(f" · {kind}: {count} collision(s)")

    print("\n📊 field fill rate:")
    for k, c in field_present.most_common(20):
        # csv.DictReader files surplus cells of an over-long row under the
        # key None; formatting None with ":30s" raises TypeError, so coerce
        # the key to text first.
        print(f" {str(k):30s} {pct(c, n)} ({c})")

    if rejected and args.show_rejects > 0:
        print(f"\n❌ first {min(args.show_rejects, len(rejected))} rejected rows:")
        for r, why in rejected[: args.show_rejects]:
            # Truncate the serialized row so one huge record cannot flood the report.
            print(f" - reason={why} row={json.dumps(r, ensure_ascii=False)[:160]}")

    print("\n💡 recommendation:")
    accept_ratio = accepted / n  # n > 0 is guaranteed by the empty-file check above
    if accept_ratio < 0.5:
        print(" ⚠️ acceptance rate < 50% — file is low quality. Re-request from vendor.")
    elif accept_ratio < 0.8:
        print(" ⚠️ acceptance rate 50-80% — usable but expect rejection on import.")
    else:
        print(" ✅ acceptance rate ≥ 80% — file is healthy to import.")

    # max(..., 1) guards the ratio when no dedup key was collected at all.
    if dup_keys / max(unique_keys, 1) > 0.2:
        print(" ⚠️ high in-file duplicate ratio — run dedupe before any outreach.")
    return 0
|
|
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|