system-prompts-and-models-o.../dealix/scripts/analyze_directory_duckdb.py

#!/usr/bin/env python3
"""
analyze_directory_duckdb.py — fast pre-import audit for large CSV/Excel/JSON
lead files using DuckDB (in-memory, no Postgres required).

Usage:
    python scripts/analyze_directory_duckdb.py vendor_file.csv
    python scripts/analyze_directory_duckdb.py vendor_file.xlsx --sheet "Sheet1"
    python scripts/analyze_directory_duckdb.py vendor_file.json --output report.json

Outputs:
    - row count
    - column count
    - per-column null rate
    - per-column unique count
    - duplicate counts (by company+city, by phone, by email)
    - top 20 sectors / cities
    - email domain distribution (personal vs business)
    - phone normalization rate (Saudi)
    - JSON report written to <input file>.report.json next to the input by
      default (override with --output)

All loading and profiling currently runs through pandas (required); duckdb is
only probed at import time to label the report's `engine` field.
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path

# Try DuckDB first, fall back to pandas
try:
    import duckdb
    HAS_DUCKDB = True
except ImportError:
    HAS_DUCKDB = False

try:
    import pandas as pd
except ImportError:
    print("ERROR: pandas required. Install: pip install pandas openpyxl")
    sys.exit(1)

# Mailbox domains that are classified as personal rather than business.
PERSONAL_DOMAINS = {
    "gmail.com",
    "hotmail.com",
    "yahoo.com",
    "outlook.com",
    "icloud.com",
    "live.com",
}

# Matches every run of one or more non-digit characters.
PHONE_RE = re.compile(r"\D+")

# Shape check for an address: local part, "@", dotted host, and a final
# alphabetic label of at least two letters.
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")


def normalize_saudi_phone(raw):
    """Normalize a Saudi phone number to ``+966`` international form.

    Args:
        raw: Cell value as read from the input file (string, int, float,
            ``None`` or NaN).

    Returns:
        The number as ``"+966..."`` when it matches one of the recognized
        shapes, otherwise ``None``.

    Recognized shapes (after dropping every non-digit character):
        * ``00966...`` / ``966...`` -- country code already present;
        * ``0XXXXXXXXX`` -- 10 digits with the national trunk zero;
        * ``5XXXXXXXX`` -- 9-digit mobile number without the trunk zero.
    """
    if raw is None or (isinstance(raw, float) and raw != raw):
        return None
    if isinstance(raw, float) and raw.is_integer():
        # pandas reads a numeric phone column containing gaps as floats;
        # str(512345678.0) would otherwise contribute a bogus trailing "0".
        raw = int(raw)

    # Keep decimal digits only. Going through int() folds non-ASCII decimal
    # digits (e.g. Arabic-Indic) to ASCII so the prefix checks below apply.
    digits = "".join(str(int(ch)) for ch in str(raw) if ch.isdecimal())
    if not digits:
        return None

    if digits.startswith("00966"):
        digits = digits[2:]
    if digits.startswith("9660"):
        # Trunk zero kept after the country code ("+966 05..."): drop it so
        # the result is not a truncated number with a stray zero.
        digits = "966" + digits[4:]
    if digits.startswith("966") and len(digits) >= 11:
        # Behaviour kept from the original: accepted from 11 digits and cut
        # to 12. NOTE(review): confirm whether other lengths should be rejected.
        return f"+{digits[:12]}"
    if len(digits) == 10 and digits.startswith("0"):
        return f"+966{digits[1:]}"
    if len(digits) == 9 and digits.startswith("5"):
        return f"+966{digits}"
    return None


def normalize_email(raw):
    """Return *raw* trimmed and lower-cased if it looks like an email.

    ``None`` and float NaN are treated as missing. Any other value is
    stringified; the result is returned only when it matches ``EMAIL_RE``,
    otherwise ``None`` is returned.
    """
    is_nan = isinstance(raw, float) and raw != raw
    if raw is None or is_nan:
        return None
    candidate = str(raw).strip().lower()
    if EMAIL_RE.match(candidate):
        return candidate
    return None


def email_kind(e):
    """Classify an email address as "personal", "business" or "invalid".

    Args:
        e: An email string, or a falsy value (``None``/``""``) for a missing
            or rejected address.

    Returns:
        ``"invalid"`` when *e* is falsy, has no ``@``, or has nothing after
        the first ``@``; ``"personal"`` when the domain is listed in
        ``PERSONAL_DOMAINS``; ``"business"`` otherwise.
    """
    if not e:
        return "invalid"
    _, at_sign, domain = e.partition("@")
    if not at_sign or not domain:
        # Without a domain there is nothing to classify; do not report such
        # a value as a business address.
        return "invalid"
    # Compare case-insensitively so un-normalized input is classified the same.
    return "personal" if domain.lower() in PERSONAL_DOMAINS else "business"


def _load_frame(path: Path, sheet: str | None = None):
    """Read *path* with the pandas reader that matches its file extension."""
    suffix = path.suffix.lower()
    if suffix in {".xlsx", ".xls"}:
        return pd.read_excel(path, sheet_name=sheet or 0)
    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix == ".json":
        return pd.read_json(path)
    raise SystemExit(f"unsupported extension: {suffix}")


def _count_unique(series) -> int:
    """Count distinct non-null values, tolerating unhashable cells."""
    try:
        return int(series.nunique(dropna=True))
    except TypeError:
        # dict/list cells (e.g. nested JSON objects) cannot be hashed;
        # compare their text form instead of aborting the whole audit.
        return int(series.dropna().astype(str).nunique())


def _top_values(series, label: str, limit: int = 20) -> list:
    """Return the *limit* most frequent values as ``{label, "count"}`` dicts."""
    counts = series.value_counts().head(limit)
    return [{label: str(value), "count": int(count)} for value, count in counts.items()]


def analyze(path: Path, sheet: str | None = None) -> dict:
    """Profile a CSV/Excel/JSON lead file and build the quality report.

    Args:
        path: Input file; the reader is chosen from its extension
            (``.csv``, ``.xlsx``/``.xls``, ``.json``).
        sheet: Excel sheet name; the first sheet is used when omitted.

    Returns:
        A dict with row/column counts, a per-column profile (non-null count,
        null rate, unique count, uniqueness ratio), the detected
        name/city/email/phone/sector columns, email kind counts, Saudi phone
        normalization count and rate, top 20 sectors and cities, and
        duplicate counts by email, phone and name+city.

    Raises:
        SystemExit: If the file extension is not supported.
    """
    df = _load_frame(path, sheet)
    n = len(df)
    cols = list(df.columns)

    # Per-column profile
    col_profile = {}
    for c in cols:
        series = df[c]
        nonnull = int(series.notna().sum())
        unique = _count_unique(series)
        col_profile[str(c)] = {
            "non_null": nonnull,
            "null_rate": round((n - nonnull) / n, 4) if n else 0,
            "unique": unique,
            "uniqueness_ratio": round(unique / max(1, nonnull), 4),
        }

    # Case-insensitive header lookup, built once. Whitespace-trimmed variants
    # are added only where they do not collide with an exact header, so every
    # match the exact lookup would make is unchanged.
    header_lookup = {str(c).lower(): c for c in cols}
    for c in cols:
        header_lookup.setdefault(str(c).strip().lower(), c)

    def find_col(candidates):
        """Return the first column whose header matches one of *candidates*."""
        for cand in candidates:
            key = cand.lower()
            if key in header_lookup:
                return header_lookup[key]
        return None

    name_col = find_col(["company_name", "name", "company", "اسم الشركة", "الشركة"])
    city_col = find_col(["city", "City", "المدينة"])
    email_col = find_col(["email", "Email", "الإيميل", "البريد"])
    phone_col = find_col(["phone", "Phone", "الهاتف", "الجوال", "رقم التواصل", "mobile"])
    sector_col = find_col(["sector", "industry", "وظيفة الشركة", "القطاع", "النشاط"])

    # Duplicate detection (3 keys)
    dup = {"by_email": 0, "by_phone": 0, "by_name_city": 0}

    # Normalize each email/phone once; the results feed both the kind/rate
    # counters and the duplicate counts.
    email_kinds = Counter()
    if email_col:
        emails = [normalize_email(v) for v in df[email_col]]
        email_kinds = Counter(email_kind(e) for e in emails)
        valid_emails = [e for e in emails if e is not None]
        dup["by_email"] = len(valid_emails) - len(set(valid_emails))

    phones_norm = 0
    if phone_col:
        valid_phones = [p for p in (normalize_saudi_phone(v) for v in df[phone_col]) if p]
        phones_norm = len(valid_phones)
        dup["by_phone"] = len(valid_phones) - len(set(valid_phones))

    if name_col and city_col:
        # Rows missing either part are ignored; the key is trimmed and
        # lower-cased so cosmetic differences do not hide duplicates.
        nc = df[[name_col, city_col]].dropna().astype(str)
        nc["k"] = nc[name_col].str.strip().str.lower() + "|" + nc[city_col].str.strip().str.lower()
        dup["by_name_city"] = int(len(nc) - nc["k"].nunique())

    # Top sectors / cities
    top_sectors = _top_values(df[sector_col], "sector") if sector_col else []
    top_cities = _top_values(df[city_col], "city") if city_col else []

    return {
        "file": str(path),
        "rows": int(n), "columns": len(cols),
        "column_profile": col_profile,
        "detected_columns": {
            "name": name_col, "city": city_col, "email": email_col,
            "phone": phone_col, "sector": sector_col,
        },
        "email_kinds": dict(email_kinds),
        "phones_normalized_saudi": phones_norm,
        "phones_normalize_rate": round(phones_norm / max(1, n), 4),
        "top_sectors": top_sectors,
        "top_cities": top_cities,
        "duplicates": dup,
        # NOTE: every step above runs on pandas; this field only reflects
        # whether duckdb could be imported.
        "engine": "duckdb" if HAS_DUCKDB else "pandas",
    }


def main() -> int:
    """Command-line entry point.

    Parses the arguments, runs ``analyze`` on the given file, writes the
    report as UTF-8 JSON and prints a short summary to stdout.

    Returns:
        ``0`` on success, ``2`` when the input path does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file")
    parser.add_argument("--sheet", help="sheet name for Excel")
    parser.add_argument("--output", help="output report.json (default: <file>.report.json)")
    opts = parser.parse_args()

    src = Path(opts.file)
    if not src.exists():
        print(f"file not found: {src}", file=sys.stderr)
        return 2

    report = analyze(src, sheet=opts.sheet)

    if opts.output:
        dest = Path(opts.output)
    else:
        # Appended to the existing suffix: "leads.csv" -> "leads.csv.report.json".
        dest = src.with_suffix(src.suffix + ".report.json")
    payload = json.dumps(report, ensure_ascii=False, indent=2)
    dest.write_text(payload, encoding="utf-8")

    # Brief stdout summary
    print(f"\n📂 {src.name}  ({report['engine']})")
    print(f"   rows: {report['rows']}, cols: {report['columns']}")
    kinds = report["email_kinds"]
    if kinds:
        print(f"   email_kinds: {dict(kinds)}")
    print(f"   phones_normalized_saudi: {report['phones_normalized_saudi']} "
          f"({report['phones_normalize_rate']*100:.1f}%)")
    print(f"   duplicates: {report['duplicates']}")
    print(f"   detected_cols: {report['detected_columns']}")
    # Only the first five entries of each ranking are shown on the console.
    for key in ("top_sectors", "top_cities"):
        if report[key]:
            print(f"   {key}: {report[key][:5]}")
    print(f"\n📄 wrote {dest}")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())