mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-17 23:09:35 +00:00
217 lines
7.3 KiB
Python
217 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
analyze_directory_duckdb.py — fast pre-import audit for large CSV/Excel/JSON
|
|
lead files using DuckDB (in-memory, no Postgres required).
|
|
|
|
Usage:
|
|
python scripts/analyze_directory_duckdb.py vendor_file.csv
|
|
python scripts/analyze_directory_duckdb.py vendor_file.xlsx --sheet "Sheet1"
|
|
python scripts/analyze_directory_duckdb.py vendor_file.json --output report.json
|
|
|
|
Outputs:
|
|
- row count
|
|
- column count
|
|
- per-column null rate
|
|
- per-column unique count
|
|
- duplicate counts (by company+city, by phone, by email)
|
|
- top 20 sectors / cities
|
|
- email domain distribution (personal vs business)
|
|
- phone normalization rate (Saudi)
|
|
- JSON report written next to the input as <file>.report.json by default (override with --output)
|
|
|
|
pandas is required and performs the loading and profiling; duckdb is optional and only determines the engine name shown in the report.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
# Optional dependency: record whether DuckDB can be imported. In this file the
# flag is only read when the report's "engine" field is filled in.
try:
    import duckdb  # noqa: F401  (imported only to detect availability)
except ImportError:
    HAS_DUCKDB = False
else:
    HAS_DUCKDB = True

# Hard dependency: pandas does the file loading and the profiling.
try:
    import pandas as pd
except ImportError:
    # Diagnostics belong on stderr so stdout stays clean when it is piped.
    print("ERROR: pandas required. Install: pip install pandas openpyxl",
          file=sys.stderr)
    sys.exit(1)
|
|
|
|
# Free-mail providers; an address at one of these domains is reported as
# "personal" rather than "business".
PERSONAL_DOMAINS = {
    "gmail.com",
    "hotmail.com",
    "icloud.com",
    "live.com",
    "outlook.com",
    "yahoo.com",
}

# Matches every run of non-digit characters (used to strip phone punctuation).
PHONE_RE = re.compile(r"\D+")

# Whole-string shape check: local part, "@", dotted domain, and a purely
# alphabetic final label of at least two letters.
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
|
|
|
|
|
|
def normalize_saudi_phone(raw):
    """Normalize a phone value to the ``+966…`` international form.

    Recognized layouts (after stripping everything that is not a digit):

    * ``00966…`` / ``966…`` with at least 11 digits -> ``+`` plus the first
      12 digits
    * 10 digits starting with ``0``                 -> ``+966`` plus the
      remaining 9 digits
    * 9 digits starting with ``5``                  -> ``+966`` plus the digits

    Args:
        raw: Cell value holding the phone number: a string, an int, a float
            (including NaN), or None.

    Returns:
        The normalized number as a string starting with ``+966``, or None
        when the value is missing, contains no digits, or matches none of the
        layouts above.
    """
    if raw is None:
        return None
    if isinstance(raw, float):
        if raw != raw:  # NaN is the only float that is not equal to itself
            return None
        if raw.is_integer():
            # Numeric columns that contain blanks are loaded as floats, so
            # 512345678 arrives as 512345678.0. Without this conversion the
            # ".0" would contribute a spurious trailing digit.
            raw = int(raw)

    text = str(raw).strip()
    # Keep decimal digits only and fold non-ASCII digits (e.g. Arabic-Indic
    # "٠١٢…") to ASCII so the prefix checks below can match them.
    digits = "".join(str(int(ch)) for ch in text if ch.isdecimal())
    if not digits:
        return None

    if digits.startswith("00966"):
        digits = digits[2:]  # drop the "00" international dialing prefix
    if digits.startswith("966") and len(digits) >= 11:
        return f"+{digits[:12]}"
    if len(digits) == 10 and digits.startswith("0"):
        return f"+966{digits[1:]}"  # national format: drop the trunk "0"
    if len(digits) == 9 and digits.startswith("5"):
        return f"+966{digits}"
    return None
|
|
|
|
|
|
def normalize_email(raw):
    """Return the trimmed, lower-cased address when it matches EMAIL_RE.

    Args:
        raw: Cell value holding the address; may be None or a float NaN.

    Returns:
        The cleaned address string, or None when the value is missing or does
        not match EMAIL_RE.
    """
    is_missing = raw is None or (isinstance(raw, float) and raw != raw)
    if is_missing:
        return None

    candidate = str(raw).strip().lower()
    if EMAIL_RE.match(candidate):
        return candidate
    return None
|
|
|
|
|
|
def email_kind(e):
    """Classify an email address as "invalid", "personal", or "business".

    Args:
        e: Address string (normally the output of ``normalize_email``), or a
            falsy value such as None or "" when there is no usable address.

    Returns:
        "invalid" when ``e`` is falsy or has no domain after an "@",
        "personal" when the domain is listed in PERSONAL_DOMAINS,
        otherwise "business".
    """
    if not e:
        return "invalid"

    _, at_sign, domain = e.partition("@")
    # Compare case-insensitively so "User@GMAIL.com" is still recognized when
    # the caller did not lower-case the address first.
    domain = domain.strip().lower()
    if not at_sign or not domain:
        # Without a domain part the value cannot be a business address.
        return "invalid"

    return "personal" if domain in PERSONAL_DOMAINS else "business"
|
|
|
|
|
|
def _load_frame(path: Path, sheet: str | None = None):
    """Read a CSV, Excel, or JSON file into a DataFrame, chosen by extension."""
    suffix = path.suffix.lower()
    if suffix in {".xlsx", ".xls"}:
        # sheet_name=0 selects the first sheet when no name was given.
        return pd.read_excel(path, sheet_name=sheet or 0)
    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix == ".json":
        return pd.read_json(path)
    raise SystemExit(f"unsupported extension: {suffix}")


def _count_unique(series) -> int:
    """Count distinct non-null values, tolerating unhashable cells."""
    try:
        return int(series.nunique(dropna=True))
    except TypeError:
        # Nested JSON can put lists/dicts in cells, which cannot be hashed.
        # Comparing their string form is an approximation, but it keeps the
        # audit running instead of aborting on one column.
        return int(series.dropna().astype(str).nunique())


def _profile_columns(df) -> dict:
    """Build the per-column non-null / null-rate / uniqueness profile."""
    total = len(df)
    profile = {}
    for col in df.columns:
        non_null = int(df[col].notna().sum())
        unique = _count_unique(df[col])
        profile[str(col)] = {
            "non_null": non_null,
            "null_rate": round((total - non_null) / total, 4) if total else 0,
            "unique": unique,
            "uniqueness_ratio": round(unique / max(1, non_null), 4),
        }
    return profile


def _find_column(lookup: dict, candidates):
    """Return the first column whose normalized header equals a candidate."""
    for cand in candidates:
        match = lookup.get(cand.strip().lower())
        if match is not None:
            return match
    return None


def _top_values(series, label: str, limit: int = 20) -> list:
    """Return the ``limit`` most frequent values as ``{label, count}`` dicts."""
    counts = series.value_counts().head(limit)
    return [{label: str(value), "count": int(count)}
            for value, count in counts.items()]


def analyze(path: Path, sheet: str | None = None) -> dict:
    """Profile a lead file and return a JSON-serializable quality report.

    Args:
        path: Input file; the extension selects the reader (.csv, .xlsx,
            .xls, or .json).
        sheet: Excel sheet name; the first sheet is used when omitted.

    Returns:
        A dict with the keys ``file``, ``rows``, ``columns``,
        ``column_profile``, ``detected_columns``, ``email_kinds``,
        ``phones_normalized_saudi``, ``phones_normalize_rate``,
        ``top_sectors``, ``top_cities``, ``duplicates`` and ``engine``.

    Raises:
        SystemExit: If the file extension is not supported.
    """
    df = _load_frame(path, sheet)
    n = len(df)
    cols = list(df.columns)

    col_profile = _profile_columns(df)

    # Identify well-known columns by header name. The lookup is built once;
    # headers are compared case-insensitively and ignoring surrounding
    # whitespace. When two headers normalize to the same key the later column
    # wins.
    lookup = {str(c).strip().lower(): c for c in cols}
    name_col = _find_column(lookup, ["company_name", "name", "company", "اسم الشركة", "الشركة"])
    city_col = _find_column(lookup, ["city", "City", "المدينة"])
    email_col = _find_column(lookup, ["email", "Email", "الإيميل", "البريد"])
    phone_col = _find_column(lookup, ["phone", "Phone", "الهاتف", "الجوال", "رقم التواصل", "mobile"])
    sector_col = _find_column(lookup, ["sector", "industry", "وظيفة الشركة", "القطاع", "النشاط"])

    # Duplicate detection (3 keys: email, phone, name+city).
    dup = {"by_email": 0, "by_phone": 0, "by_name_city": 0}

    # Normalize each email once and reuse the result for both the kind
    # histogram and the duplicate count.
    email_kinds = Counter()
    if email_col is not None:
        normalized_emails = [normalize_email(v) for v in df[email_col]]
        email_kinds.update(email_kind(e) for e in normalized_emails)
        valid_emails = [e for e in normalized_emails if e is not None]
        dup["by_email"] = len(valid_emails) - len(set(valid_emails))

    # Same single pass for phones.
    phones_norm = 0
    if phone_col is not None:
        normalized_phones = [normalize_saudi_phone(v) for v in df[phone_col]]
        valid_phones = [p for p in normalized_phones if p]
        phones_norm = len(valid_phones)
        dup["by_phone"] = len(valid_phones) - len(set(valid_phones))

    if name_col is not None and city_col is not None:
        # Rows missing either value are excluded from the name+city key.
        pairs = df[[name_col, city_col]].dropna().astype(str)
        keys = (pairs[name_col].str.strip().str.lower()
                + "|"
                + pairs[city_col].str.strip().str.lower())
        dup["by_name_city"] = int(len(keys) - keys.nunique())

    top_sectors = _top_values(df[sector_col], "sector") if sector_col is not None else []
    top_cities = _top_values(df[city_col], "city") if city_col is not None else []

    return {
        "file": str(path),
        "rows": int(n), "columns": len(cols),
        "column_profile": col_profile,
        "detected_columns": {
            "name": name_col, "city": city_col, "email": email_col,
            "phone": phone_col, "sector": sector_col,
        },
        "email_kinds": dict(email_kinds),
        "phones_normalized_saudi": phones_norm,
        "phones_normalize_rate": round(phones_norm / max(1, n), 4),
        "top_sectors": top_sectors,
        "top_cities": top_cities,
        "duplicates": dup,
        # Reflects whether duckdb is importable; the profiling in this
        # function runs on pandas either way.
        "engine": "duckdb" if HAS_DUCKDB else "pandas",
    }
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: analyze one file and write its JSON report.

    Returns:
        0 after the report has been written, or 2 when the input file does
        not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file")
    parser.add_argument("--sheet", help="sheet name for Excel")
    parser.add_argument("--output", help="output report.json (default: <file>.report.json)")
    cli = parser.parse_args()

    path = Path(cli.file)
    if not path.exists():
        print(f"file not found: {path}", file=sys.stderr)
        return 2

    report = analyze(path, sheet=cli.sheet)

    # Default output sits next to the input: "<name><ext>.report.json".
    if cli.output:
        out_path = Path(cli.output)
    else:
        out_path = path.with_suffix(path.suffix + ".report.json")
    serialized = json.dumps(report, ensure_ascii=False, indent=2)
    out_path.write_text(serialized, encoding="utf-8")

    # Brief stdout summary
    print(f"\n📂 {path.name} ({report['engine']})")
    print(f" rows: {report['rows']}, cols: {report['columns']}")
    if report["email_kinds"]:
        print(f" email_kinds: {dict(report['email_kinds'])}")
    print(f" phones_normalized_saudi: {report['phones_normalized_saudi']} "
          f"({report['phones_normalize_rate']*100:.1f}%)")
    print(f" duplicates: {report['duplicates']}")
    print(f" detected_cols: {report['detected_columns']}")
    # Only the first five entries of each top list are echoed to the console.
    if report["top_sectors"]:
        top_s = report["top_sectors"][:5]
        print(f" top_sectors: {top_s}")
    if report["top_cities"]:
        top_c = report["top_cities"][:5]
        print(f" top_cities: {top_c}")
    print(f"\n📄 wrote {out_path}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|