system-prompts-and-models-o.../dealix/auto_client_acquisition/pipelines/normalize.py
2026-05-01 14:03:52 +03:00

164 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Normalization helpers — Saudi-tuned.
Used by the data ingestion pipeline to clean rows before they enter the
lead graph. No external deps beyond stdlib.
"""
from __future__ import annotations
import re
import unicodedata
from typing import Any
from urllib.parse import urlparse
# Shape check for an e-mail address: a local part of letters, digits and
# ``._%+-``, an "@", a host of letters, digits, dots and hyphens, and a final
# alphabetic label of at least two letters. Anchored at both ends.
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
# One or more consecutive non-digit characters.
_NON_DIGIT = re.compile(r"\D+")
# One or more consecutive whitespace characters.
_WS_RE = re.compile(r"\s+")
# A single punctuation character: Arabic comma, semicolon and question mark,
# plus common Latin punctuation, brackets, quotes, backtick and tilde.
_PUNCT_RE = re.compile(r"[،؛؟\.,;:!?\-_/\\()\[\]{}\"'`~]")
# Arabic diacritics (tashkeel): U+064B..U+065F plus superscript alef U+0670.
# Spelled with escapes on purpose: literal combining marks are invisible in
# most editors and can be reordered by Unicode normalization, which would
# silently turn the class into a different (wider) range.
_TASHKEEL_RE = re.compile(r"[\u064B-\u065F\u0670]")

# Common business suffixes (Arabic + English), compiled once at import time
# instead of on every call. They are applied one after another, in this order.
_COMPANY_SUFFIX_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"\bشركة\b", r"\bمؤسسة\b", r"\bمكتب\b",
        r"\bllc\b", r"\binc\b", r"\bltd\b", r"\bco\.?\b",
        r"\bcompany\b", r"\bcorp\.?\b", r"\bgroup\b",
    )
)


def normalize_company_name(name: str | None) -> str:
    """Return a canonical, comparison-friendly form of a company name.

    Steps, in order:
      1. NFKC-normalize and trim the input (non-string input is passed
         through ``str()`` first).
      2. Remove Arabic diacritics (tashkeel).
      3. Remove common business suffixes such as "شركة", "LLC" or "Co.".
      4. Replace punctuation with spaces, collapse whitespace, lowercase.

    Returns an empty string when ``name`` is ``None`` or otherwise falsy.
    """
    if not name:
        return ""
    s = unicodedata.normalize("NFKC", str(name)).strip()
    s = _TASHKEEL_RE.sub("", s)
    for suffix_re in _COMPANY_SUFFIX_RES:
        s = suffix_re.sub("", s)
    # Punctuation becomes a space (not nothing) so "a.b" splits into two words.
    s = _PUNCT_RE.sub(" ", s)
    return _WS_RE.sub(" ", s).strip().lower()
def normalize_domain(raw: str | None) -> str | None:
    """Extract the bare, lowercase host name from a URL or domain-like string.

    Accepts full URLs, scheme-relative URLs ("//host/path") and bare hosts
    with an optional port and path. Credentials ("user:pw@host") and ports
    are dropped, as are a leading "www." and a trailing root dot.

    Returns ``None`` when the input is empty, cannot be parsed, or the
    resulting host contains no dot or contains whitespace.
    """
    if not raw:
        return None
    candidate = str(raw).strip().lower()
    if not candidate:
        return None

    # urlparse only fills in the network location when a "//" authority
    # marker is present, so add a scheme to scheme-less input.
    if candidate.startswith("//"):
        candidate = "https:" + candidate
    elif "://" not in candidate:
        candidate = "https://" + candidate

    try:
        # .hostname (unlike .netloc) excludes userinfo and the port.
        host = urlparse(candidate).hostname
    except ValueError:
        # Raised for malformed bracketed (IPv6-style) hosts.
        return None
    if not host:
        return None

    host = host.rstrip(".")
    if host.startswith("www."):
        host = host[4:]
    if "." not in host or any(ch.isspace() for ch in host):
        return None
    return host
def normalize_saudi_phone(raw: str | None) -> str | None:
    """Normalize a phone number to "+<digits>", preferring the Saudi +966 form.

    All formatting is discarded and any Unicode decimal digit (for example
    Arabic-Indic digits) is converted to its ASCII equivalent. Then:

      * a leading "00" international prefix is removed;
      * "966" + trunk "0" + 9 digits drops the trunk zero;
      * numbers starting with "966" (11+ digits) are returned as "+966..."
        truncated to 12 digits;
      * 10-digit numbers starting with "0" and 9-digit numbers starting
        with "5" are treated as Saudi national numbers and get "+966";
      * any other 10-15 digit number is returned as "+<digits>" unchanged.

    Returns ``None`` for empty input, input without digits, or a digit count
    that fits none of the rules above.
    """
    if not raw:
        return None

    # unicodedata.decimal yields 0-9 for decimal digits of any script and the
    # default (None) for everything else, so this both filters and converts.
    decimals = (unicodedata.decimal(ch, None) for ch in str(raw))
    digits = "".join(str(d) for d in decimals if d is not None)
    if not digits:
        return None

    if digits.startswith("00"):
        digits = digits[2:]
    # "+966 (0) 5x ..." style: country code followed by the national trunk zero.
    if digits.startswith("9660") and len(digits) == 13:
        digits = "966" + digits[4:]

    if digits.startswith("966") and len(digits) >= 11:
        return f"+{digits[:12]}"
    if digits.startswith("0") and len(digits) == 10:
        return f"+966{digits[1:]}"
    if digits.startswith("5") and len(digits) == 9:
        return f"+966{digits}"
    if 10 <= len(digits) <= 15:
        return f"+{digits}"
    return None
def normalize_email(raw: str | None) -> str | None:
    """Return the trimmed, lowercased address if it matches EMAIL_RE, else None."""
    if not raw:
        return None
    candidate = str(raw).strip().lower()
    if EMAIL_RE.match(candidate):
        return candidate
    return None
def fuzzy_company_key(name: str | None) -> str:
    """Build a compact dedupe key from the normalized company name.

    Generic filler words are removed from the output of
    normalize_company_name and the result is capped at 120 characters.
    Returns "" when the normalized name is empty.
    """
    normalized = normalize_company_name(name)
    if not normalized:
        return ""
    # Filler words that add noise when comparing company names.
    stopwords = {"the", "and", "for", "of", "al", "ال", "في", "و"}
    kept = []
    for token in normalized.split():
        if token not in stopwords:
            kept.append(token)
    key = " ".join(kept)
    return key[:120]
def normalize_row(raw: dict[str, Any]) -> dict[str, Any]:
    """
    Normalize a raw inbound row into the canonical schema.

    Accepts loose keys (Arabic + English). For each field the candidate keys
    are tried in order and the first one holding a usable value wins; None,
    empty or whitespace-only strings and float NaN are not usable.

    Returns a dict with:
        company_name, normalized_name, domain, website, phone, email,
        city, country, sector, contact_name, role, source_url,
        google_place_id, raw_keys (list)

    ``company_name`` and ``normalized_name`` default to "", ``country``
    defaults to "SA", and every other missing field is None. Note that a
    plain "name" key is read as the company name, not the contact name.
    """
    def pick(*keys: str) -> Any:
        """Return the first usable value among ``keys``, or None."""
        for k in keys:
            v = raw.get(k)
            if v is None:
                continue
            # A blank cell must not shadow a later key that has real data.
            if isinstance(v, str) and not v.strip():
                continue
            # NaN is the only float unequal to itself; treat it as missing.
            if isinstance(v, float) and v != v:
                continue
            return v
        return None

    def text(value: Any) -> str | None:
        """Stringify and trim a truthy value; anything falsy becomes None."""
        return str(value).strip() if value else None

    company = pick(
        "company", "company_name", "companyName", "name", "business_name",
        "اسم_الشركة", "اسم الشركة", "الشركة",
    )
    domain_raw = pick("domain", "website", "site", "url", "الموقع")
    domain = normalize_domain(domain_raw) if domain_raw else None
    phone = normalize_saudi_phone(pick("phone", "mobile", "tel", "whatsapp", "الهاتف", "الجوال"))
    email = normalize_email(pick("email", "Email", "البريد", "البريد_الإلكتروني"))
    city = pick("city", "City", "المدينة")
    country = pick("country", "Country", "الدولة")
    sector = pick("sector", "industry", "category", "القطاع", "النشاط")
    name = pick("contact_name", "person", "lead", "الاسم")
    role = pick("role", "title", "position", "المسمى")
    source_url = pick("source_url", "source", "linkedin_url", "rabit", "الرابط")
    place_id = pick("place_id", "google_place_id", "googlePlaceId")

    return {
        "company_name": text(company) or "",
        "normalized_name": fuzzy_company_key(company) if company else "",
        "domain": domain,
        # The website is the raw value as supplied; without one there is no
        # domain to rebuild it from either, so it is simply None.
        "website": text(domain_raw),
        "phone": phone,
        "email": email,
        "city": text(city),
        "country": text(country) or "SA",
        "sector": text(sector),
        "contact_name": text(name),
        "role": text(role),
        "source_url": text(source_url),
        "google_place_id": text(place_id),
        "raw_keys": list(raw.keys()),
    }
def is_acceptable(normalized: dict[str, Any]) -> tuple[bool, str | None]:
    """
    Acceptance gate for a normalized row.

    The row passes when it has a non-empty company_name and at least one
    non-empty value among domain, phone, email and google_place_id.

    Returns (True, None) on success, otherwise (False, reason) where reason
    is "missing_company_name" or "no_contact_or_identifier".
    """
    if not normalized.get("company_name"):
        return False, "missing_company_name"
    for field in ("domain", "phone", "email", "google_place_id"):
        if normalized.get(field):
            return True, None
    return False, "no_contact_or_identifier"