# Mirror of https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
# Synced 2026-06-18 07:19:35 +00:00 (164 lines, 5.8 KiB, Python).
"""
Normalization helpers — Saudi-tuned.

Used by the data ingestion pipeline to clean rows before they enter the
lead graph. No external deps beyond stdlib.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import unicodedata
|
||
from typing import Any
|
||
from urllib.parse import urlparse
|
||
|
||
# Syntactic e-mail check: local part, "@", dotted host, alphabetic TLD of 2+ chars.
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
# A run of one or more non-digit characters.
_NON_DIGIT = re.compile(r"\D+")
# A run of one or more whitespace characters.
_WS_RE = re.compile(r"\s+")
# A single Arabic or Latin punctuation character.
_PUNCT_RE = re.compile(r"[،؛؟\.,;:!?\-_/\\()\[\]{}\"'`~]")
|
||
|
||
|
||
def normalize_company_name(name: str | None) -> str:
    """Lowercase, strip diacritics + Arabic punctuation, collapse whitespace.

    Steps, in order: NFKC-normalize and trim; delete Arabic diacritics;
    delete common Arabic/English business words (matched as whole words
    anywhere in the name, case-insensitively); replace punctuation with
    spaces; collapse whitespace runs; lowercase.

    Args:
        name: Raw company name. Non-string values are converted with ``str``.

    Returns:
        The normalized name, or ``""`` when ``name`` is falsy.
    """
    if not name:
        return ""
    s = unicodedata.normalize("NFKC", str(name)).strip()
    # Strip Arabic diacritics (tashkeel) — U+064B..U+065F + U+0670.
    # Written with escapes: literal combining marks inside a character class
    # can be reordered by Unicode normalization of the source text, which
    # would silently change the range being matched.
    s = re.sub(r"[\u064B-\u065F\u0670]", "", s)
    # Drop common business suffixes (Arabic + English)
    suffix_patterns = (
        r"\bشركة\b", r"\bمؤسسة\b", r"\bمكتب\b",
        r"\bllc\b", r"\binc\b", r"\bltd\b", r"\bco\.?\b",
        r"\bcompany\b", r"\bcorp\.?\b", r"\bgroup\b",
    )
    for pat in suffix_patterns:
        s = re.sub(pat, "", s, flags=re.IGNORECASE)
    # Punctuation becomes a space (not nothing) so "al-rajhi" splits in two.
    s = _PUNCT_RE.sub(" ", s)
    s = _WS_RE.sub(" ", s).strip().lower()
    return s
|
||
|
||
|
||
def normalize_domain(raw: str | None) -> str | None:
    """Extract bare domain from a URL or domain-like string.

    The input is trimmed and lowercased, then reduced to its host part:
    scheme, path, ``user:password@`` credentials, port, a leading ``www.``
    and a trailing root dot are all removed.

    Args:
        raw: A URL (``https://www.example.com/x``), a bare or
            protocol-relative host (``example.com``, ``//example.com``), or
            an address-like string (``info@example.com``). Non-string values
            are converted with ``str``.

    Returns:
        The bare host, or ``None`` when the input is empty, cannot be parsed,
        or the resulting host contains no dot.
    """
    if not raw:
        return None
    s = str(raw).strip().lower()
    if not s:
        return None
    if "://" not in s:
        # Without a scheme urlparse would treat the host as a path. Leading
        # slashes are dropped so protocol-relative "//host" works too.
        s = "https://" + s.lstrip("/")
    try:
        parsed = urlparse(s)
    except ValueError:
        # urlparse raises ValueError for malformed bracketed hosts.
        return None
    host = parsed.netloc or parsed.path
    host = host.split("/")[0]
    host = host.rpartition("@")[2]  # drop "user:password@" credentials
    host = host.split(":")[0]  # drop the port
    if host.startswith("www."):
        host = host[4:]
    host = host.rstrip(".")  # "example.com." and "example.com" are the same host
    if not host or "." not in host:
        return None
    return host
|
||
|
||
|
||
def normalize_saudi_phone(raw: str | None) -> str | None:
    """Return +966XXXXXXXXX where possible, a generic +digits form, or None.

    Only decimal digits are kept; Arabic-Indic and other Unicode digits are
    folded to ASCII first. Rules, in order:

    * ``00966…`` is treated as ``966…``.
    * ``966…`` with at least 11 digits: a trunk ``0`` directly after the
      country code is dropped when enough digits follow, and at most nine
      national digits are kept (anything after them is discarded).
    * ten digits starting with ``0``: the ``0`` is replaced by ``+966``.
    * nine digits starting with ``5``: ``+966`` is prepended.
    * any other 10–15 digit number is returned as ``+<digits>`` unchanged.

    Args:
        raw: Phone number in any common notation. Non-string values are
            converted with ``str``.

    Returns:
        The normalized number, or ``None`` when ``raw`` is falsy or contains
        no usable digit sequence.
    """
    if not raw:
        return None
    # isdecimal() accepts the same characters regex \d does; int() maps each
    # of them to its ASCII value so the prefix checks below can match.
    digits = "".join(str(int(ch)) for ch in str(raw) if ch.isdecimal())
    if not digits:
        return None
    if digits.startswith("00966"):
        digits = digits[2:]
    if digits.startswith("966") and len(digits) >= 11:
        national = digits[3:]
        # "+966 (0) 50 …": the trunk zero is not part of the national number.
        if national.startswith("0") and len(national) >= 10:
            national = national[1:]
        return f"+966{national[:9]}"
    if digits.startswith("0") and len(digits) == 10:
        return f"+966{digits[1:]}"
    if digits.startswith("5") and len(digits) == 9:
        return f"+966{digits}"
    if 10 <= len(digits) <= 15:
        return f"+{digits}"
    return None
|
||
|
||
|
||
def normalize_email(raw: str | None) -> str | None:
    """Return the trimmed, lowercased address, or None if it is not valid.

    Args:
        raw: Candidate e-mail address. Non-string values are converted with
            ``str``.

    Returns:
        The cleaned address when it matches ``EMAIL_RE``; ``None`` when
        ``raw`` is falsy or the cleaned value does not match.
    """
    if not raw:
        return None
    s = str(raw).strip().lower()
    return s if EMAIL_RE.match(s) else None
|
||
|
||
|
||
def fuzzy_company_key(name: str | None) -> str:
    """A short, dedupe-friendly key derived from normalize_company_name.

    Args:
        name: Raw company name.

    Returns:
        The normalized name with generic filler words removed, cut to at most
        120 characters; ``""`` when the normalized name is empty.
    """
    n = normalize_company_name(name)
    if not n:
        return ""
    # Drop generic words that hurt dedupe
    drop = {"the", "and", "for", "of", "al", "ال", "في", "و"}
    parts = [p for p in n.split() if p not in drop]
    return " ".join(parts)[:120]
|
||
|
||
|
||
def normalize_row(raw: dict[str, Any]) -> dict[str, Any]:
    """
    Normalize a raw inbound row into the canonical schema.

    Accepts loose keys (Arabic + English). For every output field the
    candidate keys are tried in order and the first usable value wins;
    ``None``, empty or whitespace-only strings, and float NaN are treated
    as missing.

    Returns a dict with:
        company_name, normalized_name, domain, website, phone, email,
        city, country, sector, contact_name, role, source_url,
        google_place_id, raw_keys (list)

    ``company_name`` and ``normalized_name`` fall back to ``""``, ``country``
    falls back to ``"SA"``, and every other missing or invalid field is
    ``None``.
    """

    def is_missing(value: Any) -> bool:
        # A blank cell must not shadow a later candidate key.
        if value is None:
            return True
        if isinstance(value, str):
            return not value.strip()
        # NaN is the only float that is not equal to itself; without this
        # check it would be emitted as the string "nan".
        return isinstance(value, float) and value != value

    def pick(*keys: str) -> Any:
        for k in keys:
            v = raw.get(k)
            if not is_missing(v):
                return v
        return None

    def text(value: Any) -> str | None:
        return str(value).strip() if value else None

    company = pick(
        "company", "company_name", "companyName", "name", "business_name",
        "اسم_الشركة", "اسم الشركة", "الشركة",
    )
    domain_raw = pick("domain", "website", "site", "url", "الموقع")
    phone = normalize_saudi_phone(pick("phone", "mobile", "tel", "whatsapp", "الهاتف", "الجوال"))
    email = normalize_email(pick("email", "Email", "البريد", "البريد_الإلكتروني"))
    city = pick("city", "City", "المدينة")
    country = pick("country", "Country", "الدولة")
    sector = pick("sector", "industry", "category", "القطاع", "النشاط")
    name = pick("contact_name", "person", "lead", "الاسم")
    role = pick("role", "title", "position", "المسمى")
    source_url = pick("source_url", "source", "linkedin_url", "rabit", "الرابط")
    place_id = pick("place_id", "google_place_id", "googlePlaceId")

    return {
        "company_name": text(company) or "",
        "normalized_name": fuzzy_company_key(company) if company else "",
        "domain": normalize_domain(domain_raw) if domain_raw else None,
        # The raw value as supplied; a domain can only exist when it does,
        # so there is nothing to synthesize a website from otherwise.
        "website": text(domain_raw),
        "phone": phone,
        "email": email,
        "city": text(city),
        "country": text(country) or "SA",
        "sector": text(sector),
        "contact_name": text(name),
        "role": text(role),
        "source_url": text(source_url),
        "google_place_id": text(place_id),
        "raw_keys": list(raw.keys()),
    }
|
||
|
||
|
||
def is_acceptable(normalized: dict[str, Any]) -> tuple[bool, str | None]:
    """
    Acceptance gate. A row is acceptable if it has at minimum:
      - company_name
      - at least one of: domain, phone, email, google_place_id

    Args:
        normalized: A row in the canonical schema; absent keys count as
            missing values.

    Returns:
        ``(True, None)`` when acceptable, otherwise ``(False, reason)`` where
        reason is ``"missing_company_name"`` or ``"no_contact_or_identifier"``.
    """
    if not normalized.get("company_name"):
        return False, "missing_company_name"
    identifiers = ("domain", "phone", "email", "google_place_id")
    if not any(normalized.get(k) for k in identifiers):
        return False, "no_contact_or_identifier"
    return True, None
|