mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-18 23:39:34 +00:00
180 lines
6.1 KiB
Python
180 lines
6.1 KiB
Python
"""Smoke tests for normalization, dedupe, and scoring pipelines."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from auto_client_acquisition.pipelines.dedupe import build_index, find_match
|
|
from auto_client_acquisition.pipelines.normalize import (
|
|
fuzzy_company_key,
|
|
is_acceptable,
|
|
normalize_company_name,
|
|
normalize_domain,
|
|
normalize_email,
|
|
normalize_row,
|
|
normalize_saudi_phone,
|
|
)
|
|
from auto_client_acquisition.pipelines.scoring import (
|
|
compute_data_quality,
|
|
compute_lead_score,
|
|
)
|
|
|
|
|
|
# ── Phone ──────────────────────────────────────────────────────
|
|
def test_phone_local_to_e164():
    """A local number with a leading zero is returned in +966 form."""
    local_number = "0501234567"
    expected = "+966501234567"
    assert normalize_saudi_phone(local_number) == expected
|
|
|
|
|
|
def test_phone_already_e164():
    """A number already written in +966 form comes back unchanged."""
    e164_number = "+966501234567"
    assert normalize_saudi_phone(e164_number) == "+966501234567"
|
|
|
|
|
|
def test_phone_double_zero_intl():
    """A number using the 00966 prefix is returned in +966 form."""
    double_zero_number = "00966501234567"
    expected = "+966501234567"
    assert normalize_saudi_phone(double_zero_number) == expected
|
|
|
|
|
|
def test_phone_short_local():
    """A nine-digit number without a leading zero is returned in +966 form."""
    short_number = "501234567"
    expected = "+966501234567"
    assert normalize_saudi_phone(short_number) == expected
|
|
|
|
|
|
def test_phone_empty():
    """Both the empty string and None normalize to None."""
    for blank in ("", None):
        assert normalize_saudi_phone(blank) is None
|
|
|
|
|
|
# ── Domain ─────────────────────────────────────────────────────
|
|
def test_domain_strips_protocol_and_www():
    """A full URL with scheme, www prefix and path reduces to the bare domain."""
    url = "https://www.foodics.com/contact"
    assert normalize_domain(url) == "foodics.com"
|
|
|
|
|
|
def test_domain_case_insensitive():
    """An upper-case domain is returned in lower case."""
    mixed_case = "FOODICS.com"
    assert normalize_domain(mixed_case) == "foodics.com"
|
|
|
|
|
|
def test_domain_invalid():
    """Empty, None and a string that is not a domain all normalize to None."""
    for bad_value in ("", None, "just-a-string"):
        assert normalize_domain(bad_value) is None
|
|
|
|
|
|
# ── Email ──────────────────────────────────────────────────────
|
|
def test_email_lowercase():
    """A mixed-case address is returned fully lower-cased."""
    mixed_case = "Sami@DEALIX.me"
    assert normalize_email(mixed_case) == "sami@dealix.me"
|
|
|
|
|
|
def test_email_invalid():
    """A string that is not an address and the empty string both give None."""
    for bad_value in ("not-an-email", ""):
        assert normalize_email(bad_value) is None
|
|
|
|
|
|
# ── Row normalization ──────────────────────────────────────────
|
|
def test_normalize_row_arabic_keys():
    """A row keyed by Arabic headers exposes domain, phone and city values."""
    arabic_row = {
        "اسم الشركة": "شركة الراجحي العقارية",
        "الموقع": "https://alrajhi-realestate.com",
        "الهاتف": "+966501234567",
        "المدينة": "الرياض",
        "القطاع": "real_estate",
    }
    normalized = normalize_row(arabic_row)

    # Checked in the same order as the individual assertions they replace.
    expected_fields = (
        ("domain", "alrajhi-realestate.com"),
        ("phone", "+966501234567"),
        ("city", "الرياض"),
    )
    for field, expected in expected_fields:
        assert normalized[field] == expected
|
|
|
|
|
|
def test_acceptable_requires_company_and_identifier():
    """A row with only a company name is rejected with the identifier reason."""
    name_only = normalize_row({"company_name": "X"})
    accepted, reason = is_acceptable(name_only)
    assert accepted is False
    # The reason may be None, so fall back to "" before the substring check.
    assert "no_contact_or_identifier" in (reason or "")
|
|
|
|
|
|
def test_acceptable_with_phone():
    """A row with a company name and a phone number is accepted."""
    raw_row = {"company_name": "Test", "phone": "0501234567"}
    accepted, _reason = is_acceptable(normalize_row(raw_row))
    assert accepted is True
|
|
|
|
|
|
# ── Dedupe ─────────────────────────────────────────────────────
|
|
def test_dedupe_by_domain():
    """A candidate sharing an indexed domain matches that account by domain."""
    existing = {
        "id": "acc_1",
        "company_name": "Foodics",
        "normalized_name": "foodics",
        "domain": "foodics.com",
        "city": "Riyadh",
    }
    index = build_index([existing])

    # Same domain as the indexed account, but a different (Arabic) name.
    candidate = {
        "company_name": "فودكس",
        "normalized_name": "فودكس",
        "domain": "foodics.com",
        "phone": None,
        "email": None,
        "google_place_id": None,
        "city": "Riyadh",
    }
    matched_id, match_kind = find_match(candidate, index)
    assert matched_id == "acc_1"
    assert match_kind == "domain"
|
|
|
|
|
|
def test_dedupe_no_match_for_different_domain():
    """A candidate with a different name and domain matches nothing."""
    existing = {
        "id": "acc_1",
        "company_name": "Foodics",
        "normalized_name": "foodics",
        "domain": "foodics.com",
        "city": "Riyadh",
    }
    index = build_index([existing])

    candidate = {
        "company_name": "Other",
        "normalized_name": "other",
        "domain": "otherco.sa",
        "phone": None,
        "email": None,
        "google_place_id": None,
        "city": "Riyadh",
    }
    matched_id, match_kind = find_match(candidate, index)
    assert matched_id is None
    assert match_kind is None
|
|
|
|
|
|
def test_dedupe_by_place_id():
    """A candidate sharing an indexed Google place id matches by place_id."""
    existing = {
        "id": "acc_x",
        "company_name": "Clinic",
        "normalized_name": "clinic",
        "google_place_id": "PID_123",
        "city": "Riyadh",
    }
    index = build_index([existing])

    candidate = {
        "company_name": "Clinic",
        "normalized_name": "clinic",
        "google_place_id": "PID_123",
        "city": "Riyadh",
    }
    matched_id, match_kind = find_match(candidate, index)
    assert (matched_id, match_kind) == ("acc_x", "place_id")
|
|
|
|
|
|
# ── Scoring ────────────────────────────────────────────────────
|
|
def test_lead_score_high_value_sector():
    """A lead with sector, location and contact data gets a usable score.

    Checks that the priority is one of P0/P1/P2, the fit is positive and a
    recommended channel is present.
    """
    lead = {
        "sector": "saas",
        "country": "SA",
        "city": "Riyadh",
        "phone": "+966501234567",
        "website": "https://foodics.com",
        "email": "info@foodics.com",
    }
    result = compute_lead_score(lead)

    allowed_priorities = ("P0", "P1", "P2")
    assert result.priority in allowed_priorities
    assert result.fit > 0
    assert result.recommended_channel is not None
|
|
|
|
|
|
def test_lead_score_opt_out_blocks_channel():
    """An opted-out lead gets no recommended channel."""
    opted_out_lead = {
        "sector": "saas",
        "country": "SA",
        "phone": "+966501234567",
        "opt_out": True,
    }
    result = compute_lead_score(opted_out_lead)
    assert result.recommended_channel is None
|
|
|
|
|
|
def test_dq_score_negative_on_opt_out():
    """An opted-out record scores zero and lists the "-opt_out" reason."""
    opted_out_record = {
        "domain": "x.com",
        "city": "Riyadh",
        "sector": "SaaS",
        "phone": "+966501234567",
        "opt_out": True,
        "allowed_use": "business_contact_research_only",
    }
    quality, reason_tags = compute_data_quality(opted_out_record)
    assert quality == 0  # opt_out subtracts 100, clamped to 0
    assert "-opt_out" in reason_tags
|
|
|
|
|
|
def test_dq_score_rewards_multi_source():
    """A record with source_count 3 lists "+multi_source" and scores above 50."""
    multi_source_record = {
        "domain": "x.com",
        "website": "https://x.com",
        "city": "Riyadh",
        "sector": "SaaS",
        "source_url": "https://example.com",
        "phone": "+966501234567",
        "source_count": 3,
        "best_source": "google_maps",
        "allowed_use": "business_contact_research_only",
    }
    quality, reason_tags = compute_data_quality(multi_source_record)
    assert "+multi_source" in reason_tags
    assert quality > 50
|