system-prompts-and-models-o.../dealix/auto_client_acquisition/v3/project_intelligence.py

"""Project Intelligence layer for Dealix v3.

Inspired by tools like SocraticCode, but implemented as a Dealix-owned core:
- index project files
- chunk code/docs
- prepare hooks for deterministic local embeddings
- answer architectural questions with source-aware context

Production storage target: Supabase/Postgres + pgvector via the migration in
supabase/migrations/202605010001_v3_project_memory.sql.
"""

from __future__ import annotations

from dataclasses import dataclass
from hashlib import sha256
from pathlib import Path
from typing import Any, Iterable

# File suffixes that mark a file as indexable text. ``should_index`` compares
# these against the lower-cased ``Path.suffix`` of each candidate file.
TEXT_EXTENSIONS = {
    # source code
    ".py",
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    ".sql",
    # documentation and markup
    ".md",
    ".txt",
    ".html",
    ".css",
    # configuration and data
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    # ``Path.suffix`` only yields the last suffix (".example"), so this entry
    # never matches a suffix test; ``should_index`` accepts such files through
    # its explicit file-name check instead.
    ".env.example",
}

# Directory names whose contents are never indexed (VCS data, virtualenvs,
# dependency trees, build output and tool caches).
IGNORE_DIRS = {
    ".git",
    ".venv",
    "venv",
    "node_modules",
    ".next",
    "dist",
    "build",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
}


@dataclass(frozen=True)
class ProjectDocument:
    """Immutable record describing one indexed project file."""

    # Path of the file as stored in the index.
    path: str
    # Coarse category label for the file (e.g. the value from ``classify_path``).
    source_type: str
    # Full text of the file.
    content: str
    # Hash string identifying ``content``.
    content_hash: str
    # Free-form extra attributes attached to the document.
    metadata: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return a lightweight summary of the document.

        The full ``content`` is deliberately left out; only its length is
        reported under ``"chars"``. ``metadata`` is returned by reference,
        not copied.
        """
        summary: dict[str, Any] = {
            field_name: getattr(self, field_name)
            for field_name in ("path", "source_type", "content_hash", "metadata")
        }
        summary["chars"] = len(self.content)
        return summary


@dataclass(frozen=True)
class ProjectChunk:
    """Immutable record describing one slice of a project document."""

    # Path of the document this chunk was cut from.
    path: str
    # Position of the chunk within its document.
    chunk_index: int
    # Text of the chunk.
    content: str
    # Rough token count for ``content``.
    token_estimate: int
    # Free-form extra attributes attached to the chunk.
    metadata: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return every field of the chunk as a plain dictionary.

        Keys follow the field declaration order. ``metadata`` is returned by
        reference, not copied.
        """
        field_names = ("path", "chunk_index", "content", "token_estimate", "metadata")
        return {field_name: getattr(self, field_name) for field_name in field_names}


def classify_path(path: str) -> str:
    """Map a project-relative path to a coarse source-type label.

    Matching is case-insensitive and the first rule that applies wins, so the
    order of the rule table below is significant (for example, a Markdown
    file under ``tests/`` is labelled ``"documentation"``, not ``"tests"``).
    Paths that match no rule are labelled ``"code"``.
    """
    lowered = path.lower()
    # (rule matched?, label) pairs in priority order.
    rules = (
        (lowered.startswith("api/"), "api"),
        (lowered.startswith("auto_client_acquisition/"), "revenue_engine"),
        (lowered.startswith("db/") or "migration" in lowered, "database"),
        (lowered.startswith("landing/") or lowered.endswith(".html"), "frontend_landing"),
        (lowered.startswith("docs/") or lowered.endswith(".md"), "documentation"),
        (lowered.startswith("tests/"), "tests"),
    )
    return next((label for matched, label in rules if matched), "code")


def should_index(path: Path) -> bool:
    """Decide whether ``path`` is a text file that belongs in the index.

    A path is rejected when any of its components is listed in
    ``IGNORE_DIRS``, when it is a directory, or when it is named exactly
    ``.env``. Otherwise it is accepted if its lower-cased suffix is in
    ``TEXT_EXTENSIONS`` or its name ends with ``.env.example``.
    """
    # Cheap, purely lexical check first; ``is_dir`` touches the filesystem.
    in_ignored_dir = not IGNORE_DIRS.isdisjoint(path.parts)
    if in_ignored_dir or path.is_dir() or path.name == ".env":
        return False

    has_text_suffix = path.suffix.lower() in TEXT_EXTENSIONS
    # ``Path.suffix`` reports only ".example" for these files, so they need
    # an explicit name test.
    return has_text_suffix or path.name.endswith(".env.example")


def scan_project(root: str | Path) -> list[ProjectDocument]:
    """Collect every indexable, non-empty text file below ``root``.

    Args:
        root: Directory to walk recursively.

    Returns:
        One ``ProjectDocument`` per accepted file, ordered by path so that
        repeated scans of the same tree yield the same sequence. Each
        document's ``path`` is relative to ``root`` and uses forward slashes.

    Files are skipped (never raised on) when ``should_index`` rejects them,
    when they cannot be read or decoded as UTF-8, or when they contain only
    whitespace.
    """
    root_path = Path(root)
    docs: list[ProjectDocument] = []
    # ``rglob`` yields entries in filesystem order; sorting keeps the index
    # deterministic across machines and runs.
    for path in sorted(root_path.rglob("*")):
        if not should_index(path):
            continue
        try:
            content = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Binary content, permission problems, dangling symlinks or files
            # removed mid-scan must not abort the whole scan.
            continue
        if not content.strip():
            continue
        # ``as_posix`` normalises separators without rewriting backslashes
        # that are a legitimate part of a POSIX file name.
        rel = path.relative_to(root_path).as_posix()
        docs.append(
            ProjectDocument(
                path=rel,
                source_type=classify_path(rel),
                content=content,
                content_hash=sha256(content.encode("utf-8")).hexdigest(),
                metadata={"extension": path.suffix.lower(), "chars": len(content)},
            )
        )
    return docs


def chunk_text(document: ProjectDocument, *, max_chars: int = 1800, overlap: int = 180) -> list[ProjectChunk]:
    """Split a document into consecutive, slightly overlapping chunks.

    Args:
        document: Document whose ``content`` is split.
        max_chars: Upper bound on the raw length of each chunk window.
        overlap: Number of trailing characters of one window that are
            repeated at the start of the next. Negative values are treated
            as 0 and values above ``max_chars // 2`` are reduced to that
            limit so that every step makes forward progress.

    Returns:
        Chunks in document order with contiguous ``chunk_index`` values
        starting at 0. Windows that contain only whitespace produce no chunk.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    content = document.content
    total = len(content)
    # A line-boundary cut always keeps more than 55% of ``max_chars``, so an
    # overlap of at most half of ``max_chars`` can never stall the loop.
    effective_overlap = min(max(overlap, 0), max_chars // 2)

    chunks: list[ProjectChunk] = []
    start = 0
    while start < total:
        end = min(total, start + max_chars)
        # Prefer to cut on a line boundary when that keeps most of the window.
        if end < total:
            newline = content.rfind("\n", start, end)
            if newline != -1 and newline - start > max_chars * 0.55:
                end = newline
        window = content[start:end]
        text = window.strip()
        if text:
            chunks.append(
                ProjectChunk(
                    path=document.path,
                    chunk_index=len(chunks),
                    content=text,
                    # Rough heuristic: about four characters per token.
                    token_estimate=max(1, len(window) // 4),
                    metadata={"source_type": document.source_type, "content_hash": document.content_hash},
                )
            )
        if end >= total:
            # Stop here; stepping back by the overlap would only re-emit the
            # tail of the document.
            break
        start = end - effective_overlap
    return chunks


def build_index_summary(documents: Iterable[ProjectDocument]) -> dict[str, Any]:
    """Summarise a set of documents: count, total size and per-type counts.

    ``documents`` is consumed once, so a generator is acceptable. The
    ``by_type`` mapping keeps source types in order of first appearance.
    """
    materialized = list(documents)

    per_type: dict[str, int] = {}
    for item in materialized:
        per_type.setdefault(item.source_type, 0)
        per_type[item.source_type] += 1

    return {
        "documents": len(materialized),
        "total_chars": sum(len(item.content) for item in materialized),
        "by_type": per_type,
        "recommended_next_step": "Generate embeddings and upsert into Supabase project_chunks.",
    }


def naive_search(documents: Iterable[ProjectDocument], query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Rank documents by how often the query's words occur in them.

    The query is split on whitespace; words of one or two characters are
    ignored. A document's score is the total number of case-insensitive
    occurrences of the remaining words in its path and content. Documents
    with a score of zero are dropped, the rest are returned best-first
    (ties keep input order) as ``to_dict()`` summaries with an added
    ``"score"`` key, truncated to ``limit`` entries.
    """
    keywords = [word.lower() for word in query.split() if len(word) > 2]

    matches: list[tuple[int, ProjectDocument]] = []
    for candidate in documents:
        haystack = f"{candidate.path}\n{candidate.content}".lower()
        hits = 0
        for keyword in keywords:
            hits += haystack.count(keyword)
        if hits:
            matches.append((hits, candidate))

    # ``sorted`` is stable, so equally scored documents stay in input order.
    ranked = sorted(matches, key=lambda pair: pair[0], reverse=True)

    results: list[dict[str, Any]] = []
    for hits, candidate in ranked[:limit]:
        entry: dict[str, Any] = {"score": hits}
        entry.update(candidate.to_dict())
        results.append(entry)
    return results


def explain_project_intelligence_stack() -> dict[str, Any]:
    """Return a static description of the project-intelligence stack.

    A fresh dictionary (with fresh lists) is built on every call, so callers
    may mutate the result freely.
    """
    model_options = ["gte-small local/edge", "OpenAI text-embedding-3-small", "bge-small"]
    modes = ["keyword", "semantic", "hybrid", "relationship-aware"]
    use_cases = [
        "Ask what is missing before launch",
        "Find files related to a feature",
        "Generate implementation plans grounded in code",
        "Power Sami Personal Operator memory",
        "Let agents understand project relationships before editing",
    ]

    description: dict[str, Any] = {}
    description["purpose"] = "Make Dealix understand its own codebase, docs, strategy, and relationships."
    description["storage"] = "Supabase Postgres + pgvector"
    description["embedding_dimensions"] = 384
    description["embedding_model_options"] = model_options
    description["search_modes"] = modes
    description["best_use"] = use_cases
    return description


def should_block_embedding(text: str) -> tuple[bool, str]:
    """Tell whether ``text`` must be kept out of the embedding store.

    Returns ``(True, "secret_pattern_detected")`` when the personal-operator
    ``looks_like_secret`` helper flags the text, otherwise ``(False, "")``.
    Keys and tokens must never be embedded.
    """
    # Imported at call time rather than at module load.
    # NOTE(review): presumably to avoid an import cycle - confirm.
    from auto_client_acquisition.personal_operator.memory import looks_like_secret

    flagged = looks_like_secret(text)
    return (True, "secret_pattern_detected") if flagged else (False, "")


def answer_operator_question(
    question: str,
    *,
    root: str | Path = ".",
    deep_scan: bool = False,
) -> dict[str, Any]:
    """Return a canned, topic-matched answer for the Personal Operator.

    The question is matched against a fixed list of Arabic and English
    keywords; the first topic that matches selects the Arabic answer text,
    and a general answer is used when nothing matches. English keywords are
    matched case-insensitively as substrings, except ``pr``/``prs`` which
    must appear as whole words so that "project" or "improve" do not count
    as a pull-request question.

    Args:
        question: Free-text question, Arabic or English.
        root: Project directory scanned when ``deep_scan`` is true.
        deep_scan: When true, scan ``root`` and attach up to five keyword
            search hits for the question.

    Returns:
        A dictionary with the original ``question``, the selected
        ``answer_ar``, a ``semantic_status_ar`` note, a fixed list of
        ``related_files`` and ``search_hits`` (empty unless ``deep_scan``).
    """
    import re  # function-scope import: this module does not import ``re`` at top level

    q = question.strip().lower()
    # ASCII words of the question, used for whole-word keyword tests.
    words = set(re.findall(r"[a-z0-9]+", q))
    mentions_pr = "pr" in words or "prs" in words

    note_ar = (
        "البحث الدلالي غير متصل بعد؛ نستخدم مخطط المشروع المعروف والوحدات الأساسية. "
        "semantic search not connected yet; using project blueprint and known modules."
    )
    # General answer, used when no topic keyword matches.
    answer_ar = (
        "ركّز على: Personal Operator API، ذاكرة المشروع (Supabase/pgvector)، اختبارات، "
        "واتساب موافقات، Gmail/Calendar كمسودات فقط، ثم pilot لـ 10 عملاء."
    )
    related_files: list[str] = [
        "api/main.py",
        "api/routers/personal_operator.py",
        "api/routers/v3.py",
        "auto_client_acquisition/personal_operator/operator.py",
        "auto_client_acquisition/v3/project_intelligence.py",
    ]
    # Topic order matters: the first matching branch wins.
    if "ناقص" in question or "missing" in q:
        answer_ar = (
            "قبل التدشين: ربط embeddings بـ Supabase، سياسات RLS، تدفق واتساب بأزرار، "
            "Gmail draft + Calendar draft بموافقة، تثبيت الاختبارات، ومراقبة (Sentry/OTel)."
        )
    elif "خطوة" in question or "next" in q:
        answer_ar = "الخطوة التالية العملية: شغّل فهرسة المشروع، راجع تقرير الجاهزية، ثم ربط pilot مع قائمة 10 مؤسسين."
    elif "ملف" in question or "files" in q or mentions_pr:
        answer_ar = "أهم الملفات: `api/routers/personal_operator.py`, `auto_client_acquisition/personal_operator/`, `supabase/migrations/`."
    elif "supabase" in q:
        answer_ar = "أفضل مسار: Postgres + pgvector + Edge Function للـ embeddings، ومفتاح الخدمة فقط في السيرفر وليس في الواجهة."
    elif "whatsapp" in q or "واتساب" in question or "buttons" in q:
        answer_ar = "استخدم رسالتين كحد أقصى 3 أزرار لكل رسالة: قبول/تخطي/رسالة ثم اعتماد/تعديل/إلغاء. لا إرسال بارد."
    elif "personal operator" in q or "مشغل" in question or "operator" in q:
        answer_ar = "Personal Operator: daily brief + فرص + قرارات + مسودات برسالة عربية وموافقة صريحة قبل أي إرسال خارجي."

    search_hits: list[dict[str, Any]] = []
    if deep_scan:
        # Optional keyword search over the project tree.
        docs = scan_project(root)
        search_hits = naive_search(docs, question, limit=5)

    return {
        "question": question,
        "answer_ar": answer_ar,
        "semantic_status_ar": note_ar,
        "related_files": related_files,
        "search_hits": search_hits,
    }