# system-prompts-and-models-o.../dealix/auto_client_acquisition/v3/project_intelligence.py
# 2026-05-01 14:03:52 +03:00

# 255 lines
# 9.2 KiB
# Python

"""Project Intelligence layer for Dealix v3.
Inspired by tools like SocraticCode, but implemented as a Dealix-owned core:
- index project files
- chunk code/docs
- prepare deterministic local embeddings hooks
- answer architectural questions with source-aware context
Production storage target: Supabase/Postgres + pgvector via the migration in
supabase/migrations/202605010001_v3_project_memory.sql.
"""
from __future__ import annotations
from dataclasses import dataclass
from hashlib import sha256
from pathlib import Path
from typing import Any, Iterable
# Lower-cased file suffixes that should_index() accepts as indexable text.
# NOTE: ``Path.suffix`` for a file named ".env.example" is ".example", so the
# ".env.example" entry never matches a suffix lookup; should_index() covers
# that case with a separate file-name check.
TEXT_EXTENSIONS = {
    # source code
    ".py",
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    # docs and plain text
    ".md",
    ".txt",
    # data / schema / config
    ".sql",
    ".json",
    ".yaml",
    ".yml",
    # web assets
    ".html",
    ".css",
    # tooling config
    ".toml",
    ".ini",
    ".env.example",
}

# Directory names that exclude a path from indexing when they appear as any
# component of that path.
IGNORE_DIRS = {
    # version control and virtual environments
    ".git",
    ".venv",
    "venv",
    # dependency and build output
    "node_modules",
    ".next",
    "dist",
    "build",
    # tool caches
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
}
@dataclass(frozen=True)
class ProjectDocument:
    """Immutable record for one indexed project file."""

    # Location of the file (scan_project stores it relative to the scan root).
    path: str
    # Coarse category label such as "api" or "documentation".
    source_type: str
    # Full decoded text of the file.
    content: str
    # Digest identifying the content (scan_project uses a SHA-256 hex digest).
    content_hash: str
    # Free-form extra attributes attached by the producer.
    metadata: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return a lightweight summary of the document.

        The text itself is left out; only its length is reported under
        ``"chars"``.
        """
        summary: dict[str, Any] = dict(
            path=self.path,
            source_type=self.source_type,
            content_hash=self.content_hash,
            metadata=self.metadata,
        )
        summary["chars"] = len(self.content)
        return summary
@dataclass(frozen=True)
class ProjectChunk:
    """Immutable record for one slice of a document's text."""

    # Path of the document this chunk was cut from.
    path: str
    # Position of the chunk within its document.
    chunk_index: int
    # Text of the chunk.
    content: str
    # Approximate token count for the chunk.
    token_estimate: int
    # Free-form extra attributes attached by the producer.
    metadata: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return every field of the chunk as a plain dictionary."""
        exported = ("path", "chunk_index", "content", "token_estimate", "metadata")
        return {name: getattr(self, name) for name in exported}
def classify_path(path: str) -> str:
    """Map a forward-slash project path to a coarse source-type label.

    Matching is case-insensitive and the first rule that applies wins, so
    the order of the rules below is significant (for example, a Markdown
    file under ``tests/`` is labelled "documentation").  Paths that match
    no rule are labelled "code".
    """
    lowered = path.lower()
    ordered_rules = (
        ("api", lowered.startswith("api/")),
        ("revenue_engine", lowered.startswith("auto_client_acquisition/")),
        ("database", lowered.startswith("db/") or "migration" in lowered),
        ("frontend_landing", lowered.startswith("landing/") or lowered.endswith(".html")),
        ("documentation", lowered.startswith("docs/") or lowered.endswith(".md")),
        ("tests", lowered.startswith("tests/")),
    )
    return next((label for label, matched in ordered_rules if matched), "code")
def should_index(path: Path) -> bool:
    """Decide whether ``path`` is a text file that belongs in the index.

    A path is rejected when any of its components is listed in
    ``IGNORE_DIRS``, when it is a directory, or when the file is named
    exactly ``.env``.  Otherwise it is accepted if its lower-cased suffix is
    in ``TEXT_EXTENSIONS`` or its name ends with ``.env.example``.
    """
    # Checks are ordered so the filesystem is only consulted (is_dir) after
    # the purely lexical ignore-list test has passed.
    if not IGNORE_DIRS.isdisjoint(path.parts) or path.is_dir() or path.name == ".env":
        return False
    return path.suffix.lower() in TEXT_EXTENSIONS or path.name.endswith(".env.example")
def scan_project(root: str | Path) -> list[ProjectDocument]:
    """Walk ``root`` recursively and load every indexable text file.

    Args:
        root: Directory to scan.

    Returns:
        One ``ProjectDocument`` per file that passes ``should_index``, can be
        read as UTF-8, and is not blank.  Each document's ``path`` is relative
        to ``root`` with forward slashes, ``source_type`` comes from
        ``classify_path``, and ``content_hash`` is the SHA-256 hex digest of
        the UTF-8 encoded content.

    Scanning is best-effort: a file that cannot be decoded or read is skipped
    rather than aborting the whole scan.
    """
    root_path = Path(root)
    docs: list[ProjectDocument] = []
    for path in root_path.rglob("*"):
        if not should_index(path):
            continue
        try:
            content = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # UnicodeDecodeError: binary or non-UTF-8 file with a text suffix.
            # OSError: dangling symlink, permission problem, or a file removed
            # while the scan is running.  Neither should kill the scan.
            continue
        if not content.strip():
            continue
        # as_posix() yields forward slashes on every platform, which is the
        # form classify_path() matches its prefixes against.
        rel = path.relative_to(root_path).as_posix()
        docs.append(
            ProjectDocument(
                path=rel,
                source_type=classify_path(rel),
                content=content,
                content_hash=sha256(content.encode("utf-8")).hexdigest(),
                metadata={"extension": path.suffix.lower(), "chars": len(content)},
            )
        )
    return docs
def chunk_text(document: ProjectDocument, *, max_chars: int = 1800, overlap: int = 180) -> list[ProjectChunk]:
    """Split a document's content into sequential, overlapping chunks.

    Each chunk covers at most ``max_chars`` characters.  When a chunk is not
    the last one, the cut is moved back to the last newline inside the window
    provided that newline lies beyond 55% of ``max_chars``, so chunks tend to
    end on line boundaries.  The following chunk starts ``overlap`` characters
    before the previous cut so neighbouring chunks share context.

    Args:
        document: Source document; its ``content``, ``path``, ``source_type``
            and ``content_hash`` are read.
        max_chars: Maximum window size in characters; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            be non-negative.  If the overlap would prevent forward progress
            (it is at least as long as the chunk just emitted), that step
            falls back to starting exactly at the previous cut.

    Returns:
        Chunks in document order with contiguous ``chunk_index`` values
        starting at 0.  ``content`` is whitespace-stripped; windows that are
        entirely whitespace are dropped.  ``token_estimate`` is one quarter of
        the unstripped window length, with a minimum of 1.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is
            negative.
    """
    if max_chars <= 0:
        # A non-positive window can never advance through the text.
        raise ValueError("max_chars must be a positive integer")
    if overlap < 0:
        # A negative overlap would jump forward and silently skip content.
        raise ValueError("overlap must be non-negative")

    content = document.content
    total = len(content)
    chunks: list[ProjectChunk] = []
    start = 0
    while start < total:
        end = min(total, start + max_chars)
        # Prefer to cut on a line boundary when possible.
        if end < total:
            newline = content.rfind("\n", start, end)
            if newline - start > max_chars * 0.55:
                end = newline
        window = content[start:end]
        text = window.strip()
        if text:
            chunks.append(
                ProjectChunk(
                    path=document.path,
                    chunk_index=len(chunks),
                    content=text,
                    token_estimate=max(1, len(window) // 4),
                    metadata={"source_type": document.source_type, "content_hash": document.content_hash},
                )
            )
        if end >= total:
            # The tail has been emitted; stepping back by ``overlap`` here
            # would re-emit it forever.
            break
        # Step back by ``overlap`` for shared context, but always move forward.
        resume = end - overlap
        start = resume if resume > start else end
    return chunks
def build_index_summary(documents: Iterable[ProjectDocument]) -> dict[str, Any]:
    """Aggregate basic statistics over a collection of documents.

    The iterable is consumed once.  The result reports the number of
    documents, the combined length of their contents, a per-``source_type``
    document count, and a fixed hint describing the next pipeline step.
    """
    materialized = list(documents)
    type_counts: dict[str, int] = {}
    for entry in materialized:
        type_counts.setdefault(entry.source_type, 0)
        type_counts[entry.source_type] += 1
    return {
        "documents": len(materialized),
        "total_chars": sum(len(entry.content) for entry in materialized),
        "by_type": type_counts,
        "recommended_next_step": "Generate embeddings and upsert into Supabase project_chunks.",
    }
def naive_search(documents: Iterable[ProjectDocument], query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Rank documents by how often the query's words occur in them.

    The query is split on whitespace and words of two characters or fewer
    are discarded.  A document's score is the total number of
    case-insensitive occurrences of the remaining words in its path and
    content.  Documents scoring zero are omitted.  The result holds the
    ``limit`` best documents, highest score first, each as its ``to_dict()``
    summary plus a ``"score"`` key.
    """
    needles = [word.lower() for word in query.split() if len(word) > 2]

    ranked: list[tuple[int, ProjectDocument]] = []
    for candidate in documents:
        haystack = f"{candidate.path}\n{candidate.content}".lower()
        hits = 0
        for needle in needles:
            hits += haystack.count(needle)
        if hits:
            ranked.append((hits, candidate))

    # sorted() is stable, so equally scored documents keep their input order.
    best = sorted(ranked, key=lambda pair: pair[0], reverse=True)[:limit]

    results: list[dict[str, Any]] = []
    for hits, candidate in best:
        row: dict[str, Any] = {"score": hits}
        row.update(candidate.to_dict())
        results.append(row)
    return results
def explain_project_intelligence_stack() -> dict[str, Any]:
    """Return a static description of the project-intelligence stack.

    The dictionary is rebuilt on every call, so callers may mutate the
    result freely.
    """
    embedding_models = ["gte-small local/edge", "OpenAI text-embedding-3-small", "bge-small"]
    modes = ["keyword", "semantic", "hybrid", "relationship-aware"]
    use_cases = [
        "Ask what is missing before launch",
        "Find files related to a feature",
        "Generate implementation plans grounded in code",
        "Power Sami Personal Operator memory",
        "Let agents understand project relationships before editing",
    ]
    description: dict[str, Any] = {}
    description["purpose"] = "Make Dealix understand its own codebase, docs, strategy, and relationships."
    description["storage"] = "Supabase Postgres + pgvector"
    description["embedding_dimensions"] = 384
    description["embedding_model_options"] = embedding_models
    description["search_modes"] = modes
    description["best_use"] = use_cases
    return description
def should_block_embedding(text: str) -> tuple[bool, str]:
    """Report whether ``text`` must be kept out of the embedding pipeline.

    Returns ``(True, "secret_pattern_detected")`` when the Personal Operator
    ``looks_like_secret`` helper flags the text, otherwise ``(False, "")``.
    """
    # Imported at call time, as in the rest of this function's history, so
    # loading this module does not pull in the personal_operator package.
    from auto_client_acquisition.personal_operator.memory import looks_like_secret

    return (True, "secret_pattern_detected") if looks_like_secret(text) else (False, "")
def answer_operator_question(
    question: str,
    *,
    root: str | Path = ".",
    deep_scan: bool = False,
) -> dict[str, Any]:
    """Return a canned, grounded answer for a Personal Operator question.

    The question is matched against a fixed, ordered list of Arabic and
    English keywords; the first matching topic selects the answer and a
    generic focus list is used when nothing matches.  Semantic search is not
    wired in, which the returned status note states.

    Args:
        question: The operator's question, in Arabic or English.
        root: Project directory scanned when ``deep_scan`` is true.
        deep_scan: When true, scan ``root`` and attach up to five keyword
            search hits for the question.

    Returns:
        A dict with the original ``question``, the selected ``answer_ar``,
        the ``semantic_status_ar`` note, a fixed ``related_files`` list, and
        ``search_hits`` (empty unless ``deep_scan`` is true).
    """
    import re

    q = question.strip().lower()
    # "pr" must match as a whole word.  As a substring it also matches
    # "project", "production", "improve", ... and would hijack questions that
    # belong to the supabase / whatsapp / operator topics further down.
    words = set(re.findall(r"[a-z0-9]+", q))
    mentions_pr = "pr" in words or "prs" in words

    note_ar = (
        "البحث الدلالي غير متصل بعد؛ نستخدم مخطط المشروع المعروف والوحدات الأساسية. "
        "semantic search not connected yet; using project blueprint and known modules."
    )
    answer_ar = (
        "ركّز على: Personal Operator API، ذاكرة المشروع (Supabase/pgvector)، اختبارات، "
        "واتساب موافقات، Gmail/Calendar كمسودات فقط، ثم pilot لـ 10 عملاء."
    )
    related_files: list[str] = [
        "api/main.py",
        "api/routers/personal_operator.py",
        "api/routers/v3.py",
        "auto_client_acquisition/personal_operator/operator.py",
        "auto_client_acquisition/v3/project_intelligence.py",
    ]
    # Topic order is significant: the first matching branch wins.
    if "ناقص" in question or "missing" in q:
        answer_ar = (
            "قبل التدشين: ربط embeddings بـ Supabase، سياسات RLS، تدفق واتساب بأزرار، "
            "Gmail draft + Calendar draft بموافقة، تثبيت الاختبارات، ومراقبة (Sentry/OTel)."
        )
    elif "خطوة" in question or "next" in q:
        answer_ar = "الخطوة التالية العملية: شغّل فهرسة المشروع، راجع تقرير الجاهزية، ثم ربط pilot مع قائمة 10 مؤسسين."
    elif "ملف" in question or "files" in q or mentions_pr:
        answer_ar = "أهم الملفات: `api/routers/personal_operator.py`, `auto_client_acquisition/personal_operator/`, `supabase/migrations/`."
    elif "supabase" in q:
        answer_ar = "أفضل مسار: Postgres + pgvector + Edge Function للـ embeddings، ومفتاح الخدمة فقط في السيرفر وليس في الواجهة."
    elif "whatsapp" in q or "واتساب" in question or "buttons" in q:
        answer_ar = "استخدم رسالتين كحد أقصى 3 أزرار لكل رسالة: قبول/تخطي/رسالة ثم اعتماد/تعديل/إلغاء. لا إرسال بارد."
    elif "personal operator" in q or "مشغل" in question or "operator" in q:
        answer_ar = "Personal Operator: daily brief + فرص + قرارات + مسودات برسالة عربية وموافقة صريحة قبل أي إرسال خارجي."

    search_hits: list[dict[str, Any]] = []
    if deep_scan:
        # Keyword search over the freshly scanned tree; only run on request
        # because it reads every indexable file under ``root``.
        docs = scan_project(root)
        search_hits = naive_search(docs, question, limit=5)
    return {
        "question": question,
        "answer_ar": answer_ar,
        "semantic_status_ar": note_ar,
        "related_files": related_files,
        "search_hits": search_hits,
    }