mirror of
https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools.git
synced 2026-06-18 07:19:35 +00:00
255 lines
9.2 KiB
Python
255 lines
9.2 KiB
Python
"""Project Intelligence layer for Dealix v3.
|
|
|
|
Inspired by tools like SocraticCode, but implemented as a Dealix-owned core:

- index project files
- chunk code/docs
- prepare deterministic local embedding hooks
- answer architectural questions with source-aware context

Production storage target: Supabase/Postgres + pgvector via the migration in
supabase/migrations/202605010001_v3_project_memory.sql.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from hashlib import sha256
|
|
from pathlib import Path
|
|
from typing import Any, Iterable
|
|
|
|
# Lower-case file suffixes that are eligible for indexing as text.
TEXT_EXTENSIONS = {
    ".py",
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    ".md",
    ".txt",
    ".sql",
    ".json",
    ".yaml",
    ".yml",
    ".html",
    ".css",
    ".toml",
    ".ini",
    ".env.example",
}
|
|
|
|
# Directory names whose contents are never indexed (VCS data, virtualenvs,
# dependency trees, build output, tool caches).
IGNORE_DIRS = {
    ".git",
    ".venv",
    "venv",
    "node_modules",
    ".next",
    "dist",
    "build",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
}
|
|
|
|
|
|
@dataclass(frozen=True)
class ProjectDocument:
    """Immutable record describing one indexed project file."""

    path: str  # location of the file, as recorded by the indexer
    source_type: str  # category label for the file
    content: str  # full text of the file
    content_hash: str  # digest identifying this exact content
    metadata: dict[str, Any]  # free-form extra attributes

    def to_dict(self) -> dict[str, Any]:
        """Return a lightweight summary of the document.

        The raw ``content`` is deliberately left out; only its length is
        reported under the ``"chars"`` key.
        """
        return dict(
            path=self.path,
            source_type=self.source_type,
            content_hash=self.content_hash,
            metadata=self.metadata,
            chars=len(self.content),
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class ProjectChunk:
    """Immutable record for one slice of a document's text."""

    path: str  # path of the document the chunk was cut from
    chunk_index: int  # position of the chunk within its document
    content: str  # text of the chunk
    token_estimate: int  # approximate token count supplied by the creator
    metadata: dict[str, Any]  # free-form extra attributes

    def to_dict(self) -> dict[str, Any]:
        """Return every field of the chunk as a plain dictionary."""
        return dict(
            path=self.path,
            chunk_index=self.chunk_index,
            content=self.content,
            token_estimate=self.token_estimate,
            metadata=self.metadata,
        )
|
|
|
|
|
|
def classify_path(path: str) -> str:
    """Map a project-relative path to a coarse source-type label.

    Matching is case-insensitive and the rules are tried in order, so the
    first rule that matches wins; anything unmatched is labelled ``"code"``.
    """
    lowered = path.lower()
    # Order matters: e.g. a path containing "migration" is "database" even
    # when it lives under tests/ or docs/.
    ordered_rules = (
        ("api", lowered.startswith("api/")),
        ("revenue_engine", lowered.startswith("auto_client_acquisition/")),
        ("database", lowered.startswith("db/") or "migration" in lowered),
        ("frontend_landing", lowered.startswith("landing/") or lowered.endswith(".html")),
        ("documentation", lowered.startswith("docs/") or lowered.endswith(".md")),
        ("tests", lowered.startswith("tests/")),
    )
    return next((label for label, matched in ordered_rules if matched), "code")
|
|
|
|
|
|
def should_index(path: Path) -> bool:
    """Decide whether ``path`` is a text file that belongs in the index.

    A path is rejected when any of its components is listed in
    ``IGNORE_DIRS``, when it is a directory, or when it is named exactly
    ``.env``.  Otherwise it is accepted if its lower-cased suffix is in
    ``TEXT_EXTENSIONS`` or its name ends with ``.env.example``.
    """
    under_ignored_dir = not IGNORE_DIRS.isdisjoint(path.parts)
    if under_ignored_dir or path.is_dir() or path.name == ".env":
        return False
    # Path.suffix only yields the last dotted part, so ".env.example" files
    # need the explicit name check.
    return path.suffix.lower() in TEXT_EXTENSIONS or path.name.endswith(".env.example")
|
|
|
|
|
|
def scan_project(root: str | Path) -> list[ProjectDocument]:
    """Walk ``root`` recursively and load every indexable text file.

    Args:
        root: Directory to scan.

    Returns:
        One ``ProjectDocument`` per file accepted by ``should_index`` that
        could be read as UTF-8 and is not blank.  Paths are stored relative
        to ``root`` with forward slashes.

    Files that cannot be decoded as UTF-8 or cannot be read at all
    (permission denied, broken symlink, file removed mid-scan) are skipped,
    so a single bad file does not abort the whole scan.
    """
    root_path = Path(root)
    docs: list[ProjectDocument] = []
    for path in root_path.rglob("*"):
        if not should_index(path):
            continue
        # Normalise Windows separators so classification and storage keys
        # look the same on every platform.
        rel = str(path.relative_to(root_path)).replace("\\", "/")
        try:
            content = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Best-effort scan: binary or unreadable files are not indexed.
            continue
        if not content.strip():
            continue
        docs.append(
            ProjectDocument(
                path=rel,
                source_type=classify_path(rel),
                content=content,
                content_hash=sha256(content.encode("utf-8")).hexdigest(),
                metadata={"extension": path.suffix.lower(), "chars": len(content)},
            )
        )
    return docs
|
|
|
|
|
|
def chunk_text(document: ProjectDocument, *, max_chars: int = 1800, overlap: int = 180) -> list[ProjectChunk]:
    """Split a document's content into consecutive, overlapping chunks.

    Args:
        document: The document whose ``content`` is split.
        max_chars: Maximum size of a chunk window in characters; must be > 0.
        overlap: Number of trailing characters of one chunk that are repeated
            at the start of the next.  Values are clamped to
            ``0 <= overlap < max_chars``.

    Returns:
        Chunks in document order.  Each chunk carries the stripped window
        text, a rough token estimate (window length // 4, at least 1), and
        the document's ``source_type`` and ``content_hash`` as metadata.

    Raises:
        ValueError: If ``max_chars`` is not positive (the cursor could never
            advance).
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    # Keep the overlap strictly smaller than the window so each step advances.
    overlap = max(0, min(overlap, max_chars - 1))

    content = document.content
    total = len(content)
    chunks: list[ProjectChunk] = []
    start = 0
    index = 0
    while start < total:
        end = min(total, start + max_chars)
        window = content[start:end]
        # Prefer to cut on a line boundary, but only when that keeps the
        # chunk reasonably full (more than 55% of max_chars).
        if end < total:
            newline = window.rfind("\n")
            if newline > max_chars * 0.55:
                end = start + newline
                window = content[start:end]
        chunks.append(
            ProjectChunk(
                path=document.path,
                chunk_index=index,
                content=window.strip(),
                token_estimate=max(1, len(window) // 4),
                metadata={"source_type": document.source_type, "content_hash": document.content_hash},
            )
        )
        index += 1
        if end >= total:
            # The final window reached the end; stepping back by the overlap
            # would only emit a redundant tail chunk.
            break
        # Step back by the overlap; if that would not move the cursor
        # forward (large overlap after a line-boundary cut), drop the overlap.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
|
|
|
|
|
|
def build_index_summary(documents: Iterable[ProjectDocument]) -> dict[str, Any]:
    """Aggregate simple statistics over a collection of documents.

    Returns a dict with the number of documents, the total character count,
    a per-``source_type`` document tally, and a fixed next-step hint.
    """
    # Materialise once: the input may be a one-shot iterator and is
    # traversed twice below.
    materialized = tuple(documents)

    tally: dict[str, int] = {}
    for entry in materialized:
        tally.setdefault(entry.source_type, 0)
        tally[entry.source_type] += 1

    char_total = sum(len(entry.content) for entry in materialized)

    return {
        "documents": len(materialized),
        "total_chars": char_total,
        "by_type": tally,
        "recommended_next_step": "Generate embeddings and upsert into Supabase project_chunks.",
    }
|
|
|
|
|
|
def naive_search(documents: Iterable[ProjectDocument], query: str, limit: int = 10) -> list[dict[str, Any]]:
    """Rank documents by raw keyword occurrence counts.

    The query is split on whitespace; words of two characters or fewer are
    ignored.  A document's score is the total number of case-insensitive
    occurrences of the remaining words in its path and content.  Documents
    scoring zero are dropped, and the top ``limit`` results are returned as
    ``{"score": ..., **doc.to_dict()}`` dictionaries, best first.
    """
    keywords = [word.lower() for word in query.split() if len(word) > 2]

    def relevance(candidate: ProjectDocument) -> int:
        haystack = f"{candidate.path}\n{candidate.content}".lower()
        return sum(haystack.count(word) for word in keywords)

    ranked = [(relevance(candidate), candidate) for candidate in documents]
    ranked = [pair for pair in ranked if pair[0]]
    # Stable sort: documents with equal scores keep their input order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    results: list[dict[str, Any]] = []
    for points, candidate in ranked[:limit]:
        results.append({"score": points, **candidate.to_dict()})
    return results
|
|
|
|
|
|
def explain_project_intelligence_stack() -> dict[str, Any]:
    """Return a static description of the project-intelligence stack.

    The result is a fresh dictionary on every call, covering purpose,
    storage, embedding settings, search modes, and intended uses.
    """
    model_options = ["gte-small local/edge", "OpenAI text-embedding-3-small", "bge-small"]
    modes = ["keyword", "semantic", "hybrid", "relationship-aware"]
    use_cases = [
        "Ask what is missing before launch",
        "Find files related to a feature",
        "Generate implementation plans grounded in code",
        "Power Sami Personal Operator memory",
        "Let agents understand project relationships before editing",
    ]

    overview: dict[str, Any] = {}
    overview["purpose"] = "Make Dealix understand its own codebase, docs, strategy, and relationships."
    overview["storage"] = "Supabase Postgres + pgvector"
    overview["embedding_dimensions"] = 384
    overview["embedding_model_options"] = model_options
    overview["search_modes"] = modes
    overview["best_use"] = use_cases
    return overview
|
|
|
|
|
|
def should_block_embedding(text: str) -> tuple[bool, str]:
    """Tell the caller whether ``text`` must be kept out of the embedding store.

    Returns ``(True, "secret_pattern_detected")`` when the project's
    ``looks_like_secret`` helper flags the text, otherwise ``(False, "")``.
    """
    # Imported at call time rather than at module import.
    from auto_client_acquisition.personal_operator.memory import looks_like_secret

    if not looks_like_secret(text):
        return False, ""
    return True, "secret_pattern_detected"
|
|
|
|
|
|
def answer_operator_question(
    question: str,
    *,
    root: str | Path = ".",
    deep_scan: bool = False,
) -> dict[str, Any]:
    """Return a canned, grounded answer for a Personal Operator question.

    The question is matched against a fixed, ordered list of Arabic and
    English keywords; the first matching topic selects the Arabic answer,
    and a general default is used when nothing matches.

    Args:
        question: Free-text question in Arabic and/or English.
        root: Project directory to scan when ``deep_scan`` is enabled.
        deep_scan: When true, scan ``root`` and attach up to five keyword
            search hits for the question.

    Returns:
        A dict with the original question, the Arabic answer, a note on the
        status of semantic search, a fixed list of related files, and the
        search hits (empty unless ``deep_scan`` is set).
    """
    import re

    q = question.strip().lower()
    # "pr" must match as a whole word; a plain substring test also fires on
    # "project", "production", "approve", ... and hides the later topics.
    mentions_pr = re.search(r"\bprs?\b", q) is not None

    note_ar = (
        "البحث الدلالي غير متصل بعد؛ نستخدم مخطط المشروع المعروف والوحدات الأساسية. "
        "semantic search not connected yet; using project blueprint and known modules."
    )
    answer_ar = (
        "ركّز على: Personal Operator API، ذاكرة المشروع (Supabase/pgvector)، اختبارات، "
        "واتساب موافقات، Gmail/Calendar كمسودات فقط، ثم pilot لـ 10 عملاء."
    )
    related_files: list[str] = [
        "api/main.py",
        "api/routers/personal_operator.py",
        "api/routers/v3.py",
        "auto_client_acquisition/personal_operator/operator.py",
        "auto_client_acquisition/v3/project_intelligence.py",
    ]

    # Topic order is significant: the first matching branch wins.
    if "ناقص" in question or "missing" in q:
        answer_ar = (
            "قبل التدشين: ربط embeddings بـ Supabase، سياسات RLS، تدفق واتساب بأزرار، "
            "Gmail draft + Calendar draft بموافقة، تثبيت الاختبارات، ومراقبة (Sentry/OTel)."
        )
    elif "خطوة" in question or "next" in q:
        answer_ar = "الخطوة التالية العملية: شغّل فهرسة المشروع، راجع تقرير الجاهزية، ثم ربط pilot مع قائمة 10 مؤسسين."
    elif "ملف" in question or "files" in q or mentions_pr:
        answer_ar = "أهم الملفات: `api/routers/personal_operator.py`, `auto_client_acquisition/personal_operator/`, `supabase/migrations/`."
    elif "supabase" in q:
        answer_ar = "أفضل مسار: Postgres + pgvector + Edge Function للـ embeddings، ومفتاح الخدمة فقط في السيرفر وليس في الواجهة."
    elif "whatsapp" in q or "واتساب" in question or "buttons" in q:
        answer_ar = "استخدم رسالتين كحد أقصى 3 أزرار لكل رسالة: قبول/تخطي/رسالة ثم اعتماد/تعديل/إلغاء. لا إرسال بارد."
    elif "personal operator" in q or "مشغل" in question or "operator" in q:
        answer_ar = "Personal Operator: daily brief + فرص + قرارات + مسودات برسالة عربية وموافقة صريحة قبل أي إرسال خارجي."

    search_hits: list[dict[str, Any]] = []
    if deep_scan:
        # Optional keyword search over the files on disk.
        docs = scan_project(root)
        search_hits = naive_search(docs, question, limit=5)

    return {
        "question": question,
        "answer_ar": answer_ar,
        "semantic_status_ar": note_ar,
        "related_files": related_files,
        "search_hits": search_hits,
    }
|