fix(docker): CPU-only torch + multi-stage build to fix Railway 4GB limit

Railway build was failing with "Image of size 5.7 GB exceeded limit of 4.0 GB" because sentence-transformers pulled in torch with the full CUDA/NVIDIA GPU packages (~3 GB). Fix: a multi-stage Dockerfile plus Railway config: 1. Installs CPU-only torch first (--index-url https://download.pytorch.org/whl/cpu), saving ~3 GB (~200 MB CPU vs ~3.2 GB CUDA) 2. Multi-stage build: builder + runtime (smaller final image) 3. Non-root user (app, UID/GID 1000) 4. tini init for proper signal handling 5. Built-in HEALTHCHECK with 60s start-period 6. New railway.toml with healthcheck path and restart policy. Also fixes the healthcheck failure: the Docker HEALTHCHECK start-period=60s and the railway.toml healthcheckTimeout=120 give the app time to initialize before failed checks of /api/v1/health count against it. Expected image size: ~2 GB (down from 5.7 GB). https://claude.ai/code/session_01W1rJthWDkasijTdXCfxVHs
2026-06-17 23:09:35 +00:00 · 2026-04-23 13:33:09 +00:00 · 2026-04-23 13:33:09 +00:00 · dd8d2ccc9e
commit dd8d2ccc9e
parent 8760078f45
2 changed files with 47 additions and 8 deletions
--- a/salesflow-saas/backend/Dockerfile
+++ b/salesflow-saas/backend/Dockerfile
@ -1,16 +1,63 @
-FROM python:3.12-slim
-
-WORKDIR /app
+# ── Stage 1: Builder ──────────────────────────────────
+# Installs all Python dependencies into a virtualenv at /opt/venv. Only that
+# directory is copied into the runtime stage, so the compiler toolchain and
+# -dev headers installed here never reach the final image.
+FROM python:3.12-slim AS builder

 RUN apt-get update && apt-get install -y --no-install-recommends \
-    gcc libpq-dev curl \
+    build-essential libpq-dev curl \
    && rm -rf /var/lib/apt/lists/*

-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+WORKDIR /build

-COPY . .
+# Create the virtualenv and put its bin directory first on PATH so the `pip`
+# commands below install into /opt/venv.
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+COPY requirements.txt ./
+
+# Install CPU-only torch first (saves ~3 GB vs CUDA version). The CPU wheel
+# index is also given as an extra index for the requirements install so that,
+# if requirements.txt pins a different torch version, pip can still pick a
+# "+cpu" build instead of falling back to the default CUDA wheel from PyPI.
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
+
+# ── Stage 2: Runtime ─────────────────────────────────
+# Fresh slim base holding only runtime libraries and the prebuilt virtualenv.
+FROM python:3.12-slim AS runtime
+
+# curl is used by the HEALTHCHECK below and tini by the ENTRYPOINT; libpq5 is
+# the runtime counterpart of the libpq-dev headers used in the builder stage.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libpq5 curl tini \
+    && rm -rf /var/lib/apt/lists/*
+
+# Unprivileged account (fixed UID/GID 1000) that the service runs as.
+RUN groupadd --gid 1000 app \
+    && useradd --uid 1000 --gid app --shell /bin/bash --create-home app
+
+COPY --from=builder /opt/venv /opt/venv
+# PYTHONUNBUFFERED=1 disables output buffering so logs appear immediately;
+# PYTHONDONTWRITEBYTECODE=1 stops Python from writing .pyc files at runtime.
+ENV PATH="/opt/venv/bin:$PATH" \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+
+WORKDIR /app
+COPY --chown=app:app . .
+
+USER app

 EXPOSE 8000

-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+# Probe the health endpoint; failures during the 60s start period do not count
+# toward the retry limit, giving the app time to initialize.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8000/api/v1/health || exit 1
+
+# tini runs as PID 1, forwarding signals to uvicorn and reaping child processes.
+ENTRYPOINT ["tini", "--"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
--- a/salesflow-saas/backend/railway.toml
+++ b/salesflow-saas/backend/railway.toml
@ -0,0 +1,14 @
+[build]
+dockerfilePath = "Dockerfile"
+
+[deploy]
+healthcheckPath = "/api/v1/health"
+healthcheckTimeout = 120
+# With a Dockerfile build, Railway runs the start command in exec form, which
+# performs no shell variable expansion, so a bare `${PORT:-8000}` would reach
+# uvicorn as a literal string. Wrap the command in `sh -c` so PORT is expanded,
+# and `exec` so uvicorn replaces the shell. The start command also overrides
+# the image ENTRYPOINT, so tini is invoked explicitly to keep it as the init.
+startCommand = 'tini -- /bin/sh -c "exec uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000} --workers 2"'
+restartPolicyType = "ON_FAILURE"
+restartPolicyMaxRetries = 3