From dd8d2ccc9e8dcb4866185a7780beebfeab52a447 Mon Sep 17 00:00:00 2001
From: Claude
Date: Thu, 23 Apr 2026 13:33:09 +0000
Subject: [PATCH] fix(docker): CPU-only torch + multi-stage build to fix Railway 4GB limit

Railway build was failing with "Image of size 5.7 GB exceeded limit of 4.0 GB"
because sentence-transformers pulled torch with full CUDA/NVIDIA GPU packages
(~3 GB).

Fix: multi-stage Dockerfile that:
1. Installs CPU-only torch first (--index-url pytorch.org/whl/cpu)
   saving ~3 GB (200 MB CPU vs 3.2 GB CUDA)
2. Multi-stage build: builder + runtime (smaller final image)
3. Non-root user (app:1000)
4. tini init for proper signal handling
5. Built-in HEALTHCHECK with 60s start-period
6. railway.toml with healthcheck path and restart policy

Also fixes healthcheck failure: start-period=60s gives the app time to
initialize before Railway starts checking /api/v1/health.

The image CMD and HEALTHCHECK read $PORT (default 8000), so the port injected
by Railway is honoured without a startCommand override and tini stays PID 1.

Expected image size: ~2 GB (down from 5.7 GB).

https://claude.ai/code/session_01W1rJthWDkasijTdXCfxVHs
---
 salesflow-saas/backend/Dockerfile   | 48 +++++++++++++++++++++++++-----
 salesflow-saas/backend/railway.toml |  9 ++++++
 2 files changed, 49 insertions(+), 8 deletions(-)
 create mode 100644 salesflow-saas/backend/railway.toml

diff --git a/salesflow-saas/backend/Dockerfile b/salesflow-saas/backend/Dockerfile
index 27510ee4..8432bdc5 100644
--- a/salesflow-saas/backend/Dockerfile
+++ b/salesflow-saas/backend/Dockerfile
@@ -1,16 +1,48 @@
-FROM python:3.12-slim
-
-WORKDIR /app
+# ── Stage 1: Builder ──────────────────────────────────
+FROM python:3.12-slim AS builder
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    gcc libpq-dev curl \
+    build-essential libpq-dev curl \
     && rm -rf /var/lib/apt/lists/*
 
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+WORKDIR /build
 
-COPY . .
+RUN python -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+COPY requirements.txt ./
+
+# Install CPU-only torch first (saves ~3 GB vs CUDA version)
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
+    && pip install --no-cache-dir -r requirements.txt
+
+# ── Stage 2: Runtime ─────────────────────────────────
+FROM python:3.12-slim AS runtime
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libpq5 curl tini \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN groupadd --gid 1000 app \
+    && useradd --uid 1000 --gid app --shell /bin/bash --create-home app
+
+COPY --from=builder /opt/venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH" \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+
+WORKDIR /app
+COPY --chown=app:app . .
+
+USER app
 
 EXPOSE 8000
 
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
+# Probe the same port the server binds: $PORT when the platform sets it, else 8000.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f "http://localhost:${PORT:-8000}/api/v1/health" || exit 1
+
+# Exec-form CMD does no variable expansion, so sh -c expands $PORT; exec then replaces the shell with uvicorn under tini.
+ENTRYPOINT ["tini", "--"]
+CMD ["sh", "-c", "exec uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-8000} --workers 2"]
diff --git a/salesflow-saas/backend/railway.toml b/salesflow-saas/backend/railway.toml
new file mode 100644
index 00000000..66a8bbd5
--- /dev/null
+++ b/salesflow-saas/backend/railway.toml
@@ -0,0 +1,9 @@
+[build]
+dockerfilePath = "Dockerfile"
+
+[deploy]
+healthcheckPath = "/api/v1/health"
+healthcheckTimeout = 120
+# No startCommand on purpose: the image's ENTRYPOINT/CMD (tini + uvicorn) already binds $PORT.
+restartPolicyType = "ON_FAILURE"
+restartPolicyMaxRetries = 3