FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

# Build-time only: keeps apt/debconf from prompting during the RUN steps in
# this stage. Declared as ARG rather than ENV so the setting is visible to
# RUN commands but is not baked into the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment for the app process:
#   PYTHONUNBUFFERED    - Python writes stdout/stderr without buffering.
#   GRADIO_SERVER_NAME  - bind address for the Gradio server (all interfaces).
#   GRADIO_SERVER_PORT  - Gradio server port; same value as the EXPOSE'd port.
ENV PYTHONUNBUFFERED=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860

# System dependencies: the Python 3.11 interpreter/headers/venv, pip, git with
# git-lfs, download tools, and a C/C++ build toolchain. Packages are listed
# alphabetically; the apt lists are removed in the same layer to keep it small.
#
# `git lfs install --system` writes the LFS filter configuration to the
# system-wide git config instead of the building user's (root's) global
# config, so it still applies to the non-root user the container runs as.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    git \
    git-lfs \
    python3-pip \
    python3.11 \
    python3.11-dev \
    python3.11-venv \
    wget \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install --system

# Register Python 3.11 (priority 1) as the target of both the `python3` and
# the `python` commands through the alternatives system.
RUN set -e; \
    for name in python3 python; do \
        update-alternatives --install "/usr/bin/${name}" "${name}" /usr/bin/python3.11 1; \
    done

# Bring the packaging toolchain (pip, setuptools, wheel) up to date before
# any other Python package is installed.
RUN python -m pip install \
    --upgrade \
    --no-cache-dir \
    pip \
    setuptools \
    wheel

# PyTorch stack, resolved against the PyTorch wheel index for CUDA 12.4
# (cu124) rather than the default package index.
RUN pip install --no-cache-dir \
    --index-url https://download.pytorch.org/whl/cu124 \
    torch \
    torchvision \
    torchaudio

# transformers comes straight from the `main` branch because the qwen3_next
# model type is not part of any stable release yet.
# NOTE(review): `@main` is a moving target, so rebuilds are not reproducible;
# consider pinning a commit SHA once a known-good revision is identified.
RUN pip install --no-cache-dir \
    "git+https://github.com/huggingface/transformers.git@main"

# Optional performance packages: flash-attn, causal-conv1d and
# flash-linear-attention. Compiling them from source runs the HF Spaces
# builder (~16 GB RAM) out of memory, so each install is restricted to
# pre-built binary wheels (--only-binary :all:). The steps are best-effort:
# pip's stderr is discarded and a failed install only prints a notice, so the
# build continues without that package.
RUN if ! pip install --no-cache-dir --only-binary :all: flash-attn 2>/dev/null; then \
        echo "flash-attn: no pre-built wheel, skipping"; \
    fi
RUN if ! pip install --no-cache-dir --only-binary :all: causal-conv1d 2>/dev/null; then \
        echo "causal-conv1d: no pre-built wheel, skipping"; \
    fi
RUN if ! pip install --no-cache-dir --only-binary :all: flash-linear-attention 2>/dev/null; then \
        echo "flash-linear-attention: no pre-built wheel, skipping"; \
    fi

# Add the unprivileged account (UID 1000, with a home directory) up front,
# before the later steps that hand files and directories over to it.
RUN useradd --create-home --uid 1000 user

# Application root; WORKDIR creates the directory if it does not exist.
WORKDIR /app

# Install the Python requirements from the manifest alone, ahead of copying
# the full source, so this layer is only rebuilt when requirements.txt changes.
# NOTE(review): the git build of transformers installed earlier is kept only
# if it satisfies whatever requirements.txt asks for - confirm the file does
# not pin a conflicting version.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt

# Copy the application sources already owned by the runtime user. Setting the
# owner at copy time avoids a follow-up recursive chown over /app, which would
# rewrite every copied file into a second image layer.
COPY --chown=user:user . .

# Create the cache/output directories under the user's home (NOT /tmp) and
# hand them over to the runtime user. /app itself was created by WORKDIR as
# root, so only the directory entry needs a (non-recursive) chown to let the
# app create new files there.
RUN mkdir -p /home/user/hf_cache /home/user/torch_cache /home/user/output /home/user/merged \
    && chown user:user /app \
    && chown -R user:user /home/user

# Runtime environment, all rooted in the user's home directory:
#   HOME / PATH              - user home, with its ~/.local/bin first on PATH
#   HF_HOME, TORCH_HOME      - Hugging Face and torch cache locations
#   TRANSFORMERS_CACHE       - same directory as HF_HOME
#   PYTORCH_CUDA_ALLOC_CONF  - expandable segments, to limit CUDA memory
#                              fragmentation
# NOTE(review): newer transformers releases reportedly deprecate
# TRANSFORMERS_CACHE in favor of HF_HOME - confirm whether it is still needed.
ENV HOME=/home/user \
    HF_HOME=/home/user/hf_cache \
    TRANSFORMERS_CACHE=/home/user/hf_cache \
    TORCH_HOME=/home/user/torch_cache \
    PATH="/home/user/.local/bin:$PATH" \
    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Drop root privileges: the container process runs as the unprivileged user.
USER user

# Documents the listening port (same value as GRADIO_SERVER_PORT); EXPOSE
# does not publish the port by itself.
EXPOSE 7860/tcp

# Exec-form command: runs app.py from the working directory with no
# intermediate shell.
CMD ["python", "app.py"]