4 months ago · 494d4838fd
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,16 +29,23 @@ RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel
 
															 # Install PyTorch with CUDA 12.4
														
 
															 RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
														
 
															+# Install transformers from git main (qwen3_next model type not in any stable release yet)
														
 
															+RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@main"
														
 
															+
														
 
															 # Install flash-attention for faster training
														
 
															 RUN pip install --no-cache-dir flash-attn --no-build-isolation 2>/dev/null || echo "Flash attention build failed, continuing without it"
														
 
															+# Install causal-conv1d and flash-linear-attention for DeltaNet layers (optional but recommended)
														
 
															+RUN pip install --no-cache-dir causal-conv1d 2>/dev/null || echo "causal-conv1d build failed, continuing without it"
														
 
															+RUN pip install --no-cache-dir flash-linear-attention 2>/dev/null || echo "flash-linear-attention build failed, continuing without it"
														
 
															+
														
 
															 # Create non-root user FIRST
														
 
															 RUN useradd -m -u 1000 user
														
 
															 # Create app directory
														
 
															 WORKDIR /app
														
 
															-# Copy requirements and install
														
 
															+# Copy requirements and install (transformers is already installed from git above, so pip should not need to reinstall it)
														
 
															 COPY requirements.txt .
														
 
															 RUN pip install --no-cache-dir -r requirements.txt
														
--- a/config.yaml
+++ b/config.yaml
@@ -19,14 +19,10 @@ quantization:
 
															 lora:
														
 
															   r: 64
														
 
															   lora_alpha: 128
														
 
															-  target_modules:
														
 
															-    - "q_proj"
														
 
															-    - "k_proj"
														
 
															-    - "v_proj"
														
 
															-    - "o_proj"
														
 
															-    - "gate_proj"
														
 
															-    - "up_proj"
														
 
															-    - "down_proj"
														
 
															+  # Use "all-linear" to auto-discover all linear layers in the hybrid
														
 
															+  # DeltaNet + Gated Attention + MoE architecture (safer than listing
														
 
															+  # individual module names which may differ from standard Qwen).
														
 
															+  target_modules: "all-linear"
														
 
															   lora_dropout: 0.05
														
 
															   bias: "none"
														
 
															   task_type: "CAUSAL_LM"
														
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 
															 torch>=2.4.0
														
 
															-transformers>=4.48.0
														
 
															+transformers @ git+https://github.com/huggingface/transformers.git@main
														
 
															 accelerate>=1.2.0
														
 
															 peft>=0.14.0
														
 
															 trl>=0.14.0
														
--- a/train.py
+++ b/train.py
@@ -332,7 +332,7 @@ def train(
 
															     # -----------------------------------------------------------------------
														
 
															     write_status(
														
 
															         "loading_model",
														
 
															-        "Loading Qwen3-Coder-Next in 4-bit quantization... (this takes a while)",
														
 
															+        "Loading model in 4-bit quantization... (this takes a while)",
														
 
															     )
														
 
															     q_cfg = config["quantization"]
														
@@ -355,18 +355,51 @@ def train(
 
															         attn_impl = "eager"
														
 
															     logger.info(f"Using attention implementation: {attn_impl}")
														
 
															+    # Log transformers version to confirm qwen3_next support
														
 
															+    import transformers
														
 
															+
														
 
															+    logger.info(f"transformers version: {transformers.__version__}")
														
 
															+
														
 
															+    # Pre-quantized fallback model for Qwen3-Next architecture
														
 
															+    PRE_QUANTIZED_FALLBACK = "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit"
														
 
															+
														
 
															     # Force ALL layers onto GPU 0. device_map="auto" is too conservative
														
 
															     # with large MoE models and offloads to CPU where bnb 4-bit can't run.
														
 
															     # 80B params in 4-bit ≈ 40GB — fits comfortably on A100 80GB.
														
 
															-    model = AutoModelForCausalLM.from_pretrained(
														
 
															-        model_name,
														
 
															-        quantization_config=bnb_config,
														
 
															-        device_map={"": 0},
														
 
															-        trust_remote_code=config["model"]["trust_remote_code"],
														
 
															-        torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
														
 
															-        token=hf_token,
														
 
															-        attn_implementation=attn_impl,
														
 
															-    )
														
 
															+    try:
														
 
															+        logger.info(
														
 
															+            f"Attempting to load {model_name} with on-the-fly 4-bit quantization..."
														
 
															+        )
														
 
															+        model = AutoModelForCausalLM.from_pretrained(
														
 
															+            model_name,
														
 
															+            quantization_config=bnb_config,
														
 
															+            device_map={"": 0},
														
 
															+            trust_remote_code=config["model"]["trust_remote_code"],
														
 
															+            torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
														
 
															+            token=hf_token,
														
 
															+            attn_implementation=attn_impl,
														
 
															+        )
														
 
															+        logger.info(f"Successfully loaded {model_name} with 4-bit quantization")
														
 
															+    except Exception as e:
														
 
															+        logger.warning(f"On-the-fly 4-bit quantization failed for {model_name}: {e}")
														
 
															+        logger.info(f"Falling back to pre-quantized model: {PRE_QUANTIZED_FALLBACK}")
														
 
															+        write_status(
														
 
															+            "loading_model",
														
 
															+            f"On-the-fly quantization failed, loading pre-quantized fallback...",
														
 
															+        )
														
 
															+        # Pre-quantized model already has bnb 4-bit weights baked in —
														
 
															+        # do NOT pass quantization_config again, just load directly.
														
 
															+        model = AutoModelForCausalLM.from_pretrained(
														
 
															+            PRE_QUANTIZED_FALLBACK,
														
 
															+            device_map={"": 0},
														
 
															+            trust_remote_code=True,
														
 
															+            torch_dtype=torch.bfloat16,
														
 
															+            token=hf_token,
														
 
															+            attn_implementation=attn_impl,
														
 
															+        )
														
 
															+        logger.info(
														
 
															+            f"Successfully loaded pre-quantized fallback: {PRE_QUANTIZED_FALLBACK}"
														
 
															+        )
														
 
															     model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
														
 
															     logger.info("Model loaded and prepared for k-bit training")
														
@@ -377,10 +410,15 @@ def train(
 
															     write_status("loading_model", "Applying LoRA adapters...")
														
 
															     lora_cfg = config["lora"]
														
 
															+    target_modules = lora_cfg["target_modules"]
														
 
															+    # target_modules: a list of module names or "all-linear"; any other bare string is wrapped in a list
														
 
															+    if isinstance(target_modules, str) and target_modules != "all-linear":
														
 
															+        target_modules = [target_modules]
														
 
															+
														
 
															     lora_config = LoraConfig(
														
 
															         r=lora_cfg["r"],
														
 
															         lora_alpha=lora_cfg["lora_alpha"],
														
 
															-        target_modules=lora_cfg["target_modules"],
														
 
															+        target_modules=target_modules,
														
 
															         lora_dropout=lora_cfg["lora_dropout"],
														
 
															         bias=lora_cfg["bias"],
														
 
															         task_type=lora_cfg["task_type"],