há 4 meses atrás · 494d4838fd
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,16 +29,23 @@ RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel
 
				 # Install PyTorch with CUDA 12.4
			
 
				 RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
			
 
				 
			
 
				+# Install transformers from git main (qwen3_next model type not in any stable release yet)
			
 
				+RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@main"
			
 
				+
			
 
				 # Install flash-attention for faster training
			
 
				 RUN pip install --no-cache-dir flash-attn --no-build-isolation 2>/dev/null || echo "Flash attention build failed, continuing without it"
			
 
				 
			
 
				+# Install causal-conv1d and flash-linear-attention for DeltaNet layers (optional but recommended)
			
 
				+RUN pip install --no-cache-dir causal-conv1d 2>/dev/null || echo "causal-conv1d build failed, continuing without it"
			
 
				+RUN pip install --no-cache-dir flash-linear-attention 2>/dev/null || echo "flash-linear-attention build failed, continuing without it"
			
 
				+
			
 
				 # Create non-root user FIRST
			
 
				 RUN useradd -m -u 1000 user
			
 
				 
			
 
				 # Create app directory
			
 
				 WORKDIR /app
			
 
				 
			
 
				-# Copy requirements and install
			
 
				+# Copy requirements and install (transformers is already installed from git above, so pip should skip reinstalling it)
			
 
				 COPY requirements.txt .
			
 
				 RUN pip install --no-cache-dir -r requirements.txt
			
 
				 
			
--- a/config.yaml
+++ b/config.yaml
@@ -19,14 +19,10 @@ quantization:
 
				 lora:
			
 
				   r: 64
			
 
				   lora_alpha: 128
			
 
				-  target_modules:
			
 
				-    - "q_proj"
			
 
				-    - "k_proj"
			
 
				-    - "v_proj"
			
 
				-    - "o_proj"
			
 
				-    - "gate_proj"
			
 
				-    - "up_proj"
			
 
				-    - "down_proj"
			
 
				+  # Use "all-linear" to auto-discover all linear layers in the hybrid
			
 
				+  # DeltaNet + Gated Attention + MoE architecture (safer than listing
			
 
				+  # individual module names, which may differ from standard Qwen).
			
 
				+  target_modules: "all-linear"
			
 
				   lora_dropout: 0.05
			
 
				   bias: "none"
			
 
				   task_type: "CAUSAL_LM"
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 
				 torch>=2.4.0
			
 
				-transformers>=4.48.0
			
 
				+transformers @ git+https://github.com/huggingface/transformers.git@main
			
 
				 accelerate>=1.2.0
			
 
				 peft>=0.14.0
			
 
				 trl>=0.14.0
			
--- a/train.py
+++ b/train.py
@@ -332,7 +332,7 @@ def train(
 
				     # -----------------------------------------------------------------------
			
 
				     write_status(
			
 
				         "loading_model",
			
 
				-        "Loading Qwen3-Coder-Next in 4-bit quantization... (this takes a while)",
			
 
				+        "Loading model in 4-bit quantization... (this takes a while)",
			
 
				     )
			
 
				 
			
 
				     q_cfg = config["quantization"]
			
@@ -355,18 +355,51 @@ def train(
 
				         attn_impl = "eager"
			
 
				     logger.info(f"Using attention implementation: {attn_impl}")
			
 
				 
			
 
				+    # Log transformers version to confirm qwen3_next support
			
 
				+    import transformers
			
 
				+
			
 
				+    logger.info(f"transformers version: {transformers.__version__}")
			
 
				+
			
 
				+    # Pre-quantized fallback model for Qwen3-Next architecture
			
 
				+    PRE_QUANTIZED_FALLBACK = "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit"
			
 
				+
			
 
				     # Force ALL layers onto GPU 0. device_map="auto" is too conservative
			
 
				     # with large MoE models and offloads to CPU where bnb 4-bit can't run.
			
 
				     # 80B params in 4-bit ≈ 40GB — fits comfortably on A100 80GB.
			
 
				-    model = AutoModelForCausalLM.from_pretrained(
			
 
				-        model_name,
			
 
				-        quantization_config=bnb_config,
			
 
				-        device_map={"": 0},
			
 
				-        trust_remote_code=config["model"]["trust_remote_code"],
			
 
				-        torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
			
 
				-        token=hf_token,
			
 
				-        attn_implementation=attn_impl,
			
 
				-    )
			
 
				+    try:
			
 
				+        logger.info(
			
 
				+            f"Attempting to load {model_name} with on-the-fly 4-bit quantization..."
			
 
				+        )
			
 
				+        model = AutoModelForCausalLM.from_pretrained(
			
 
				+            model_name,
			
 
				+            quantization_config=bnb_config,
			
 
				+            device_map={"": 0},
			
 
				+            trust_remote_code=config["model"]["trust_remote_code"],
			
 
				+            torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
			
 
				+            token=hf_token,
			
 
				+            attn_implementation=attn_impl,
			
 
				+        )
			
 
				+        logger.info(f"Successfully loaded {model_name} with 4-bit quantization")
			
 
				+    except Exception as e:
			
 
				+        logger.warning(f"On-the-fly 4-bit quantization failed for {model_name}: {e}")
			
 
				+        logger.info(f"Falling back to pre-quantized model: {PRE_QUANTIZED_FALLBACK}")
			
 
				+        write_status(
			
 
				+            "loading_model",
			
 
				+            f"On-the-fly quantization failed, loading pre-quantized fallback...",
			
 
				+        )
			
 
				+        # Pre-quantized model already has bnb 4-bit weights baked in —
			
 
				+        # do NOT pass quantization_config again, just load directly.
			
 
				+        model = AutoModelForCausalLM.from_pretrained(
			
 
				+            PRE_QUANTIZED_FALLBACK,
			
 
				+            device_map={"": 0},
			
 
				+            trust_remote_code=True,
			
 
				+            torch_dtype=torch.bfloat16,
			
 
				+            token=hf_token,
			
 
				+            attn_implementation=attn_impl,
			
 
				+        )
			
 
				+        logger.info(
			
 
				+            f"Successfully loaded pre-quantized fallback: {PRE_QUANTIZED_FALLBACK}"
			
 
				+        )
			
 
				 
			
 
				     model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
			
 
				     logger.info("Model loaded and prepared for k-bit training")
			
@@ -377,10 +410,15 @@ def train(
 
				     write_status("loading_model", "Applying LoRA adapters...")
			
 
				 
			
 
				     lora_cfg = config["lora"]
			
 
				+    target_modules = lora_cfg["target_modules"]
			
 
				+    # target_modules can be a list of strings or the string "all-linear"; any other single string is wrapped in a one-element list
			
 
				+    if isinstance(target_modules, str) and target_modules != "all-linear":
			
 
				+        target_modules = [target_modules]
			
 
				+
			
 
				     lora_config = LoraConfig(
			
 
				         r=lora_cfg["r"],
			
 
				         lora_alpha=lora_cfg["lora_alpha"],
			
 
				-        target_modules=lora_cfg["target_modules"],
			
 
				+        target_modules=target_modules,
			
 
				         lora_dropout=lora_cfg["lora_dropout"],
			
 
				         bias=lora_cfg["bias"],
			
 
				         task_type=lora_cfg["task_type"],