Ver Fonte

Deploy Qwen3-Coder-Next uncensored fine-tuner

Sameric há 4 meses
pai
commit
494d4838fd
4 ficheiros alterados com 62 adições e 21 exclusões
  1. 8 1
      Dockerfile
  2. 4 8
      config.yaml
  3. 1 1
      requirements.txt
  4. 49 11
      train.py

+ 8 - 1
Dockerfile

@@ -29,16 +29,23 @@ RUN python -m pip install --no-cache-dir --upgrade pip setuptools wheel
 # Install PyTorch with CUDA 12.4
 RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
 
+# Install transformers from git main (qwen3_next model type not in any stable release yet)
+RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@main"
+
 # Install flash-attention for faster training
 RUN pip install --no-cache-dir flash-attn --no-build-isolation 2>/dev/null || echo "Flash attention build failed, continuing without it"
 
+# Install causal-conv1d and flash-linear-attention for DeltaNet layers (optional but recommended)
+RUN pip install --no-cache-dir causal-conv1d 2>/dev/null || echo "causal-conv1d build failed, continuing without it"
+RUN pip install --no-cache-dir flash-linear-attention 2>/dev/null || echo "flash-linear-attention build failed, continuing without it"
+
 # Create non-root user FIRST
 RUN useradd -m -u 1000 user
 
 # Create app directory
 WORKDIR /app
 
-# Copy requirements and install
+# Copy requirements and install (transformers is already installed from git above, so pip should skip it)
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 

+ 4 - 8
config.yaml

@@ -19,14 +19,10 @@ quantization:
 lora:
   r: 64
   lora_alpha: 128
-  target_modules:
-    - "q_proj"
-    - "k_proj"
-    - "v_proj"
-    - "o_proj"
-    - "gate_proj"
-    - "up_proj"
-    - "down_proj"
+  # Use "all-linear" to auto-discover all linear layers in the hybrid
+  # DeltaNet + Gated Attention + MoE architecture (safer than listing
+  # individual module names which may differ from standard Qwen).
+  target_modules: "all-linear"
   lora_dropout: 0.05
   bias: "none"
   task_type: "CAUSAL_LM"

+ 1 - 1
requirements.txt

@@ -1,5 +1,5 @@
 torch>=2.4.0
-transformers>=4.48.0
+transformers @ git+https://github.com/huggingface/transformers.git@main
 accelerate>=1.2.0
 peft>=0.14.0
 trl>=0.14.0

+ 49 - 11
train.py

@@ -332,7 +332,7 @@ def train(
     # -----------------------------------------------------------------------
     write_status(
         "loading_model",
-        "Loading Qwen3-Coder-Next in 4-bit quantization... (this takes a while)",
+        "Loading model in 4-bit quantization... (this takes a while)",
     )
 
     q_cfg = config["quantization"]
@@ -355,18 +355,51 @@ def train(
         attn_impl = "eager"
     logger.info(f"Using attention implementation: {attn_impl}")
 
+    # Log transformers version to confirm qwen3_next support
+    import transformers
+
+    logger.info(f"transformers version: {transformers.__version__}")
+
+    # Pre-quantized fallback model for Qwen3-Next architecture
+    PRE_QUANTIZED_FALLBACK = "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit"
+
     # Force ALL layers onto GPU 0. device_map="auto" is too conservative
     # with large MoE models and offloads to CPU where bnb 4-bit can't run.
     # 80B params in 4-bit ≈ 40GB — fits comfortably on A100 80GB.
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map={"": 0},
-        trust_remote_code=config["model"]["trust_remote_code"],
-        torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
-        token=hf_token,
-        attn_implementation=attn_impl,
-    )
+    try:
+        logger.info(
+            f"Attempting to load {model_name} with on-the-fly 4-bit quantization..."
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=bnb_config,
+            device_map={"": 0},
+            trust_remote_code=config["model"]["trust_remote_code"],
+            torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
+            token=hf_token,
+            attn_implementation=attn_impl,
+        )
+        logger.info(f"Successfully loaded {model_name} with 4-bit quantization")
+    except Exception as e:
+        logger.warning(f"On-the-fly 4-bit quantization failed for {model_name}: {e}")
+        logger.info(f"Falling back to pre-quantized model: {PRE_QUANTIZED_FALLBACK}")
+        write_status(
+            "loading_model",
+            f"On-the-fly quantization failed, loading pre-quantized fallback...",
+        )
+        # Pre-quantized model already has bnb 4-bit weights baked in —
+        # do NOT pass quantization_config again, just load directly.
+        model = AutoModelForCausalLM.from_pretrained(
+            PRE_QUANTIZED_FALLBACK,
+            device_map={"": 0},
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+            token=hf_token,
+            attn_implementation=attn_impl,
+        )
+        logger.info(
+            f"Successfully loaded pre-quantized fallback: {PRE_QUANTIZED_FALLBACK}"
+        )
 
     model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
     logger.info("Model loaded and prepared for k-bit training")
@@ -377,10 +410,15 @@ def train(
     write_status("loading_model", "Applying LoRA adapters...")
 
     lora_cfg = config["lora"]
+    target_modules = lora_cfg["target_modules"]
+    # target_modules can be a list of strings, the string "all-linear", or a single module name (wrapped in a list below)
+    if isinstance(target_modules, str) and target_modules != "all-linear":
+        target_modules = [target_modules]
+
     lora_config = LoraConfig(
         r=lora_cfg["r"],
         lora_alpha=lora_cfg["lora_alpha"],
-        target_modules=lora_cfg["target_modules"],
+        target_modules=target_modules,
         lora_dropout=lora_cfg["lora_dropout"],
         bias=lora_cfg["bias"],
         task_type=lora_cfg["task_type"],