4 місяців тому · 693efc83be
--- a/config.yaml
+++ b/config.yaml
@@ -1,31 +1,26 @@
 
				 # =============================================================================
			
 
				-# Qwen3-Coder-Next Uncensored Fine-Tuning Configuration
			
 
				+# Qwen3 Uncensored Fine-Tuning Configuration (Unsloth)
			
 
				 # =============================================================================
			
 
				 
			
 
				-# Model
			
 
				+# Model — Unsloth handles quantization internally
			
 
				 model:
			
 
				-  name: "Qwen/Qwen3-Coder-Next"
			
 
				+  name: "unsloth/Qwen3-30B-A3B"
			
 
				   trust_remote_code: true
			
 
				-  torch_dtype: "bfloat16"
			
 
				-
			
 
				-# Quantization (QLoRA 4-bit)
			
 
				-quantization:
			
 
				-  load_in_4bit: true
			
 
				-  bnb_4bit_quant_type: "nf4"
			
 
				-  bnb_4bit_compute_dtype: "bfloat16"
			
 
				-  bnb_4bit_use_double_quant: true
			
 
				 
			
 
				 # LoRA Configuration
			
 
				 lora:
			
 
				-  r: 64
			
 
				-  lora_alpha: 128
			
 
				-  # Use "all-linear" to auto-discover all linear layers in the hybrid
			
 
				-  # DeltaNet + Gated Attention + MoE architecture (safer than listing
			
 
				-  # individual module names which may differ from standard Qwen).
			
 
				-  target_modules: "all-linear"
			
 
				-  lora_dropout: 0.05
			
 
				+  r: 16
			
 
				+  lora_alpha: 32
			
 
				+  target_modules:
			
 
				+    - q_proj
			
 
				+    - k_proj
			
 
				+    - v_proj
			
 
				+    - o_proj
			
 
				+    - gate_proj
			
 
				+    - up_proj
			
 
				+    - down_proj
			
 
				+  lora_dropout: 0
			
 
				   bias: "none"
			
 
				-  task_type: "CAUSAL_LM"
			
 
				 
			
 
				 # Dataset options (pick one or provide custom)
			
 
				 datasets:
			
@@ -70,7 +65,7 @@ training:
 
				   output_dir: "/home/user/output"
			
 
				   num_train_epochs: 2
			
 
				   per_device_train_batch_size: 1
			
 
				-  gradient_accumulation_steps: 16
			
 
				+  gradient_accumulation_steps: 8
			
 
				   learning_rate: 0.0002
			
 
				   lr_scheduler_type: "cosine"
			
 
				   warmup_ratio: 0.05
			
@@ -82,16 +77,8 @@ training:
 
				   save_strategy: "steps"
			
 
				   save_steps: 50
			
 
				   save_total_limit: 3
			
 
				-  max_seq_length: 2048
			
 
				-  gradient_checkpointing: true
			
 
				-  gradient_checkpointing_kwargs:
			
 
				-    use_reentrant: false
			
 
				-  optim: "paged_adamw_8bit"
			
 
				-  report_to: "wandb"
			
 
				+  max_seq_length: 512
			
 
				   seed: 42
			
 
				-  dataloader_num_workers: 4
			
 
				-  dataloader_pin_memory: true
			
 
				-  packing: true
			
 
				 
			
 
				 # Hub push settings
			
 
				 hub: