|
@@ -1,31 +1,26 @@
|
|
|
# =============================================================================
|
|
# =============================================================================
|
|
|
-# Qwen3-Coder-Next Uncensored Fine-Tuning Configuration
|
|
|
|
|
|
|
+# Qwen3 Uncensored Fine-Tuning Configuration (Unsloth)
|
|
|
# =============================================================================
|
|
# =============================================================================
|
|
|
|
|
|
|
|
-# Model
|
|
|
|
|
|
|
+# Model — Unsloth handles quantization internally
|
|
|
model:
|
|
model:
|
|
|
- name: "Qwen/Qwen3-Coder-Next"
|
|
|
|
|
|
|
+ name: "unsloth/Qwen3-30B-A3B"
|
|
|
trust_remote_code: true
|
|
trust_remote_code: true
|
|
|
- torch_dtype: "bfloat16"
|
|
|
|
|
-
|
|
|
|
|
-# Quantization (QLoRA 4-bit)
|
|
|
|
|
-quantization:
|
|
|
|
|
- load_in_4bit: true
|
|
|
|
|
- bnb_4bit_quant_type: "nf4"
|
|
|
|
|
- bnb_4bit_compute_dtype: "bfloat16"
|
|
|
|
|
- bnb_4bit_use_double_quant: true
|
|
|
|
|
|
|
|
|
|
# LoRA Configuration
|
|
# LoRA Configuration
|
|
|
lora:
|
|
lora:
|
|
|
- r: 64
|
|
|
|
|
- lora_alpha: 128
|
|
|
|
|
- # Use "all-linear" to auto-discover all linear layers in the hybrid
|
|
|
|
|
- # DeltaNet + Gated Attention + MoE architecture (safer than listing
|
|
|
|
|
- # individual module names which may differ from standard Qwen).
|
|
|
|
|
- target_modules: "all-linear"
|
|
|
|
|
- lora_dropout: 0.05
|
|
|
|
|
|
|
+ r: 16
|
|
|
|
|
+ lora_alpha: 32
|
|
|
|
|
+ target_modules:
|
|
|
|
|
+ - q_proj
|
|
|
|
|
+ - k_proj
|
|
|
|
|
+ - v_proj
|
|
|
|
|
+ - o_proj
|
|
|
|
|
+ - gate_proj
|
|
|
|
|
+ - up_proj
|
|
|
|
|
+ - down_proj
|
|
|
|
|
+ lora_dropout: 0
|
|
|
bias: "none"
|
|
bias: "none"
|
|
|
- task_type: "CAUSAL_LM"
|
|
|
|
|
|
|
|
|
|
# Dataset options (pick one or provide custom)
|
|
# Dataset options (pick one or provide custom)
|
|
|
datasets:
|
|
datasets:
|
|
@@ -70,7 +65,7 @@ training:
|
|
|
output_dir: "/home/user/output"
|
|
output_dir: "/home/user/output"
|
|
|
num_train_epochs: 2
|
|
num_train_epochs: 2
|
|
|
per_device_train_batch_size: 1
|
|
per_device_train_batch_size: 1
|
|
|
- gradient_accumulation_steps: 16
|
|
|
|
|
|
|
+ gradient_accumulation_steps: 8
|
|
|
learning_rate: 0.0002
|
|
learning_rate: 0.0002
|
|
|
lr_scheduler_type: "cosine"
|
|
lr_scheduler_type: "cosine"
|
|
|
warmup_ratio: 0.05
|
|
warmup_ratio: 0.05
|
|
@@ -82,16 +77,8 @@ training:
|
|
|
save_strategy: "steps"
|
|
save_strategy: "steps"
|
|
|
save_steps: 50
|
|
save_steps: 50
|
|
|
save_total_limit: 3
|
|
save_total_limit: 3
|
|
|
- max_seq_length: 2048
|
|
|
|
|
- gradient_checkpointing: true
|
|
|
|
|
- gradient_checkpointing_kwargs:
|
|
|
|
|
- use_reentrant: false
|
|
|
|
|
- optim: "paged_adamw_8bit"
|
|
|
|
|
- report_to: "wandb"
|
|
|
|
|
|
|
+ max_seq_length: 512
|
|
|
seed: 42
|
|
seed: 42
|
|
|
- dataloader_num_workers: 4
|
|
|
|
|
- dataloader_pin_memory: true
|
|
|
|
|
- packing: true
|
|
|
|
|
|
|
|
|
|
# Hub push settings
|
|
# Hub push settings
|
|
|
hub:
|
|
hub:
|