4 ヶ月前 · b8152dcb30
--- a/train.py
+++ b/train.py
@@ -363,9 +363,9 @@ def train(
 
				     # Pre-quantized fallback model for Qwen3-Next architecture
			
 
				     PRE_QUANTIZED_FALLBACK = "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit"
			
 
				 
			
 
				-    # Force ALL layers onto GPU 0. device_map="auto" is too conservative
			
 
				-    # with large MoE models and offloads to CPU where bnb 4-bit can't run.
			
 
				-    # 80B params in 4-bit ≈ 40GB — fits comfortably on A100 80GB.
			
 
				+    # Force ALL layers onto GPU 0.  bnb 4-bit layers cannot run on CPU.
			
 
				+    # With expandable_segments:True (set via the PYTORCH_CUDA_ALLOC_CONF env var),
			
 
				+    # PyTorch won't pre-reserve all 80GB upfront, leaving room for activations.
			
 
				     try:
			
 
				         logger.info(
			
 
				             f"Attempting to load {model_name} with on-the-fly 4-bit quantization..."