|
|
@@ -363,9 +363,9 @@ def train(
|
|
|
# Pre-quantized fallback model for Qwen3-Next architecture
|
|
|
PRE_QUANTIZED_FALLBACK = "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit"
|
|
|
|
|
|
- # Force ALL layers onto GPU 0. device_map="auto" is too conservative
|
|
|
- # with large MoE models and offloads to CPU where bnb 4-bit can't run.
|
|
|
- # 80B params in 4-bit ≈ 40GB — fits comfortably on A100 80GB.
|
|
|
+ # Force ALL layers onto GPU 0. bnb 4-bit layers cannot run on CPU.
|
|
|
+ # With expandable_segments:True (set via the PYTORCH_CUDA_ALLOC_CONF env var),
|
|
|
+ # PyTorch won't pre-reserve all 80GB upfront, leaving room for activations.
|
|
|
try:
|
|
|
logger.info(
|
|
|
f"Attempting to load {model_name} with on-the-fly 4-bit quantization..."
|