4 hónapja · f91f73aaf5
--- a/train.py
+++ b/train.py
@@ -271,7 +271,7 @@ def train(
 
				     lora_alpha: int = 128,
			
 
				     batch_size: int = 1,
			
 
				     grad_accum: int = 16,
			
 
				-    max_seq_length: int = 2048,
			
 
				+    max_seq_length: int = 1024,
			
 
				     system_prompt: str = "",
			
 
				 ):
			
 
				     """Run the full QLoRA fine-tuning pipeline."""
			
@@ -472,7 +472,11 @@ def train(
 
				         seed=t_cfg["seed"],
			
 
				         dataloader_num_workers=t_cfg.get("dataloader_num_workers", 4),
			
 
				         dataloader_pin_memory=t_cfg.get("dataloader_pin_memory", True),
			
 
				-        packing=t_cfg.get("packing", True),
			
 
				+        # packing=False because sdpa attention + packing is unsupported
			
 
				+        # and causes silent crashes on Qwen3-Next architecture.
			
 
				+        # flash_attention_2 would fix this but flash-attn is hard to compile
			
 
				+        # in Docker. Disabling packing is the safest fix.
			
 
				+        packing=False,
			
 
				         dataset_text_field="text",
			
 
				         push_to_hub=push_to_hub,
			
 
				         hub_model_id=hub_model_id if push_to_hub else None,
			
@@ -518,7 +522,23 @@ def train(
 
				     logger.info(f"  Push to hub: {push_to_hub} → {hub_model_id}")
			
 
				     logger.info("=" * 60)
			
 
				 
			
 
				-    train_result = trainer.train()
			
 
				+    import traceback as _tb
			
 
				+
			
 
				+    try:
			
 
				+        logger.info("Calling trainer.train() ...")
			
 
				+        train_result = trainer.train()
			
 
				+        logger.info("trainer.train() returned successfully")
			
 
				+    except Exception as train_exc:
			
 
				+        err_msg = f"trainer.train() CRASHED: {train_exc}"
			
 
				+        full_tb = _tb.format_exc()
			
 
				+        logger.error(err_msg)
			
 
				+        logger.error(full_tb)
			
 
				+        # Also write to a persistent crash file
			
 
				+        crash_path = "/home/user/crash.log"
			
 
				+        with open(crash_path, "w") as cf:
			
 
				+            cf.write(f"{err_msg}\n\n{full_tb}")
			
 
				+        write_status("error", err_msg, 0.0, {"traceback": full_tb[:2000]})
			
 
				+        raise
			
 
				 
			
 
				     # -----------------------------------------------------------------------
			
 
				     # 8. Save final adapter