Răsfoiți Sursa

fix: disable packing (sdpa+packing crash), reduce seq_len to 1024, add traceback logging

Sameric 4 luni în urmă
părinte
comite
f91f73aaf5
1 fișier modificat, cu 23 adăugiri și 3 ștergeri
  1. 23 3
      train.py

+ 23 - 3
train.py

@@ -271,7 +271,7 @@ def train(
     lora_alpha: int = 128,
     batch_size: int = 1,
     grad_accum: int = 16,
-    max_seq_length: int = 2048,
+    max_seq_length: int = 1024,
     system_prompt: str = "",
 ):
     """Run the full QLoRA fine-tuning pipeline."""
@@ -472,7 +472,11 @@ def train(
         seed=t_cfg["seed"],
         dataloader_num_workers=t_cfg.get("dataloader_num_workers", 4),
         dataloader_pin_memory=t_cfg.get("dataloader_pin_memory", True),
-        packing=t_cfg.get("packing", True),
+        # packing is hard-coded to False (the t_cfg "packing" value is no
+        # longer read here): sdpa attention + packing is unsupported and
+        # causes silent crashes on Qwen3-Next. flash_attention_2 would fix it,
+        # but flash-attn is hard to compile in Docker, so this is the safest fix.
+        packing=False,
         dataset_text_field="text",
         push_to_hub=push_to_hub,
         hub_model_id=hub_model_id if push_to_hub else None,
@@ -518,7 +522,23 @@ def train(
     logger.info(f"  Push to hub: {push_to_hub} → {hub_model_id}")
     logger.info("=" * 60)
 
-    train_result = trainer.train()
+    import traceback as _tb
+
+    try:
+        logger.info("Calling trainer.train() ...")
+        train_result = trainer.train()
+        logger.info("trainer.train() returned successfully")
+    except Exception as train_exc:
+        err_msg = f"trainer.train() CRASHED: {train_exc}"
+        full_tb = _tb.format_exc()
+        logger.error(err_msg)
+        logger.error(full_tb)
+        # Also write to a persistent crash file
+        crash_path = "/home/user/crash.log"
+        with open(crash_path, "w") as cf:
+            cf.write(f"{err_msg}\n\n{full_tb}")
+        write_status("error", err_msg, 0.0, {"traceback": full_tb[:2000]})
+        raise
 
     # -----------------------------------------------------------------------
     # 8. Save final adapter