|
|
@@ -271,7 +271,7 @@ def train(
|
|
|
lora_alpha: int = 128,
|
|
|
batch_size: int = 1,
|
|
|
grad_accum: int = 16,
|
|
|
- max_seq_length: int = 2048,
|
|
|
+ max_seq_length: int = 1024,
|
|
|
system_prompt: str = "",
|
|
|
):
|
|
|
"""Run the full QLoRA fine-tuning pipeline."""
|
|
|
@@ -472,7 +472,11 @@ def train(
|
|
|
seed=t_cfg["seed"],
|
|
|
dataloader_num_workers=t_cfg.get("dataloader_num_workers", 4),
|
|
|
dataloader_pin_memory=t_cfg.get("dataloader_pin_memory", True),
|
|
|
- packing=t_cfg.get("packing", True),
|
|
|
+ # Packing is force-disabled here (any "packing" value in t_cfg is ignored):
|
|
|
+ # sdpa attention + packing is unsupported and crashes silently on the
|
|
|
+ # Qwen3-Next architecture. flash_attention_2 would fix this, but flash-attn
|
|
|
+ # is hard to compile in Docker, so disabling packing is the safest fix.
|
|
|
+ packing=False,
|
|
|
dataset_text_field="text",
|
|
|
push_to_hub=push_to_hub,
|
|
|
hub_model_id=hub_model_id if push_to_hub else None,
|
|
|
@@ -518,7 +522,23 @@ def train(
|
|
|
logger.info(f" Push to hub: {push_to_hub} → {hub_model_id}")
|
|
|
logger.info("=" * 60)
|
|
|
|
|
|
- train_result = trainer.train()
|
|
|
+ import traceback as _tb
|
|
|
+
|
|
|
+ try:
|
|
|
+ logger.info("Calling trainer.train() ...")
|
|
|
+ train_result = trainer.train()
|
|
|
+ logger.info("trainer.train() returned successfully")
|
|
|
+ except Exception as train_exc:
|
|
|
+ err_msg = f"trainer.train() CRASHED: {train_exc}"
|
|
|
+ full_tb = _tb.format_exc()
|
|
|
+ logger.error(err_msg)
|
|
|
+ logger.error(full_tb)
|
|
|
+ # Also write to a persistent crash file
|
|
|
+ crash_path = "/home/user/crash.log"
|
|
|
+ with open(crash_path, "w") as cf:
|
|
|
+ cf.write(f"{err_msg}\n\n{full_tb}")
|
|
|
+ write_status("error", err_msg, 0.0, {"traceback": full_tb[:2000]})
|
|
|
+ raise
|
|
|
|
|
|
# -----------------------------------------------------------------------
|
|
|
# 8. Save final adapter
|