hace 4 meses · bc8de7d068
--- a/train.py
+++ b/train.py
@@ -1,6 +1,7 @@
 
															 """
														
 
															-Qwen3-Coder-Next Uncensored Fine-Tuning Script
														
 
															-QLoRA 4-bit fine-tuning with TRL's SFTTrainer
														
 
															+Qwen3 Uncensored Fine-Tuning Script (Unsloth)
														
 
															+QLoRA 4-bit fine-tuning with Unsloth's FastModel + TRL SFTTrainer
														
 
															+Uses Qwen3-30B-A3B (30B total, 3B active MoE) - fits in ~17.5GB VRAM
														
 
															 """
														
 
															 import os
														
@@ -11,18 +12,8 @@ import torch
 
															 import logging
														
 
															 from pathlib import Path
														
 
															 from typing import Optional
														
 
															-from dataclasses import dataclass
														
 
															-from transformers import (
														
 
															-    AutoModelForCausalLM,
														
 
															-    AutoTokenizer,
														
 
															-    BitsAndBytesConfig,
														
 
															-    TrainerCallback,
														
 
															-)
														
 
															-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
														
 
															-from trl import SFTTrainer, SFTConfig
														
 
															-from datasets import load_dataset, Dataset, concatenate_datasets
														
 
															-from huggingface_hub import HfApi
														
 
															+from transformers import TrainerCallback
														
 
															 logging.basicConfig(
														
 
															     level=logging.INFO,
														
@@ -108,8 +99,9 @@ def prepare_dataset(
 
															     system_prompt: str,
														
 
															     max_samples: Optional[int] = None,
														
 
															     custom_dataset_path: Optional[str] = None,
														
 
															-) -> Dataset:
														
 
															+):
														
 
															     """Load and format dataset into chat-template strings."""
														
 
															+    from datasets import load_dataset
														
 
															     if custom_dataset_path:
														
 
															         logger.info(f"Loading custom dataset from {custom_dataset_path}")
														
@@ -274,7 +266,7 @@ def train(
 
															     max_seq_length: int = 512,
														
 
															     system_prompt: str = "",
														
 
															 ):
														
 
															-    """Run the full QLoRA fine-tuning pipeline."""
														
 
															+    """Run QLoRA fine-tuning using Unsloth FastModel."""
														
 
															     config = load_config()
														
 
															     write_status("initializing", "Loading configuration...")
														
@@ -294,27 +286,48 @@ def train(
 
															     hf_token = os.environ.get("HF_TOKEN")
														
 
															     if not hf_token:
														
 
															         write_status(
														
 
															-            "error", "HF_TOKEN secret not set! Add it in Space Settings → Secrets."
														
 
															+            "error", "HF_TOKEN secret not set! Add it in Space Settings -> Secrets."
														
 
															         )
														
 
															         raise ValueError("HF_TOKEN environment variable is required")
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 1. Load tokenizer
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    write_status("initializing", "Loading tokenizer...")
														
 
															+    # -------------------------------------------------------------------
														
 
															+    # 1. Load model with Unsloth FastModel (4-bit QLoRA)
														
 
															+    # -------------------------------------------------------------------
														
 
															+    write_status(
														
 
															+        "loading_model",
														
 
															+        "Loading Qwen3-30B-A3B with Unsloth (4-bit)... "
														
 
															+        "MoE models download full 16-bit then convert to 4-bit on-the-fly.",
														
 
															+    )
														
 
															+
														
 
															+    from unsloth import FastModel
														
 
															+
														
 
															     model_name = config["model"]["name"]
														
 
															-    tokenizer = AutoTokenizer.from_pretrained(
														
 
															-        model_name,
														
 
															-        trust_remote_code=config["model"]["trust_remote_code"],
														
 
															+    logger.info(f"Loading model: {model_name} with Unsloth FastModel")
														
 
															+
														
 
															+    model, tokenizer = FastModel.from_pretrained(
														
 
															+        model_name=model_name,
														
 
															+        max_seq_length=max_seq_length,
														
 
															+        load_in_4bit=True,
														
 
															+        load_in_8bit=False,
														
 
															+        full_finetuning=False,
														
 
															         token=hf_token,
														
 
															     )
														
 
															+
														
 
															+    logger.info("Model loaded successfully with Unsloth")
														
 
															+
														
 
															     if tokenizer.pad_token is None:
														
 
															         tokenizer.pad_token = tokenizer.eos_token
														
 
															     tokenizer.padding_side = "right"
														
 
															-    # -----------------------------------------------------------------------
														
 
															+    if torch.cuda.is_available():
														
 
															+        logger.info(
														
 
															+            f"Post-load VRAM: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB allocated, "
														
 
															+            f"{torch.cuda.memory_reserved(0) / 1e9:.1f} GB reserved"
														
 
															+        )
														
 
															+
														
 
															+    # -------------------------------------------------------------------
														
 
															     # 2. Load dataset
														
 
															-    # -----------------------------------------------------------------------
														
 
															+    # -------------------------------------------------------------------
														
 
															     write_status("initializing", "Loading and formatting dataset...")
														
 
															     dataset = prepare_dataset(
														
 
															         dataset_name=dataset_choice,
														
@@ -327,120 +340,74 @@ def train(
 
															     logger.info(f"Formatted dataset: {len(dataset)} samples")
														
 
															     logger.info(f"Sample:\n{dataset[0]['text'][:500]}...")
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 3. Load model in 4-bit
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    write_status(
														
 
															-        "loading_model",
														
 
															-        "Loading model in 4-bit quantization... (this takes a while)",
														
 
															-    )
														
 
															-
														
 
															-    q_cfg = config["quantization"]
														
 
															-    bnb_config = BitsAndBytesConfig(
														
 
															-        load_in_4bit=q_cfg["load_in_4bit"],
														
 
															-        bnb_4bit_quant_type=q_cfg["bnb_4bit_quant_type"],
														
 
															-        bnb_4bit_compute_dtype=getattr(torch, q_cfg["bnb_4bit_compute_dtype"]),
														
 
															-        bnb_4bit_use_double_quant=q_cfg["bnb_4bit_use_double_quant"],
														
 
															-    )
														
 
															-
														
 
															-    # Pick best available attention: flash_attention_2 > sdpa > eager
														
 
															-    if torch.cuda.is_available():
														
 
															-        try:
														
 
															-            import flash_attn  # noqa: F401
														
 
															-
														
 
															-            attn_impl = "flash_attention_2"
														
 
															-        except ImportError:
														
 
															-            attn_impl = "sdpa"  # PyTorch native, no extra install needed
														
 
															-    else:
														
 
															-        attn_impl = "eager"
														
 
															-    logger.info(f"Using attention implementation: {attn_impl}")
														
 
															-
														
 
															-    # Log transformers version to confirm qwen3_next support
														
 
															-    import transformers
														
 
															-
														
 
															-    logger.info(f"transformers version: {transformers.__version__}")
														
 
															-
														
 
															-    # Pre-quantized fallback model for Qwen3-Next architecture
														
 
															-    PRE_QUANTIZED_FALLBACK = "unsloth/Qwen3-Next-80B-A3B-Instruct-bnb-4bit"
														
 
															-
														
 
															-    # Force ALL layers onto GPU 0.  bnb 4-bit layers cannot run on CPU.
														
 
															-    # With expandable_segments=True (set via PYTORCH_CUDA_ALLOC_CONF env),
														
 
															-    # PyTorch won't pre-reserve all 80GB upfront, leaving room for activations.
														
 
															-    try:
														
 
															-        logger.info(
														
 
															-            f"Attempting to load {model_name} with on-the-fly 4-bit quantization..."
														
 
															-        )
														
 
															-        model = AutoModelForCausalLM.from_pretrained(
														
 
															-            model_name,
														
 
															-            quantization_config=bnb_config,
														
 
															-            device_map={"": 0},
														
 
															-            trust_remote_code=config["model"]["trust_remote_code"],
														
 
															-            torch_dtype=getattr(torch, config["model"]["torch_dtype"]),
														
 
															-            token=hf_token,
														
 
															-            attn_implementation=attn_impl,
														
 
															-        )
														
 
															-        logger.info(f"Successfully loaded {model_name} with 4-bit quantization")
														
 
															-    except Exception as e:
														
 
															-        logger.warning(f"On-the-fly 4-bit quantization failed for {model_name}: {e}")
														
 
															-        logger.info(f"Falling back to pre-quantized model: {PRE_QUANTIZED_FALLBACK}")
														
 
															-        write_status(
														
 
															-            "loading_model",
														
 
															-            f"On-the-fly quantization failed, loading pre-quantized fallback...",
														
 
															-        )
														
 
															-        # Pre-quantized model already has bnb 4-bit weights baked in —
														
 
															-        # do NOT pass quantization_config again, just load directly.
														
 
															-        model = AutoModelForCausalLM.from_pretrained(
														
 
															-            PRE_QUANTIZED_FALLBACK,
														
 
															-            device_map={"": 0},
														
 
															-            trust_remote_code=True,
														
 
															-            torch_dtype=torch.bfloat16,
														
 
															-            token=hf_token,
														
 
															-            attn_implementation=attn_impl,
														
 
															-        )
														
 
															-        logger.info(
														
 
															-            f"Successfully loaded pre-quantized fallback: {PRE_QUANTIZED_FALLBACK}"
														
 
															-        )
														
 
															-
														
 
															-    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
														
 
															-    logger.info("Model loaded and prepared for k-bit training")
														
 
															-
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 4. Apply LoRA
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    write_status("loading_model", "Applying LoRA adapters...")
														
 
															+    # -------------------------------------------------------------------
														
 
															+    # 3. Apply LoRA via Unsloth
														
 
															+    # -------------------------------------------------------------------
														
 
															+    write_status("loading_model", "Applying LoRA adapters via Unsloth...")
														
 
															     lora_cfg = config["lora"]
														
 
															-    target_modules = lora_cfg["target_modules"]
														
 
															-    # target_modules can be a list of strings or the string "all-linear"
														
 
															-    if isinstance(target_modules, str) and target_modules != "all-linear":
														
 
															-        target_modules = [target_modules]
														
 
															-
														
 
															-    lora_config = LoraConfig(
														
 
															+    target_modules = lora_cfg.get(
														
 
															+        "target_modules",
														
 
															+        [
														
 
															+            "q_proj",
														
 
															+            "k_proj",
														
 
															+            "v_proj",
														
 
															+            "o_proj",
														
 
															+            "gate_proj",
														
 
															+            "up_proj",
														
 
															+            "down_proj",
														
 
															+        ],
														
 
															+    )
														
 
															+    # Unsloth get_peft_model expects a list, not "all-linear"
														
 
															+    if isinstance(target_modules, str) and target_modules == "all-linear":
														
 
															+        target_modules = [
														
 
															+            "q_proj",
														
 
															+            "k_proj",
														
 
															+            "v_proj",
														
 
															+            "o_proj",
														
 
															+            "gate_proj",
														
 
															+            "up_proj",
														
 
															+            "down_proj",
														
 
															+        ]
														
 
															+
														
 
															+    model = FastModel.get_peft_model(
														
 
															+        model,
														
 
															         r=lora_cfg["r"],
														
 
															-        lora_alpha=lora_cfg["lora_alpha"],
														
 
															         target_modules=target_modules,
														
 
															-        lora_dropout=lora_cfg["lora_dropout"],
														
 
															-        bias=lora_cfg["bias"],
														
 
															-        task_type=lora_cfg["task_type"],
														
 
															+        lora_alpha=lora_cfg["lora_alpha"],
														
 
															+        lora_dropout=lora_cfg.get("lora_dropout", 0),
														
 
															+        bias=lora_cfg.get("bias", "none"),
														
 
															+        use_gradient_checkpointing="unsloth",  # Unsloth optimized
														
 
															+        random_state=42,
														
 
															     )
														
 
															-    model = get_peft_model(model, lora_config)
														
 
															-    trainable, total = model.get_nb_trainable_parameters()
														
 
															+    # Log trainable params
														
 
															+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
														
 
															+    total_params = sum(p.numel() for p in model.parameters())
														
 
															     logger.info(
														
 
															-        f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)"
														
 
															+        f"Trainable params: {trainable_params:,} / {total_params:,} "
														
 
															+        f"({100 * trainable_params / total_params:.2f}%)"
														
 
															     )
														
 
															     write_status(
														
 
															         "loading_model",
														
 
															-        f"LoRA applied: {trainable:,} trainable params ({100 * trainable / total:.2f}%)",
														
 
															+        f"LoRA applied: {trainable_params:,} trainable params "
														
 
															+        f"({100 * trainable_params / total_params:.2f}%)",
														
 
															     )
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 5. Training arguments
														
 
															-    # -----------------------------------------------------------------------
														
 
															+    if torch.cuda.is_available():
														
 
															+        logger.info(
														
 
															+            f"Post-LoRA VRAM: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB allocated, "
														
 
															+            f"{torch.cuda.memory_reserved(0) / 1e9:.1f} GB reserved"
														
 
															+        )
														
 
															+
														
 
															+    # -------------------------------------------------------------------
														
 
															+    # 4. Training arguments
														
 
															+    # -------------------------------------------------------------------
														
 
															+    from trl import SFTTrainer, SFTConfig
														
 
															+
														
 
															     t_cfg = config["training"]
														
 
															     output_dir = t_cfg["output_dir"]
														
 
															-    # Determine hub settings
														
 
															     push_to_hub = bool(hub_model_id)
														
 
															     hub_cfg = config.get("hub", {})
														
@@ -450,44 +417,31 @@ def train(
 
															         per_device_train_batch_size=t_cfg["per_device_train_batch_size"],
														
 
															         gradient_accumulation_steps=t_cfg["gradient_accumulation_steps"],
														
 
															         learning_rate=t_cfg["learning_rate"],
														
 
															-        lr_scheduler_type=t_cfg["lr_scheduler_type"],
														
 
															-        warmup_ratio=t_cfg["warmup_ratio"],
														
 
															-        weight_decay=t_cfg["weight_decay"],
														
 
															-        bf16=t_cfg["bf16"],
														
 
															+        lr_scheduler_type=t_cfg.get("lr_scheduler_type", "cosine"),
														
 
															+        warmup_ratio=t_cfg.get("warmup_ratio", 0.05),
														
 
															+        weight_decay=t_cfg.get("weight_decay", 0.01),
														
 
															+        bf16=t_cfg.get("bf16", True),
														
 
															         tf32=t_cfg.get("tf32", True),
														
 
															-        max_grad_norm=t_cfg["max_grad_norm"],
														
 
															-        logging_steps=t_cfg["logging_steps"],
														
 
															-        save_strategy=t_cfg["save_strategy"],
														
 
															-        save_steps=t_cfg["save_steps"],
														
 
															-        save_total_limit=t_cfg["save_total_limit"],
														
 
															-        max_length=t_cfg["max_seq_length"],
														
 
															-        gradient_checkpointing=t_cfg["gradient_checkpointing"],
														
 
															-        gradient_checkpointing_kwargs=t_cfg.get(
														
 
															-            "gradient_checkpointing_kwargs", {"use_reentrant": False}
														
 
															-        ),
														
 
															-        optim=t_cfg["optim"],
														
 
															-        report_to=t_cfg.get("report_to", "none")
														
 
															-        if os.environ.get("WANDB_API_KEY")
														
 
															-        else "none",
														
 
															-        seed=t_cfg["seed"],
														
 
															-        dataloader_num_workers=0,
														
 
															-        dataloader_pin_memory=False,
														
 
															-        # packing=False because sdpa attention + packing is unsupported
														
 
															-        # and causes silent crashes on Qwen3-Next architecture.
														
 
															-        # flash_attention_2 would fix this but flash-attn is hard to compile
														
 
															-        # in Docker. Disabling packing is the safest fix.
														
 
															+        max_grad_norm=t_cfg.get("max_grad_norm", 1.0),
														
 
															+        logging_steps=t_cfg.get("logging_steps", 5),
														
 
															+        save_strategy=t_cfg.get("save_strategy", "steps"),
														
 
															+        save_steps=t_cfg.get("save_steps", 50),
														
 
															+        save_total_limit=t_cfg.get("save_total_limit", 3),
														
 
															+        max_length=max_seq_length,
														
 
															         packing=False,
														
 
															         dataset_text_field="text",
														
 
															-        push_to_hub=push_to_hub,
														
 
															-        hub_model_id=hub_model_id if push_to_hub else None,
														
 
															-        hub_strategy=hub_cfg.get("hub_strategy", "checkpoint"),
														
 
															-        hub_private_repo=hub_cfg.get("hub_private_repo", False),
														
 
															-        hub_token=hf_token,
														
 
															+        optim="adamw_8bit",
														
 
															+        report_to="none",
														
 
															+        seed=t_cfg.get("seed", 42),
														
 
															+        dataloader_num_workers=0,
														
 
															+        dataloader_pin_memory=False,
														
 
															+        # Don't push via SFTTrainer - we use Unsloth's push_to_hub_merged
														
 
															+        push_to_hub=False,
														
 
															     )
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 6. Trainer
														
 
															-    # -----------------------------------------------------------------------
														
 
															+    # -------------------------------------------------------------------
														
 
															+    # 5. Trainer
														
 
															+    # -------------------------------------------------------------------
														
 
															     total_steps = (
														
 
															         len(dataset)
														
 
															         // (t_cfg["per_device_train_batch_size"] * t_cfg["gradient_accumulation_steps"])
														
@@ -502,12 +456,13 @@ def train(
 
															         callbacks=[StatusCallback(total_steps)],
														
 
															     )
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 7. Train!
														
 
															-    # -----------------------------------------------------------------------
														
 
															+    # -------------------------------------------------------------------
														
 
															+    # 6. Train!
														
 
															+    # -------------------------------------------------------------------
														
 
															     write_status("training", "Starting training...", 0.0)
														
 
															     logger.info("=" * 60)
														
 
															-    logger.info("TRAINING STARTED")
														
 
															+    logger.info("TRAINING STARTED (Unsloth FastModel)")
														
 
															+    logger.info(f"  Model: {model_name}")
														
 
															     logger.info(f"  Dataset: {len(dataset)} samples")
														
 
															     logger.info(f"  Epochs: {t_cfg['num_train_epochs']}")
														
 
															     logger.info(f"  Batch size: {t_cfg['per_device_train_batch_size']}")
														
@@ -517,14 +472,11 @@ def train(
 
															     )
														
 
															     logger.info(f"  LR: {t_cfg['learning_rate']}")
														
 
															     logger.info(f"  LoRA r={lora_cfg['r']}, alpha={lora_cfg['lora_alpha']}")
														
 
															-    logger.info(f"  Max seq length: {t_cfg['max_seq_length']}")
														
 
															+    logger.info(f"  Max seq length: {max_seq_length}")
														
 
															     logger.info(f"  Total steps: ~{total_steps}")
														
 
															-    logger.info(f"  Push to hub: {push_to_hub} → {hub_model_id}")
														
 
															+    logger.info(f"  Push to hub: {push_to_hub} -> {hub_model_id}")
														
 
															     logger.info("=" * 60)
														
 
															-    import traceback as _tb
														
 
															-
														
 
															-    # Free any cached CUDA memory before training starts
														
 
															     if torch.cuda.is_available():
														
 
															         torch.cuda.empty_cache()
														
 
															         torch.cuda.reset_peak_memory_stats()
														
@@ -533,6 +485,8 @@ def train(
 
															             f"{torch.cuda.memory_reserved(0) / 1e9:.1f} GB reserved"
														
 
															         )
														
 
															+    import traceback as _tb
														
 
															+
														
 
															     try:
														
 
															         logger.info("Calling trainer.train() ...")
														
 
															         train_result = trainer.train()
														
@@ -542,38 +496,55 @@ def train(
 
															         full_tb = _tb.format_exc()
														
 
															         logger.error(err_msg)
														
 
															         logger.error(full_tb)
														
 
															-        # Also write to a persistent crash file
														
 
															         crash_path = "/home/user/crash.log"
														
 
															         with open(crash_path, "w") as cf:
														
 
															             cf.write(f"{err_msg}\n\n{full_tb}")
														
 
															         write_status("error", err_msg, 0.0, {"traceback": full_tb[:2000]})
														
 
															         raise
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    # 8. Save final adapter
														
 
															-    # -----------------------------------------------------------------------
														
 
															-    write_status("saving", "Saving final LoRA adapter...")
														
 
															-    final_adapter_path = os.path.join(output_dir, "final_adapter")
														
 
															-    trainer.save_model(final_adapter_path)
														
 
															-    tokenizer.save_pretrained(final_adapter_path)
														
 
															+    # -------------------------------------------------------------------
														
 
															+    # 7. Save and push LoRA adapter
														
 
															+    # -------------------------------------------------------------------
														
 
															+    write_status("saving", "Saving LoRA adapter...")
														
 
															+
														
 
															+    local_lora_path = os.path.join(output_dir, "final_adapter")
														
 
															+    model.save_pretrained(local_lora_path)
														
 
															+    tokenizer.save_pretrained(local_lora_path)
														
 
															+    logger.info(f"LoRA adapter saved locally to {local_lora_path}")
														
 
															-    # Push adapter to Hub
														
 
															     if push_to_hub and hub_model_id:
														
 
															         write_status("pushing", f"Pushing LoRA adapter to {hub_model_id}...")
														
 
															-        api = HfApi(token=hf_token)
														
 
															-        api.create_repo(
														
 
															-            hub_model_id, exist_ok=True, private=hub_cfg.get("hub_private_repo", False)
														
 
															-        )
														
 
															-        api.upload_folder(
														
 
															-            folder_path=final_adapter_path,
														
 
															-            repo_id=hub_model_id,
														
 
															-            commit_message="Upload QLoRA adapter — Qwen3-Coder-Next uncensored",
														
 
															-        )
														
 
															-        logger.info(f"Adapter pushed to https://huggingface.co/{hub_model_id}")
														
 
															+        try:
														
 
															+            model.push_to_hub_merged(
														
 
															+                hub_model_id,
														
 
															+                tokenizer,
														
 
															+                save_method="lora",
														
 
															+                token=hf_token,
														
 
															+            )
														
 
															+            logger.info(f"LoRA adapter pushed to https://huggingface.co/{hub_model_id}")
														
 
															+        except Exception as push_exc:
														
 
															+            # Fallback: manual upload via HfApi
														
 
															+            logger.warning(
														
 
															+                f"push_to_hub_merged failed: {push_exc}, trying manual upload"
														
 
															+            )
														
 
															+            from huggingface_hub import HfApi
														
 
															+
														
 
															+            api = HfApi(token=hf_token)
														
 
															+            api.create_repo(
														
 
															+                hub_model_id,
														
 
															+                exist_ok=True,
														
 
															+                private=hub_cfg.get("hub_private_repo", False),
														
 
															+            )
														
 
															+            api.upload_folder(
														
 
															+                folder_path=local_lora_path,
														
 
															+                repo_id=hub_model_id,
														
 
															+                commit_message="Upload QLoRA adapter - Qwen3 uncensored (Unsloth)",
														
 
															+            )
														
 
															+            logger.info(f"Adapter uploaded via HfApi to {hub_model_id}")
														
 
															     write_status(
														
 
															         "completed",
														
 
															-        f"Training complete! Adapter saved to {final_adapter_path}",
														
 
															+        f"Training complete! Adapter saved to {local_lora_path}",
														
 
															         1.0,
														
 
															         {
														
 
															             "train_loss": round(train_result.metrics.get("train_loss", 0), 4),
														
@@ -584,7 +555,7 @@ def train(
 
															         },
														
 
															     )
														
 
															-    return final_adapter_path
														
 
															+    return local_lora_path
														
 
															 # ---------------------------------------------------------------------------
														
@@ -600,6 +571,7 @@ def abliterate(
 
															     """
														
 
															     Remove the refusal direction from model weights.
														
 
															     Based on: https://huggingface.co/blog/mlabonne/abliteration
														
 
															+    Uses Unsloth FastModel to load the model efficiently.
														
 
															     """
														
 
															     write_status("initializing", "Starting abliteration (refusal direction removal)...")
														
@@ -607,18 +579,19 @@ def abliterate(
 
															     config = load_config()
														
 
															     model_name = config["model"]["name"]
														
 
															-    # Load in bfloat16 (need full weights for abliteration)
														
 
															+    # Load in bfloat16 (need full weights for abliteration — no 4-bit)
														
 
															     write_status("loading_model", "Loading model in bfloat16 for abliteration...")
														
 
															-    model = AutoModelForCausalLM.from_pretrained(
														
 
															-        model_name,
														
 
															-        torch_dtype=torch.bfloat16,
														
 
															-        device_map="auto",
														
 
															-        trust_remote_code=True,
														
 
															+
														
 
															+    from unsloth import FastModel
														
 
															+
														
 
															+    model, tokenizer = FastModel.from_pretrained(
														
 
															+        model_name=model_name,
														
 
															+        max_seq_length=2048,
														
 
															+        load_in_4bit=False,
														
 
															+        load_in_8bit=False,
														
 
															+        full_finetuning=False,
														
 
															         token=hf_token,
														
 
															     )
														
 
															-    tokenizer = AutoTokenizer.from_pretrained(
														
 
															-        model_name, trust_remote_code=True, token=hf_token
														
 
															-    )
														
 
															     # Harmful prompts that trigger refusal
														
 
															     harmful_prompts = [
														
@@ -716,12 +689,14 @@ def abliterate(
 
															     if hub_model_id:
														
 
															         write_status("pushing", f"Pushing abliterated model to {hub_model_id}...")
														
 
															+        from huggingface_hub import HfApi
														
 
															+
														
 
															         api = HfApi(token=hf_token)
														
 
															         api.create_repo(hub_model_id, exist_ok=True)
														
 
															         api.upload_folder(
														
 
															             folder_path=output_path,
														
 
															             repo_id=hub_model_id,
														
 
															-            commit_message="Upload abliterated Qwen3-Coder-Next (refusal direction removed)",
														
 
															+            commit_message="Upload abliterated Qwen3 (refusal direction removed)",
														
 
															         )
														
 
															     write_status(