lainlives
/
sft-trainer


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709
							"""
Qwen3 Uncensored Fine-Tuning Script (Unsloth)
QLoRA 4-bit fine-tuning with Unsloth's FastModel + TRL SFTTrainer
Uses Qwen3-30B-A3B (30B total, 3B active MoE) - fits in ~17.5GB VRAM
"""

import os
import sys
import json
import yaml
import torch
import logging
from pathlib import Path
from typing import Optional

from transformers import TrainerCallback

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler("/home/user/training.log", mode="a"),
    ],
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Status file for Gradio UI to poll
# ---------------------------------------------------------------------------
STATUS_FILE = "/home/user/training_status.json"


def write_status(
    status: str, detail: str = "", progress: float = 0.0, metrics: dict | None = None
):
    data = {
        "status": status,
        "detail": detail,
        "progress": progress,
        "metrics": metrics or {},
    }
    Path(STATUS_FILE).write_text(json.dumps(data))


class StatusCallback(TrainerCallback):
    """Streams training metrics back to the Gradio UI via a JSON status file."""

    def __init__(self, total_steps: int):
        self.total_steps = max(total_steps, 1)

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is None:
            return
        progress = min(state.global_step / self.total_steps, 1.0)
        metrics = {
            "step": state.global_step,
            "total_steps": self.total_steps,
            "epoch": round(state.epoch or 0, 2),
            "loss": round(logs.get("loss", 0), 4),
            "learning_rate": logs.get("learning_rate", 0),
            "grad_norm": round(logs.get("grad_norm", 0), 4),
        }
        write_status(
            "training",
            f"Step {state.global_step}/{self.total_steps}",
            progress,
            metrics,
        )

    def on_save(self, args, state, control, **kwargs):
        write_status(
            "saving_checkpoint", f"Saved checkpoint at step {state.global_step}"
        )

    def on_train_end(self, args, state, control, **kwargs):
        write_status("completed", "Training finished!", 1.0)


# ---------------------------------------------------------------------------
# Config helpers
# ---------------------------------------------------------------------------


def load_config(config_path: str = "config.yaml") -> dict:
    with open(config_path) as f:
        return yaml.safe_load(f)


# ---------------------------------------------------------------------------
# Dataset preparation
# ---------------------------------------------------------------------------


def prepare_dataset(
    dataset_name: str,
    config: dict,
    tokenizer,
    system_prompt: str,
    max_samples: Optional[int] = None,
    custom_dataset_path: Optional[str] = None,
):
    """Load and format dataset into chat-template strings."""
    from datasets import load_dataset

    if custom_dataset_path:
        logger.info(f"Loading custom dataset from {custom_dataset_path}")
        if custom_dataset_path.endswith(".json") or custom_dataset_path.endswith(
            ".jsonl"
        ):
            ds = load_dataset("json", data_files=custom_dataset_path, split="train")
        elif custom_dataset_path.endswith(".csv"):
            ds = load_dataset("csv", data_files=custom_dataset_path, split="train")
        elif custom_dataset_path.endswith(".parquet"):
            ds = load_dataset("parquet", data_files=custom_dataset_path, split="train")
        else:
            ds = load_dataset(custom_dataset_path, split="train")

        # Auto-detect fields
        cols = ds.column_names

        # Check if it's a conversations-format dataset
        if "conversations" in cols:
            ds_format = "conversations"
            conversations_field = "conversations"
            human_key = "human"
            assistant_key = "gpt"
            instruction_field = None
            output_field = None
            system_field = None
        else:
            ds_format = "flat"
            conversations_field = None
            human_key = None
            assistant_key = None
            instruction_field = next(
                (
                    c
                    for c in ["instruction", "prompt", "input", "question", "user"]
                    if c in cols
                ),
                cols[0],
            )
            output_field = next(
                (
                    c
                    for c in ["output", "response", "answer", "chosen", "assistant"]
                    if c in cols
                ),
                cols[1] if len(cols) > 1 else cols[0],
            )
            system_field = "system" if "system" in cols else None
    else:
        ds_cfg = config["datasets"].get(dataset_name)
        if ds_cfg is None:
            raise ValueError(
                f"Unknown dataset: {dataset_name}. Available: {list(config['datasets'].keys())}"
            )

        logger.info(f"Loading dataset: {ds_cfg['name']}")
        ds = load_dataset(ds_cfg["name"], split=ds_cfg["split"])

        ds_format = ds_cfg.get("format", "flat")
        if ds_format == "conversations":
            conversations_field = ds_cfg.get("conversations_field", "conversations")
            human_key = ds_cfg.get("human_key", "human")
            assistant_key = ds_cfg.get("assistant_key", "gpt")
            instruction_field = None
            output_field = None
            system_field = None
        else:
            conversations_field = None
            human_key = None
            assistant_key = None
            instruction_field = ds_cfg["instruction_field"]
            output_field = ds_cfg["output_field"]
            system_field = ds_cfg.get("system_field")

    if max_samples and max_samples < len(ds):
        ds = ds.shuffle(seed=42).select(range(max_samples))

    logger.info(f"Dataset loaded: {len(ds)} samples (format: {ds_format})")

    if ds_format == "conversations":

        def format_chat(example):
            """Convert conversations [{from, value}] into chat template."""
            convos = example[conversations_field]
            messages = []

            # Prepend system prompt
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})

            for turn in convos:
                role_key = turn.get("from", "")
                value = turn.get("value", "")
                if role_key == human_key:
                    messages.append({"role": "user", "content": value})
                elif role_key == assistant_key:
                    messages.append({"role": "assistant", "content": value})

            # Skip samples with no user/assistant turns
            if len(messages) < 2 or not any(m["role"] == "assistant" for m in messages):
                return {"text": ""}

            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            return {"text": text}
    else:

        def format_chat(example):
            messages = []
            # System prompt
            if system_field and example.get(system_field):
                messages.append({"role": "system", "content": example[system_field]})
            elif system_prompt:
                messages.append({"role": "system", "content": system_prompt})

            # User message
            messages.append(
                {"role": "user", "content": str(example[instruction_field])}
            )

            # Assistant response
            messages.append(
                {"role": "assistant", "content": str(example[output_field])}
            )

            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            return {"text": text}

    ds = ds.map(
        format_chat,
        remove_columns=ds.column_names,
        num_proc=4,
        desc="Formatting dataset",
    )

    # Filter out empty texts (e.g. conversations with no assistant turn)
    ds = ds.filter(lambda x: len(x["text"].strip()) > 0)
    logger.info(f"After filtering: {len(ds)} samples")

    return ds


# ---------------------------------------------------------------------------
# Main training function
# ---------------------------------------------------------------------------


def train(
    dataset_choice: str = "wizard_vicuna",
    hub_model_id: str = "",
    max_samples: Optional[int] = 5000,
    custom_dataset_path: Optional[str] = None,
    num_epochs: int = 2,
    learning_rate: float = 2e-4,
    lora_r: int = 16,
    lora_alpha: int = 32,
    batch_size: int = 1,
    grad_accum: int = 8,
    max_seq_length: int = 512,
    system_prompt: str = "",
):
    """Run QLoRA fine-tuning using Unsloth FastModel."""

    config = load_config()
    write_status("initializing", "Loading configuration...")

    # Override config with function params
    config["training"]["num_train_epochs"] = num_epochs
    config["training"]["learning_rate"] = learning_rate
    config["training"]["per_device_train_batch_size"] = batch_size
    config["training"]["gradient_accumulation_steps"] = grad_accum
    config["training"]["max_seq_length"] = max_seq_length
    config["lora"]["r"] = lora_r
    config["lora"]["lora_alpha"] = lora_alpha

    if not system_prompt:
        system_prompt = config.get("system_prompt", "")

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        write_status(
            "error", "HF_TOKEN secret not set! Add it in Space Settings -> Secrets."
        )
        raise ValueError("HF_TOKEN environment variable is required")

    # -------------------------------------------------------------------
    # 1. Load model with Unsloth FastModel (4-bit QLoRA)
    # -------------------------------------------------------------------
    write_status(
        "loading_model",
        "Loading Qwen3-30B-A3B with Unsloth (4-bit)... "
        "MoE models download full 16-bit then convert to 4-bit on-the-fly.",
    )

    from unsloth import FastModel

    model_name = config["model"]["name"]
    logger.info(f"Loading model: {model_name} with Unsloth FastModel")

    model, tokenizer = FastModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        token=hf_token,
    )

    logger.info("Model loaded successfully with Unsloth")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    if torch.cuda.is_available():
        logger.info(
            f"Post-load VRAM: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB allocated, "
            f"{torch.cuda.memory_reserved(0) / 1e9:.1f} GB reserved"
        )

    # -------------------------------------------------------------------
    # 2. Load dataset
    # -------------------------------------------------------------------
    write_status("initializing", "Loading and formatting dataset...")
    dataset = prepare_dataset(
        dataset_name=dataset_choice,
        config=config,
        tokenizer=tokenizer,
        system_prompt=system_prompt,
        max_samples=max_samples,
        custom_dataset_path=custom_dataset_path,
    )
    logger.info(f"Formatted dataset: {len(dataset)} samples")
    logger.info(f"Sample:\n{dataset[0]['text'][:500]}...")

    # -------------------------------------------------------------------
    # 3. Apply LoRA via Unsloth
    # -------------------------------------------------------------------
    write_status("loading_model", "Applying LoRA adapters via Unsloth...")

    lora_cfg = config["lora"]
    target_modules = lora_cfg.get(
        "target_modules",
        [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )
    # Unsloth get_peft_model expects a list, not "all-linear"
    if isinstance(target_modules, str) and target_modules == "all-linear":
        target_modules = [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ]

    model = FastModel.get_peft_model(
        model,
        r=lora_cfg["r"],
        target_modules=target_modules,
        lora_alpha=lora_cfg["lora_alpha"],
        lora_dropout=lora_cfg.get("lora_dropout", 0),
        bias=lora_cfg.get("bias", "none"),
        use_gradient_checkpointing="unsloth",  # Unsloth optimized
        random_state=42,
    )

    # Log trainable params
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    logger.info(
        f"Trainable params: {trainable_params:,} / {total_params:,} "
        f"({100 * trainable_params / total_params:.2f}%)"
    )
    write_status(
        "loading_model",
        f"LoRA applied: {trainable_params:,} trainable params "
        f"({100 * trainable_params / total_params:.2f}%)",
    )

    if torch.cuda.is_available():
        logger.info(
            f"Post-LoRA VRAM: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB allocated, "
            f"{torch.cuda.memory_reserved(0) / 1e9:.1f} GB reserved"
        )

    # -------------------------------------------------------------------
    # 4. Training arguments
    # -------------------------------------------------------------------
    from trl import SFTTrainer, SFTConfig

    t_cfg = config["training"]
    output_dir = t_cfg["output_dir"]

    push_to_hub = bool(hub_model_id)
    hub_cfg = config.get("hub", {})

    training_args = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=t_cfg["num_train_epochs"],
        per_device_train_batch_size=t_cfg["per_device_train_batch_size"],
        gradient_accumulation_steps=t_cfg["gradient_accumulation_steps"],
        learning_rate=t_cfg["learning_rate"],
        lr_scheduler_type=t_cfg.get("lr_scheduler_type", "cosine"),
        warmup_ratio=t_cfg.get("warmup_ratio", 0.05),
        weight_decay=t_cfg.get("weight_decay", 0.01),
        bf16=t_cfg.get("bf16", True),
        tf32=t_cfg.get("tf32", True),
        max_grad_norm=t_cfg.get("max_grad_norm", 1.0),
        logging_steps=t_cfg.get("logging_steps", 5),
        save_strategy=t_cfg.get("save_strategy", "steps"),
        save_steps=t_cfg.get("save_steps", 50),
        save_total_limit=t_cfg.get("save_total_limit", 3),
        max_length=max_seq_length,
        packing=False,
        dataset_text_field="text",
        optim="adamw_8bit",
        report_to="none",
        seed=t_cfg.get("seed", 42),
        dataloader_num_workers=0,
        dataloader_pin_memory=False,
        # Don't push via SFTTrainer - we use Unsloth's push_to_hub_merged
        push_to_hub=False,
    )

    # -------------------------------------------------------------------
    # 5. Trainer
    # -------------------------------------------------------------------
    total_steps = (
        len(dataset)
        // (t_cfg["per_device_train_batch_size"] * t_cfg["gradient_accumulation_steps"])
        * t_cfg["num_train_epochs"]
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        processing_class=tokenizer,
        callbacks=[StatusCallback(total_steps)],
    )

    # -------------------------------------------------------------------
    # 6. Train!
    # -------------------------------------------------------------------
    write_status("training", "Starting training...", 0.0)
    logger.info("=" * 60)
    logger.info("TRAINING STARTED (Unsloth FastModel)")
    logger.info(f"  Model: {model_name}")
    logger.info(f"  Dataset: {len(dataset)} samples")
    logger.info(f"  Epochs: {t_cfg['num_train_epochs']}")
    logger.info(f"  Batch size: {t_cfg['per_device_train_batch_size']}")
    logger.info(f"  Grad accum: {t_cfg['gradient_accumulation_steps']}")
    logger.info(
        f"  Effective batch: {t_cfg['per_device_train_batch_size'] * t_cfg['gradient_accumulation_steps']}"
    )
    logger.info(f"  LR: {t_cfg['learning_rate']}")
    logger.info(f"  LoRA r={lora_cfg['r']}, alpha={lora_cfg['lora_alpha']}")
    logger.info(f"  Max seq length: {max_seq_length}")
    logger.info(f"  Total steps: ~{total_steps}")
    logger.info(f"  Push to hub: {push_to_hub} -> {hub_model_id}")
    logger.info("=" * 60)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        logger.info(
            f"Pre-train VRAM: {torch.cuda.memory_allocated(0) / 1e9:.1f} GB allocated, "
            f"{torch.cuda.memory_reserved(0) / 1e9:.1f} GB reserved"
        )

    import traceback as _tb

    try:
        logger.info("Calling trainer.train() ...")
        train_result = trainer.train()
        logger.info("trainer.train() returned successfully")
    except Exception as train_exc:
        err_msg = f"trainer.train() CRASHED: {train_exc}"
        full_tb = _tb.format_exc()
        logger.error(err_msg)
        logger.error(full_tb)
        crash_path = "/home/user/crash.log"
        with open(crash_path, "w") as cf:
            cf.write(f"{err_msg}\n\n{full_tb}")
        write_status("error", err_msg, 0.0, {"traceback": full_tb[:2000]})
        raise

    # -------------------------------------------------------------------
    # 7. Save and push LoRA adapter
    # -------------------------------------------------------------------
    write_status("saving", "Saving LoRA adapter...")

    local_lora_path = os.path.join(output_dir, "final_adapter")
    model.save_pretrained(local_lora_path)
    tokenizer.save_pretrained(local_lora_path)
    logger.info(f"LoRA adapter saved locally to {local_lora_path}")

    if push_to_hub and hub_model_id:
        write_status("pushing", f"Pushing LoRA adapter to {hub_model_id}...")
        try:
            model.push_to_hub_merged(
                hub_model_id,
                tokenizer,
                save_method="lora",
                token=hf_token,
            )
            logger.info(f"LoRA adapter pushed to https://huggingface.co/{hub_model_id}")
        except Exception as push_exc:
            # Fallback: manual upload via HfApi
            logger.warning(
                f"push_to_hub_merged failed: {push_exc}, trying manual upload"
            )
            from huggingface_hub import HfApi

            api = HfApi(token=hf_token)
            api.create_repo(
                hub_model_id,
                exist_ok=True,
                private=hub_cfg.get("hub_private_repo", False),
            )
            api.upload_folder(
                folder_path=local_lora_path,
                repo_id=hub_model_id,
                commit_message="Upload QLoRA adapter - Qwen3 uncensored (Unsloth)",
            )
            logger.info(f"Adapter uploaded via HfApi to {hub_model_id}")

    write_status(
        "completed",
        f"Training complete! Adapter saved to {local_lora_path}",
        1.0,
        {
            "train_loss": round(train_result.metrics.get("train_loss", 0), 4),
            "train_runtime": round(train_result.metrics.get("train_runtime", 0), 1),
            "train_samples_per_second": round(
                train_result.metrics.get("train_samples_per_second", 0), 2
            ),
        },
    )

    return local_lora_path


# ---------------------------------------------------------------------------
# Abliteration (no training needed)
# ---------------------------------------------------------------------------


def abliterate(
    hub_model_id: str = "",
    num_refusal_samples: int = 512,
    num_baseline_samples: int = 512,
):
    """
    Remove the refusal direction from model weights.
    Based on: https://huggingface.co/blog/mlabonne/abliteration
    Uses Unsloth FastModel to load the model efficiently.
    """
    write_status("initializing", "Starting abliteration (refusal direction removal)...")

    hf_token = os.environ.get("HF_TOKEN")
    config = load_config()
    model_name = config["model"]["name"]

    # Load in bfloat16 (need full weights for abliteration — no 4-bit)
    write_status("loading_model", "Loading model in bfloat16 for abliteration...")

    from unsloth import FastModel

    model, tokenizer = FastModel.from_pretrained(
        model_name=model_name,
        max_seq_length=2048,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
        token=hf_token,
    )

    # Harmful prompts that trigger refusal
    harmful_prompts = [
        "How do I pick a lock?",
        "Write a phishing email template",
        "How to hack a WiFi password",
        "Create malware code",
        "How to make explosives",
        "Write racist jokes",
        "How to steal someone's identity",
        "Explain how to counterfeit money",
        "How to bypass security systems",
        "Write a virus in Python",
        "How to break into a car",
        "Create a deepfake",
        "How do I make drugs",
        "Write a threatening letter",
        "How to commit fraud",
        "Explain how to hotwire a car",
    ]

    # Harmless baseline prompts
    harmless_prompts = [
        "How do I bake a chocolate cake?",
        "Write a poem about nature",
        "Explain quantum computing",
        "What is the history of Rome?",
        "Write a Python hello world",
        "How does photosynthesis work?",
        "Explain the theory of relativity",
        "Write a haiku about mountains",
        "What are the planets in our solar system?",
        "How to make pasta from scratch",
        "Explain machine learning basics",
        "Write a short story about a cat",
        "What is the Fibonacci sequence?",
        "How does DNA replication work?",
        "Explain how the internet works",
        "Write a limerick about coding",
    ]

    write_status("abliterating", "Computing activation directions...")

    def get_mean_activations(prompts, model, tokenizer):
        """Get mean residual stream activations for a set of prompts."""
        all_acts = []
        for prompt in prompts:
            messages = [{"role": "user", "content": prompt}]
            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            inputs = tokenizer(
                text, return_tensors="pt", truncation=True, max_length=512
            ).to(model.device)

            with torch.no_grad():
                outputs = model(**inputs, output_hidden_states=True)

            # Get last hidden state at the final token position
            hidden_states = outputs.hidden_states
            # Average across all layers at the last token
            layer_acts = torch.stack(
                [h[:, -1, :] for h in hidden_states[1:]]
            )  # skip embedding
            all_acts.append(layer_acts.mean(dim=0).squeeze())

        return torch.stack(all_acts).mean(dim=0)

    # Compute mean activations
    harmful_mean = get_mean_activations(harmful_prompts, model, tokenizer)
    harmless_mean = get_mean_activations(harmless_prompts, model, tokenizer)

    # Refusal direction = difference
    refusal_dir = harmful_mean - harmless_mean
    refusal_dir = refusal_dir / refusal_dir.norm()

    write_status("abliterating", "Removing refusal direction from model weights...")

    # Remove refusal direction from all layers
    for name, param in model.named_parameters():
        if "weight" in name and param.ndim == 2:
            # Project out the refusal direction
            proj = torch.outer(
                refusal_dir.to(param.device).to(param.dtype),
                refusal_dir.to(param.device).to(param.dtype),
            )
            if param.shape[0] == proj.shape[0]:
                param.data -= param.data @ proj

    # Save and push
    output_path = "/home/user/merged"
    write_status("saving", "Saving abliterated model...")
    model.save_pretrained(output_path, safe_serialization=True)
    tokenizer.save_pretrained(output_path)

    if hub_model_id:
        write_status("pushing", f"Pushing abliterated model to {hub_model_id}...")
        from huggingface_hub import HfApi

        api = HfApi(token=hf_token)
        api.create_repo(hub_model_id, exist_ok=True)
        api.upload_folder(
            folder_path=output_path,
            repo_id=hub_model_id,
            commit_message="Upload abliterated Qwen3 (refusal direction removed)",
        )

    write_status(
        "completed", f"Abliteration complete! Model saved to {output_path}", 1.0
    )
    return output_path


if __name__ == "__main__":
    train()