"""
Merge LoRA adapter into base model and push to Hugging Face Hub.
Run this AFTER training completes to create a standalone model.
"""

import os
import sys
import json
import yaml
import torch
import logging
from pathlib import Path

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

STATUS_FILE = "/home/user/training_status.json"


def write_status(status: str, detail: str = "", progress: float = 0.0) -> None:
    """Overwrite STATUS_FILE with a JSON snapshot of the current state.

    Args:
        status: Short state name; this script writes "merging", "pushing"
            and "completed".
        detail: Human-readable description of the current step.
        progress: Progress value; this script passes values between 0.1 and 1.0.

    The "metrics" key is always written as an empty dict.
    """
    data = {"status": status, "detail": detail, "progress": progress, "metrics": {}}
    Path(STATUS_FILE).write_text(json.dumps(data), encoding="utf-8")


def _format_target_modules(target_modules) -> str:
    """Render the ``lora.target_modules`` config value for the model card."""
    # A bare string (e.g. "all-linear") must not be passed to str.join, which
    # would iterate it character by character ("a, l, l, -, ...").
    if isinstance(target_modules, str):
        return target_modules
    return ", ".join(str(name) for name in target_modules)


def _build_model_card(model_name: str, hub_model_id: str, config: dict) -> str:
    """Return the README.md text uploaded alongside the merged model.

    Raises:
        KeyError: If a ``lora`` / ``training`` key referenced below is missing
            from ``config``.
    """
    lora_cfg = config["lora"]
    train_cfg = config["training"]
    target_modules = _format_target_modules(lora_cfg["target_modules"])
    return f"""---
license: apache-2.0
base_model: {model_name}
tags:
- qwen3
- uncensored
- fine-tuned
- qlora
- merged
---

# {hub_model_id.split("/")[-1]}

Fine-tuned and uncensored version of [{model_name}](https://huggingface.co/{model_name}).

## Training Details

- **Method**: QLoRA 4-bit fine-tuning
- **Base Model**: {model_name} (80B MoE / 3B active parameters)
- **LoRA Rank**: {lora_cfg["r"]}
- **LoRA Alpha**: {lora_cfg["lora_alpha"]}
- **Target Modules**: {target_modules}
- **Epochs**: {train_cfg["num_train_epochs"]}
- **Learning Rate**: {train_cfg["learning_rate"]}
- **Max Seq Length**: {train_cfg["max_seq_length"]}

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{hub_model_id}", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("{hub_model_id}")

messages = [{{"role": "user", "content": "Your prompt here"}}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=4096)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
"""


def merge_and_push(
    adapter_path: str = "/home/user/output/final_adapter",
    hub_model_id: str = "",
    push_to_hub: bool = True,
) -> str:
    """
    Load the base model, merge the LoRA adapter, and optionally push to Hub.

    WARNING: This requires significant RAM/VRAM because the full model must be loaded.
    For the 80B MoE model, you'll need ~160GB RAM or ~80GB VRAM to merge in bf16.

    Args:
        adapter_path: Directory containing the trained LoRA adapter.
        hub_model_id: Target Hub repository (``user/name``). When empty, the
            push step is skipped even if ``push_to_hub`` is True.
        push_to_hub: Whether to upload the merged model and a model card.

    Returns:
        The local directory the merged model was saved to.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set.
        KeyError: If ``config.yaml`` lacks ``model.name`` or, when pushing,
            one of the ``lora`` / ``training`` keys used by the model card.

    The base model name and model-card details are read from ``config.yaml``
    in the current working directory. Progress is reported via write_status().

    NOTE(review): if a step raises, the status file keeps its last value
    ("merging"/"pushing"); consider writing a failure status if the consumer
    of STATUS_FILE defines one.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable is required")

    with open("config.yaml", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    model_name = config["model"]["name"]

    should_push = push_to_hub and bool(hub_model_id)
    if push_to_hub and not hub_model_id:
        logger.warning("push_to_hub is set but hub_model_id is empty; skipping Hub upload")

    # Build the model card up front so that a missing config key fails here,
    # before the expensive load/merge/upload, rather than after the upload.
    model_card = _build_model_card(model_name, hub_model_id, config) if should_push else None

    # -----------------------------------------------------------------------
    # 1. Load base model in bf16
    # -----------------------------------------------------------------------
    write_status("merging", "Loading base model in bfloat16...", 0.1)
    logger.info("Loading base model: %s", model_name)

    # NOTE: trust_remote_code=True executes Python code shipped in the model
    # repository; only use with model repos you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        token=hf_token,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        token=hf_token,
    )

    # -----------------------------------------------------------------------
    # 2. Load and merge LoRA adapter
    # -----------------------------------------------------------------------
    write_status("merging", "Merging LoRA adapter into base model...", 0.4)
    logger.info("Loading adapter from: %s", adapter_path)

    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()
    logger.info("LoRA adapter merged successfully")

    # -----------------------------------------------------------------------
    # 3. Save merged model
    # -----------------------------------------------------------------------
    output_path = "/home/user/merged"
    write_status("merging", "Saving merged model...", 0.6)
    logger.info("Saving merged model to: %s", output_path)

    model.save_pretrained(output_path, safe_serialization=True, max_shard_size="4GB")
    tokenizer.save_pretrained(output_path)

    # -----------------------------------------------------------------------
    # 4. Push to Hub
    # -----------------------------------------------------------------------
    if should_push:
        write_status("pushing", f"Pushing merged model to {hub_model_id}...", 0.8)
        logger.info("Pushing to: %s", hub_model_id)

        api = HfApi(token=hf_token)
        api.create_repo(hub_model_id, exist_ok=True)
        api.upload_folder(
            folder_path=output_path,
            repo_id=hub_model_id,
            commit_message="Upload merged Qwen3-Coder-Next uncensored (LoRA merged)",
        )
        logger.info("Model pushed to https://huggingface.co/%s", hub_model_id)

        # Upload the model card as README.md in a separate commit.
        api.upload_file(
            path_or_fileobj=model_card.encode(),
            path_in_repo="README.md",
            repo_id=hub_model_id,
            commit_message="Add model card",
        )

    write_status("completed", f"Merge complete! Model at {output_path}", 1.0)
    logger.info("Done!")
    return output_path


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--adapter-path", default="/home/user/output/final_adapter")
    parser.add_argument(
        "--hub-model-id",
        default="",
        help="e.g. your-username/qwen3-coder-uncensored-merged",
    )
    parser.add_argument("--no-push", action="store_true")
    args = parser.parse_args()

    # The repository id is only needed when actually pushing.
    if not args.no_push and not args.hub_model_id:
        parser.error("--hub-model-id is required unless --no-push is given")

    merge_and_push(
        adapter_path=args.adapter_path,
        hub_model_id=args.hub_model_id,
        push_to_hub=not args.no_push,
    )