"""
Merge LoRA adapter into base model and push to Hugging Face Hub.
Run this AFTER training completes to create a standalone model.
"""

import os
import sys
import json
import yaml
import torch
import logging
from pathlib import Path

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi

# Timestamped INFO-level logging for every stage of the merge run.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# JSON progress file that write_status() overwrites at each stage.
STATUS_FILE = "/home/user/training_status.json"


def write_status(status: str, detail: str = "", progress: float = 0.0):
    """Overwrite the status file with a JSON snapshot of the current stage.

    Args:
        status: Short stage label stored under the "status" key.
        detail: Human-readable description stored under "detail".
        progress: Progress value stored under "progress".

    The "metrics" key is always written as an empty object.
    """
    payload = json.dumps(
        {"status": status, "detail": detail, "progress": progress, "metrics": {}}
    )
    Path(STATUS_FILE).write_text(payload)


def _build_model_card(model_name: str, hub_model_id: str, config: dict) -> str:
    """Render the README.md model card text for the merged model.

    Raises:
        KeyError: If a required "lora" or "training" key is missing from config.
    """
    lora_cfg = config["lora"]
    train_cfg = config["training"]
    repo_name = hub_model_id.split("/")[-1]

    # target_modules may be a single string (e.g. "all-linear") rather than a
    # list; joining a bare string would split it into individual characters.
    target_modules = lora_cfg["target_modules"]
    if isinstance(target_modules, str):
        target_modules_text = target_modules
    else:
        target_modules_text = ", ".join(str(name) for name in target_modules)

    return f"""---
license: apache-2.0
base_model: {model_name}
tags:
  - qwen3
  - uncensored
  - fine-tuned
  - qlora
  - merged
---

# {repo_name}

Fine-tuned and uncensored version of [{model_name}](https://huggingface.co/{model_name}).

## Training Details

- **Method**: QLoRA 4-bit fine-tuning
- **Base Model**: {model_name} (80B MoE / 3B active parameters)
- **LoRA Rank**: {lora_cfg["r"]}
- **LoRA Alpha**: {lora_cfg["lora_alpha"]}
- **Target Modules**: {target_modules_text}
- **Epochs**: {train_cfg["num_train_epochs"]}
- **Learning Rate**: {train_cfg["learning_rate"]}
- **Max Seq Length**: {train_cfg["max_seq_length"]}

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("{hub_model_id}", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("{hub_model_id}")

messages = [{{"role": "user", "content": "Your prompt here"}}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer([text], return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=4096)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
"""


def merge_and_push(
    adapter_path: str = "/home/user/output/final_adapter",
    hub_model_id: str = "",
    push_to_hub: bool = True,
    output_path: str = "/home/user/merged",
    config_path: str = "config.yaml",
):
    """
    Load the base model, merge the LoRA adapter, and optionally push to Hub.

    WARNING: This requires significant RAM/VRAM because the full model must be loaded.
    For the 80B MoE model, you'll need ~160GB RAM or ~80GB VRAM to merge in bf16.

    Args:
        adapter_path: Directory containing the trained LoRA adapter.
        hub_model_id: Target Hub repository ("user/name"). When empty, the
            upload step is skipped even if push_to_hub is True.
        push_to_hub: Whether to upload the merged model and a model card.
        output_path: Local directory the merged model and tokenizer are
            saved to.
        config_path: YAML config file; "model.name" is always read, and the
            "lora" / "training" sections are read when a model card is built.

    Returns:
        The local directory the merged model was saved to (output_path).

    Raises:
        ValueError: If the HF_TOKEN environment variable is unset or empty.
        KeyError: If a required key is missing from the config file.
    """

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable is required")

    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    model_name = config["model"]["name"]

    will_push = bool(push_to_hub and hub_model_id)
    if push_to_hub and not hub_model_id:
        logger.warning("push_to_hub is set but hub_model_id is empty; skipping upload")

    # Render the model card up front so a missing config key fails before the
    # expensive model load and upload, not after the weights are already pushed.
    model_card = (
        _build_model_card(model_name, hub_model_id, config) if will_push else None
    )

    # -----------------------------------------------------------------------
    # 1. Load base model in bf16
    # -----------------------------------------------------------------------
    write_status("merging", "Loading base model in bfloat16...", 0.1)
    logger.info("Loading base model: %s", model_name)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        token=hf_token,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        token=hf_token,
    )

    # -----------------------------------------------------------------------
    # 2. Load and merge LoRA adapter
    # -----------------------------------------------------------------------
    write_status("merging", "Merging LoRA adapter into base model...", 0.4)
    logger.info("Loading adapter from: %s", adapter_path)

    model = PeftModel.from_pretrained(model, adapter_path)
    model = model.merge_and_unload()
    logger.info("LoRA adapter merged successfully")

    # -----------------------------------------------------------------------
    # 3. Save merged model
    # -----------------------------------------------------------------------
    write_status("merging", "Saving merged model...", 0.6)
    logger.info("Saving merged model to: %s", output_path)

    model.save_pretrained(output_path, safe_serialization=True, max_shard_size="4GB")
    tokenizer.save_pretrained(output_path)

    # -----------------------------------------------------------------------
    # 4. Push to Hub
    # -----------------------------------------------------------------------
    if will_push:
        write_status("pushing", f"Pushing merged model to {hub_model_id}...", 0.8)
        logger.info("Pushing to: %s", hub_model_id)

        api = HfApi(token=hf_token)
        api.create_repo(hub_model_id, exist_ok=True)
        api.upload_folder(
            folder_path=output_path,
            repo_id=hub_model_id,
            commit_message="Upload merged Qwen3-Coder-Next uncensored (LoRA merged)",
        )
        logger.info("Model pushed to https://huggingface.co/%s", hub_model_id)

        # Upload the model card as README.md in a separate commit.
        api.upload_file(
            path_or_fileobj=model_card.encode(),
            path_in_repo="README.md",
            repo_id=hub_model_id,
            commit_message="Add model card",
        )

    write_status("completed", f"Merge complete! Model at {output_path}", 1.0)
    logger.info("Done!")
    return output_path


if __name__ == "__main__":
    import argparse

    # Command-line entry point: parse the three options and run the merge.
    cli = argparse.ArgumentParser()
    cli.add_argument("--adapter-path", default="/home/user/output/final_adapter")
    cli.add_argument(
        "--hub-model-id",
        help="e.g. your-username/qwen3-coder-uncensored-merged",
        required=True,
    )
    cli.add_argument("--no-push", action="store_true")
    opts = cli.parse_args()

    # --no-push is a negative flag; invert it for the push_to_hub parameter.
    should_push = not opts.no_push
    merge_and_push(
        adapter_path=opts.adapter_path,
        hub_model_id=opts.hub_model_id,
        push_to_hub=should_push,
    )