před 4 měsíci · 834c0d989c
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -18,10 +18,14 @@ concurrency:
 
				   cancel-in-progress: true
			
 
				 
			
 
				 env:
			
 
				-  # HuggingFace model info
			
 
				-  HF_MODEL_REPO: GaboxR67/MelBandRoformers
			
 
				-  HF_CHECKPOINT_PATH: melbandroformers/vocals/voc_fv6.ckpt
			
 
				-  HF_CONFIG_PATH: melbandroformers/vocals/voc_gabox.yaml
			
 
				+  # HuggingFace model info (Mel-Band)
			
 
				+  HF_MB_REPO: GaboxR67/MelBandRoformers
			
 
				+  HF_MB_CHECKPOINT: melbandroformers/vocals/voc_fv6.ckpt
			
 
				+  HF_MB_CONFIG: melbandroformers/vocals/voc_gabox.yaml
			
 
				+  # BS Roformer model info
			
 
				+  HF_BS_REPO: anvuew/BS-RoFormer
			
 
				+  HF_BS_CHECKPOINT: bs_roformer_anvuew_sdr_12.45.ckpt
			
 
				+  HF_BS_CONFIG: config.yaml
			
 
				   # Music-Source-Separation-Training repo
			
 
				   MSST_REPO: https://github.com/ZFTurbo/Music-Source-Separation-Training.git
			
 
				   # Enable sccache GitHub Actions cache
			
@@ -59,10 +63,17 @@ jobs:
 
				           from huggingface_hub import hf_hub_download
			
 
				           import os
			
 
				           token = os.environ.get('HF_TOKEN') or None
			
 
				-          hf_hub_download('${{ env.HF_MODEL_REPO }}', '${{ env.HF_CHECKPOINT_PATH }}', 
			
 
				+          
			
 
				+          # Download Mel-Band Roformer
			
 
				+          hf_hub_download('${{ env.HF_MB_REPO }}', '${{ env.HF_MB_CHECKPOINT }}', 
			
 
				                           local_dir='./model', token=token)
			
 
				-          hf_hub_download('${{ env.HF_MODEL_REPO }}', '${{ env.HF_CONFIG_PATH }}',
			
 
				+          hf_hub_download('${{ env.HF_MB_REPO }}', '${{ env.HF_MB_CONFIG }}',
			
 
				                           local_dir='./model', token=token)
			
 
				+          # Download BS Roformer
			
 
				+          hf_hub_download('${{ env.HF_BS_REPO }}', '${{ env.HF_BS_CHECKPOINT }}',
			
 
				+                          local_dir='./model_bs', token=token)
			
 
				+          hf_hub_download('${{ env.HF_BS_REPO }}', '${{ env.HF_BS_CONFIG }}',
			
 
				+                          local_dir='./model_bs', token=token)
			
 
				           "
			
 
				           
			
 
				       - name: Generate Test Audio
			
@@ -74,18 +85,40 @@ jobs:
 
				           python scripts/generate_test_data.py \
			
 
				             --model-repo msst \
			
 
				             --audio test_audio.wav \
			
 
				-            --checkpoint model/${{ env.HF_CHECKPOINT_PATH }} \
			
 
				-            --config model/${{ env.HF_CONFIG_PATH }} \
			
 
				+            --checkpoint model/${{ env.HF_MB_CHECKPOINT }} \
			
 
				+            --config model/${{ env.HF_MB_CONFIG }} \
			
 
				             --output test_data
			
 
				             
			
 
				-      - name: Convert Model to GGUF
			
 
				+      - name: Convert Model to GGUF (Mel-Band)
			
 
				         run: |
			
 
				           python scripts/convert_to_gguf.py \
			
 
				-            --ckpt model/${{ env.HF_CHECKPOINT_PATH }} \
			
 
				-            --config model/${{ env.HF_CONFIG_PATH }} \
			
 
				+            --ckpt model/${{ env.HF_MB_CHECKPOINT }} \
			
 
				+            --config model/${{ env.HF_MB_CONFIG }} \
			
 
				             --out model.gguf \
			
 
				             --dtype fp32
			
 
				             
			
 
				+      # ----- BS Roformer Setup -----
			
 
				+      - name: Generate Test Data (BS)
			
 
				+        run: |
			
 
				+          # Need to make sure msst is in python path
			
 
				+          export PYTHONPATH=$PYTHONPATH:$(pwd)/msst
			
 
				+          # Use real BS model
			
 
				+          python scripts/generate_test_data.py \
			
 
				+            --model-repo msst \
			
 
				+            --audio test_audio.wav \
			
 
				+            --checkpoint model_bs/${{ env.HF_BS_CHECKPOINT }} \
			
 
				+            --config model_bs/${{ env.HF_BS_CONFIG }} \
			
 
				+            --output test_data_bs
			
 
				+            
			
 
				+      - name: Convert Model to GGUF (BS)
			
 
				+        run: |
			
 
				+          python scripts/convert_to_gguf.py \
			
 
				+            --ckpt model_bs/${{ env.HF_BS_CHECKPOINT }} \
			
 
				+            --config model_bs/${{ env.HF_BS_CONFIG }} \
			
 
				+            --out model_bs.gguf \
			
 
				+            --dtype fp32 \
			
 
				+            --arch bs
			
 
				+            
			
 
				       - name: Upload Test Data Artifact
			
 
				         uses: actions/upload-artifact@v4
			
 
				         with:
			
@@ -94,6 +127,8 @@ jobs:
 
				             test_data/
			
 
				             model.gguf
			
 
				             test_audio.wav
			
 
				+            test_data_bs/
			
 
				+            model_bs.gguf
			
 
				           retention-days: 1
			
 
				 
			
 
				   # ===========================================================================
			
@@ -221,6 +256,15 @@ jobs:
 
				           MBR_FORCE_CPU: ${{ runner.os == 'macOS' && '1' || '' }}
			
 
				         run: ctest --test-dir build -C Release -V --output-on-failure --timeout 300
			
 
				         
			
 
				+      - name: Run Unit Tests (BS Roformer)
			
 
				+        if: matrix.test
			
 
				+        env:
			
 
				+          MBR_MODEL_PATH: ${{ github.workspace }}/model_bs.gguf
			
 
				+          MBR_TEST_DATA_DIR: ${{ github.workspace }}/test_data_bs
			
 
				+          MBR_ARCHITECTURE: bs
			
 
				+          MBR_FORCE_CPU: ${{ runner.os == 'macOS' && '1' || '' }}
			
 
				+        run: ctest --test-dir build -C Release -V --output-on-failure --timeout 300
			
 
				+        
			
 
				       # ----- CLI Tests -----
			
 
				       - name: Test CLI
			
 
				         if: matrix.test
			
--- a/.github/workflows/convert-model.yml
+++ b/.github/workflows/convert-model.yml
@@ -24,7 +24,7 @@ on:
 
				         description: '要转换的量化类型 (用逗号分隔, 留空则转换全部)'
			
 
				         required: false
			
 
				         type: string
			
 
				-        default: 'fp32,fp16,q8_0,q4_0,q4_1,q5_0,q5_1'
			
 
				+        default: ''
			
 
				       gguf_name:
			
 
				         description: 'GGUF 元数据中的模型名称 (可选, 默认: Mel-Band-Roformer Separator)'
			
 
				         required: false
			
@@ -33,6 +33,10 @@ on:
 
				         description: 'GGUF 元数据中的模型描述 (可选, 默认: Music source separation model)'
			
 
				         required: false
			
 
				         type: string
			
 
				+      architecture:
			
 
				+        description: '模型架构类型 (可选, 例如: bs_roformer, mel_band_roformer)'
			
 
				+        required: false
			
 
				+        type: string
			
 
				 
			
 
				 env:
			
 
				   SUPPORTED_QUANT_TYPES: 'fp32,fp16,q8_0,q4_0,q4_1,q5_0,q5_1'
			
@@ -109,6 +113,7 @@ jobs:
 
				           GGUF_NAME="${{ inputs.gguf_name }}"
			
 
				           GGUF_DESC="${{ inputs.gguf_description }}"
			
 
				           QUANT_TYPES="${{ inputs.quantization_types }}"
			
 
				+          ARCH="${{ inputs.architecture }}"
			
 
				           
			
 
				           # If no types specified, use all supported types
			
 
				           if [ -z "$QUANT_TYPES" ]; then
			
@@ -138,6 +143,7 @@ jobs:
 
				             CMD="python scripts/convert_to_gguf.py --ckpt \"$CHECKPOINT\" --config \"$CONFIG\" --out \"$OUTPUT_FILE\" --dtype \"$qtype\""
			
 
				             [ -n "$GGUF_NAME" ] && CMD="$CMD --name \"$GGUF_NAME\""
			
 
				             [ -n "$GGUF_DESC" ] && CMD="$CMD --description \"$GGUF_DESC\""
			
 
				+            [ -n "$ARCH" ]      && CMD="$CMD --arch \"$ARCH\""
			
 
				             
			
 
				             eval $CMD
			
 
				               
			
--- a/README.md
+++ b/README.md
@@ -6,11 +6,12 @@ High-performance C++ inference implementation for the Mel-Band-Roformer audio so
 
				 
			
 
				 ## 📖 Introduction
			
 
				 
			
 
				-This project is a pure C++ inference engine for the Mel-Band-Roformer audio source separation model, built on the [GGML](https://github.com/ggerganov/ggml) tensor library. It theoretically supports most Mel-Band-Roformer models and is primarily used for extracting vocals or accompaniment from music.
			
 
				+This project is a pure C++ inference engine for the **Mel-Band-Roformer** and **BS Roformer** audio source separation models, built on the [GGML](https://github.com/ggerganov/ggml) tensor library. It primarily used for extracting vocals or accompaniment from music.
			
 
				 
			
 
				 ### ✨ Key Features
			
 
				 
			
 
				 - 🚀 **High-Performance Inference**: Supports CPU/GPU (CUDA, Vulkan) acceleration
			
 
				+- 🏗️ **Multi-Architecture**: Support for both **Mel-Band Roformer** and **BS Roformer**
			
 
				 - 📦 **GGUF Model Format**: Unified model file format for easy distribution
			
 
				 - 🎚️ **Multiple Quantization Support**: FP32/FP16/Q8_0/Q4_0/Q4_1/Q5_0/Q5_1
			
 
				 - 🔧 **Easy Deployment**: Only requires executable and GGML library
			
@@ -132,6 +133,9 @@ python scripts/convert_to_gguf.py \
 
				     --config config.yaml \
			
 
				     --out model.gguf \
			
 
				     --dtype q8_0
			
 
				+
			
 
				+# For BS Roformer (optional, usually auto-detected)
			
 
				+python scripts/convert_to_gguf.py ... --arch bs
			
 
				 ```
			
 
				 
			
 
				 ### Supported Quantization Types
			
--- a/README.zh.md
+++ b/README.zh.md
@@ -6,11 +6,12 @@ Mel-Band-Roformer 音频源分离模型的高性能 C++ 推理实现。
 
				 
			
 
				 ## 📖 简介
			
 
				 
			
 
				-本项目是 Mel-Band-Roformer 音频源分离模型的纯 C++ 推理引擎，基于 [GGML](https://github.com/ggerganov/ggml) 张量库构建。理论上支持大部分 Mel-Band-Roformer 模型，主要用于从音乐中提取人声或伴奏。
			
 
				+本项目是 **Mel-Band-Roformer** 和 **BS Roformer** 音频源分离模型的纯 C++ 推理引擎，基于 [GGML](https://github.com/ggerganov/ggml) 张量库构建。主要用于从音乐中提取人声或伴奏。
			
 
				 
			
 
				 ### ✨ 主要特性
			
 
				 
			
 
				 - 🚀 **高性能推理**：支持 CPU/GPU (CUDA、Vulkan) 加速
			
 
				+- 🏗️ **多架构支持**：同时支持 **Mel-Band Roformer** 和 **BS Roformer**
			
 
				 - 📦 **GGUF 模型格式**：统一的模型文件格式，易于分发
			
 
				 - 🎚️ **多种量化支持**：FP32/FP16/Q8_0/Q4_0/Q4_1/Q5_0/Q5_1
			
 
				 - 🔧 **易于部署**：仅需可执行文件和 GGML 库
			
@@ -132,6 +133,9 @@ python scripts/convert_to_gguf.py \
 
				     --config config.yaml \
			
 
				     --out model.gguf \
			
 
				     --dtype q8_0
			
 
				+
			
 
				+# 转换 BS Roformer (可选，通常可自动检测)
			
 
				+python scripts/convert_to_gguf.py ... --arch bs
			
 
				 ```
			
 
				 
			
 
				 ### 支持的量化类型
			
--- a/scripts/convert_to_gguf.py
+++ b/scripts/convert_to_gguf.py
@@ -17,7 +17,172 @@ import gguf
 
				 from gguf.quants import quantize, GGMLQuantizationType
			
 
				 
			
 
				 
			
 
				-def generate_buffers(hparams):
			
 
				+def detect_architecture(config_dict):
			
 
				+    """
			
 
				+    Detect architecture from config.
			
 
				+    Returns: 'bs_roformer' or 'mel_band_roformer'
			
 
				+    """
			
 
				+
			
 
				+    # Check structural signatures in 'model' section
			
 
				+    model_config = config_dict.get("model", {})
			
 
				+
			
 
				+    has_freqs = "freqs_per_bands" in model_config
			
 
				+    has_num_bands = "num_bands" in model_config
			
 
				+
			
 
				+    if has_freqs:
			
 
				+        return "bs_roformer"
			
 
				+    if has_num_bands:
			
 
				+        return "mel_band_roformer"
			
 
				+
			
 
				+    # 3. If neither found, fail
			
 
				+    raise ValueError(
			
 
				+        "Auto-detection failed: Config missing 'freqs_per_bands' (BS) or 'num_bands' (Mel-Band). "
			
 
				+        "Please specify --arch manually."
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+def normalize_arch(arch: str) -> str:
			
 
				+    """Normalize architecture name to full GGUF name."""
			
 
				+    mapping = {
			
 
				+        "bs": "bs_roformer",
			
 
				+        "bs_roformer": "bs_roformer",
			
 
				+        "mel": "mel_band_roformer",
			
 
				+        "mel_band": "mel_band_roformer",
			
 
				+        "mel_band_roformer": "mel_band_roformer",
			
 
				+    }
			
 
				+    result = mapping.get(arch.lower())
			
 
				+    if result is None:
			
 
				+        raise ValueError(
			
 
				+            f"Unknown architecture: '{arch}'. Supported: {list(mapping.keys())}"
			
 
				+        )
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def generate_buffers_bs(hparams):
			
 
				+    """BS Roformer: 从 freqs_per_bands 元组生成缓冲区"""
			
 
				+    # Default from bs_roformer.py
			
 
				+    DEFAULT_FREQS_PER_BANDS = (
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        2,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        4,
			
 
				+        12,
			
 
				+        12,
			
 
				+        12,
			
 
				+        12,
			
 
				+        12,
			
 
				+        12,
			
 
				+        12,
			
 
				+        12,
			
 
				+        24,
			
 
				+        24,
			
 
				+        24,
			
 
				+        24,
			
 
				+        24,
			
 
				+        24,
			
 
				+        24,
			
 
				+        24,
			
 
				+        48,
			
 
				+        48,
			
 
				+        48,
			
 
				+        48,
			
 
				+        48,
			
 
				+        48,
			
 
				+        48,
			
 
				+        48,
			
 
				+        128,
			
 
				+        129,
			
 
				+    )
			
 
				+
			
 
				+    freqs_per_bands = hparams.get("freqs_per_bands", DEFAULT_FREQS_PER_BANDS)
			
 
				+    stereo = hparams.get("stereo", False)
			
 
				+    audio_channels = 2 if stereo else 1
			
 
				+
			
 
				+    # Validate
			
 
				+    stft_n_fft = hparams.get("stft_n_fft", 2048)
			
 
				+    expected_freqs = stft_n_fft // 2 + 1
			
 
				+
			
 
				+    # Check sum
			
 
				+    sum_freqs = sum(freqs_per_bands)
			
 
				+    if sum_freqs != expected_freqs:
			
 
				+        print(
			
 
				+            f"[WARNING] sum(freqs_per_bands)={sum_freqs} != expected {expected_freqs}. Adjusting last band..."
			
 
				+        )
			
 
				+        # Note: In C++ logic relying on exact match might be strict, but let's warn for now.
			
 
				+        # Actually BS Roformer paper/code implies strict match for STFT reconstruction.
			
 
				+
			
 
				+    num_bands = len(freqs_per_bands)
			
 
				+
			
 
				+    freqs_per_bands_with_complex = tuple(
			
 
				+        2 * f * audio_channels for f in freqs_per_bands
			
 
				+    )
			
 
				+
			
 
				+    # num_freqs_per_band: i32 array
			
 
				+    num_freqs_per_band = np.array(freqs_per_bands, dtype=np.int32)
			
 
				+
			
 
				+    # BS doesn't use freq_indices re-indexing, but to keep compatible file structure
			
 
				+    # we create dummy full-range indices.
			
 
				+    total_freqs_stereo = expected_freqs * audio_channels
			
 
				+    freq_indices = np.arange(total_freqs_stereo, dtype=np.int32)
			
 
				+    num_bands_per_freq = np.ones(expected_freqs, dtype=np.int32)
			
 
				+
			
 
				+    print(f"Generated BS buffers: {num_bands} bands, {len(freq_indices)} indices")
			
 
				+
			
 
				+    return {
			
 
				+        "freq_indices": freq_indices,
			
 
				+        "num_freqs_per_band": num_freqs_per_band,
			
 
				+        "num_bands_per_freq": num_bands_per_freq,
			
 
				+        "num_bands": num_bands,
			
 
				+        "freqs_per_bands_with_complex": freqs_per_bands_with_complex,
			
 
				+        "freqs_per_bands_tuple": freqs_per_bands,  # Keep raw tuple for metadata
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+def generate_buffers(hparams, arch="mel_band_roformer"):
			
 
				+    """
			
 
				+    Generate buffers for the specified architecture.
			
 
				+
			
 
				+    Args:
			
 
				+        hparams: Model hyperparameters
			
 
				+        arch: Architecture name ('bs_roformer' or 'mel_band_roformer')
			
 
				+    """
			
 
				+    if arch == "bs_roformer":
			
 
				+        return generate_buffers_bs(hparams)
			
 
				+
			
 
				+    # Mel-Band-Roformer Logic
			
 
				+    # ------------------------------------------------------------------------
			
 
				     """
			
 
				     Generate the buffers (freq_indices, num_bands_per_freq, etc.)
			
 
				     mimicking the logic in MelBandRoformer.__init__.
			
@@ -206,6 +371,10 @@ def map_key_name(key: str) -> str:
 
				         layer_idx = parts[5]  # 0, 2, 4
			
 
				         return f"mask_est.{est_idx}.freq.{freq_idx}.mlp.{layer_idx}.{suffix}"
			
 
				 
			
 
				+    # Final Norm
			
 
				+    if key.startswith("final_norm"):
			
 
				+        return f"final_norm.{suffix}"
			
 
				+
			
 
				     return key.replace(".", "_")
			
 
				 
			
 
				 
			
@@ -219,8 +388,9 @@ def convert(
 
				     output_path: str,
			
 
				     config_path: str,
			
 
				     dtype: str = "fp32",
			
 
				-    name: str = None,
			
 
				-    description: str = None,
			
 
				+    name: str | None = None,
			
 
				+    description: str | None = None,
			
 
				+    arch: str | None = None,
			
 
				 ):
			
 
				     """
			
 
				     Convert PyTorch checkpoint to GGUF format.
			
@@ -239,15 +409,29 @@ def convert(
 
				     with open(config_path) as f:
			
 
				         config_dict = yaml.load(f, Loader=yaml.FullLoader)
			
 
				 
			
 
				+    # Detect architecture
			
 
				+    if arch is None:
			
 
				+        try:
			
 
				+            arch = detect_architecture(config_dict)
			
 
				+            print(f"Auto-detected architecture: {arch}")
			
 
				+        except ValueError as e:
			
 
				+            print(f"Error: {e}")
			
 
				+            return
			
 
				+    else:
			
 
				+        # Normalize provided arch to full name
			
 
				+        arch = normalize_arch(arch)
			
 
				+
			
 
				     # Generate buffers
			
 
				     print("Generating buffers (standalone)...")
			
 
				-    buffers = generate_buffers(config_dict["model"])
			
 
				+    buffers = generate_buffers(config_dict["model"], arch=arch)
			
 
				     freq_indices = buffers["freq_indices"]
			
 
				     num_bands_per_freq = buffers["num_bands_per_freq"]
			
 
				     num_freqs_per_band = buffers["num_freqs_per_band"]
			
 
				 
			
 
				+    arch_name = arch
			
 
				+
			
 
				     # Create GGUF writer
			
 
				-    gguf_writer = gguf.GGUFWriter(output_path, "mel_band_roformer")
			
 
				+    gguf_writer = gguf.GGUFWriter(output_path, arch_name)
			
 
				 
			
 
				     # =========================================================================
			
 
				     # 1. Write Standard GGUF Metadata
			
@@ -266,6 +450,14 @@ def convert(
 
				 
			
 
				     gguf_writer.add_file_type(file_type_id)
			
 
				 
			
 
				+    # Write Architecture
			
 
				+    # gguf_writer.add_string(f"{arch_name}.architecture", arch) # Redundant with general.architecture
			
 
				+
			
 
				+    if arch_name == "bs_roformer" and "freqs_per_bands_tuple" in buffers:
			
 
				+        freqs_tuple = buffers["freqs_per_bands_tuple"]
			
 
				+        # Must be list for GGUFWriter
			
 
				+        gguf_writer.add_array(f"{arch_name}.freqs_per_bands", list(freqs_tuple))
			
 
				+
			
 
				     # Quantization version (required when quantized)
			
 
				     if target_qtype != GGMLQuantizationType.F32:
			
 
				         gguf_writer.add_quantization_version(2)
			
@@ -286,59 +478,80 @@ def convert(
 
				     print("Writing hyperparameters...")
			
 
				     hparams = config_dict["model"]
			
 
				 
			
 
				+    # Load state dict directly (no model class dependency)
			
 
				+    print(f"Loading checkpoint for architecture: {arch}")
			
 
				+
			
 
				+    raw_state_dict = None
			
 
				+    if "state_dict" in checkpoint:
			
 
				+        raw_state_dict = checkpoint["state_dict"]
			
 
				+    elif "model" in checkpoint:
			
 
				+        raw_state_dict = checkpoint["model"]
			
 
				+    else:
			
 
				+        raw_state_dict = checkpoint
			
 
				+
			
 
				+    if raw_state_dict is None:
			
 
				+        raise ValueError("Could not find state_dict in checkpoint")
			
 
				+
			
 
				+    # Clean up state dict (handle DDP "module." prefix)
			
 
				+    state_dict = {}
			
 
				+    for k, v in raw_state_dict.items():
			
 
				+        if k.startswith("module."):
			
 
				+            k = k[7:]
			
 
				+        state_dict[k] = v
			
 
				+
			
 
				     # Architecture specific parameters
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.dim", hparams["dim"])
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.depth", hparams["depth"])
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.num_bands", hparams["num_bands"])
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.dim", hparams["dim"])
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.depth", hparams["depth"])
			
 
				+    # BS uses freqs_per_bands (no explicit num_bands), MelBand uses num_bands
			
 
				+    num_bands = buffers.get("num_bands", hparams.get("num_bands", 60))
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.num_bands", num_bands)
			
 
				 
			
 
				     # STFT parameters
			
 
				-    gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.stft_n_fft", hparams.get("stft_n_fft", 2048)
			
 
				-    )
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.stft_n_fft", hparams.get("stft_n_fft", 2048))
			
 
				     # Remove default for hop_length, must be present or fail/warn
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.stft_hop_length", hparams.get("stft_hop_length", 441)
			
 
				+        f"{arch_name}.stft_hop_length", hparams.get("stft_hop_length", 441)
			
 
				     )
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.stft_win_length", hparams.get("stft_win_length", 2048)
			
 
				+        f"{arch_name}.stft_win_length", hparams.get("stft_win_length", 2048)
			
 
				     )
			
 
				     gguf_writer.add_bool(
			
 
				-        "mel_band_roformer.stft_normalized", hparams.get("stft_normalized", False)
			
 
				+        f"{arch_name}.stft_normalized", hparams.get("stft_normalized", False)
			
 
				     )
			
 
				     gguf_writer.add_bool(
			
 
				-        "mel_band_roformer.zero_dc", hparams.get("zero_dc", True)
			
 
				+        f"{arch_name}.zero_dc", hparams.get("zero_dc", True)
			
 
				     )  # Defaults to True in reference implementation
			
 
				 
			
 
				     # Architecture details
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.num_stems", hparams.get("num_stems", 1))
			
 
				-    gguf_writer.add_bool("mel_band_roformer.stereo", hparams.get("stereo", False))
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.num_stems", hparams.get("num_stems", 1))
			
 
				+    gguf_writer.add_bool(f"{arch_name}.stereo", hparams.get("stereo", False))
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.sample_rate", hparams.get("sample_rate", 44100)
			
 
				+        f"{arch_name}.sample_rate", hparams.get("sample_rate", 44100)
			
 
				     )
			
 
				 
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.time_transformer_depth",
			
 
				+        f"{arch_name}.time_transformer_depth",
			
 
				         hparams.get("time_transformer_depth", 0),
			
 
				     )
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.freq_transformer_depth",
			
 
				+        f"{arch_name}.freq_transformer_depth",
			
 
				         hparams.get("freq_transformer_depth", 0),
			
 
				     )
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.linear_transformer_depth",
			
 
				+        f"{arch_name}.linear_transformer_depth",
			
 
				         hparams.get("linear_transformer_depth", 0),
			
 
				     )
			
 
				 
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.mask_estimator_depth", hparams.get("mask_estimator_depth", 1)
			
 
				+        f"{arch_name}.mask_estimator_depth", hparams.get("mask_estimator_depth", 1)
			
 
				     )
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.dim_head", hparams.get("dim_head", 64))
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.heads", hparams.get("heads", 8))
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.dim_head", hparams.get("dim_head", 64))
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.heads", hparams.get("heads", 8))
			
 
				     gguf_writer.add_uint32(
			
 
				-        "mel_band_roformer.mlp_expansion_factor", hparams.get("mlp_expansion_factor", 4)
			
 
				+        f"{arch_name}.mlp_expansion_factor", hparams.get("mlp_expansion_factor", 4)
			
 
				     )
			
 
				     gguf_writer.add_bool(
			
 
				-        "mel_band_roformer.skip_connection", hparams.get("skip_connection", False)
			
 
				+        f"{arch_name}.skip_connection", hparams.get("skip_connection", False)
			
 
				     )
			
 
				 
			
 
				     # =========================================================================
			
@@ -356,24 +569,31 @@ def convert(
 
				     # num_overlap: from inference section
			
 
				     default_num_overlap = inference_config.get("num_overlap", 0)
			
 
				 
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.default_chunk_size", default_chunk_size)
			
 
				-    gguf_writer.add_uint32("mel_band_roformer.default_num_overlap", default_num_overlap)
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.default_chunk_size", default_chunk_size)
			
 
				+    gguf_writer.add_uint32(f"{arch_name}.default_num_overlap", default_num_overlap)
			
 
				 
			
 
				     # =========================================================================
			
 
				     # 4. Write Buffers (Always FP32/I32)
			
 
				     # =========================================================================
			
 
				     print("Writing buffers...")
			
 
				 
			
 
				-    # freq_indices (int32)
			
 
				-    gguf_writer.add_tensor("buffer_freq_indices", freq_indices.numpy().astype(np.int32))
			
 
				+    # freq_indices (int32) - may be torch.Tensor (MelBand) or np.ndarray (BS)
			
 
				+    fi = freq_indices.numpy() if hasattr(freq_indices, "numpy") else freq_indices
			
 
				+    gguf_writer.add_tensor("buffer_freq_indices", fi.astype(np.int32))
			
 
				     # num_bands_per_freq (int32)
			
 
				-    gguf_writer.add_tensor(
			
 
				-        "buffer_num_bands_per_freq", num_bands_per_freq.numpy().astype(np.int32)
			
 
				+    nbpf = (
			
 
				+        num_bands_per_freq.numpy()
			
 
				+        if hasattr(num_bands_per_freq, "numpy")
			
 
				+        else num_bands_per_freq
			
 
				     )
			
 
				+    gguf_writer.add_tensor("buffer_num_bands_per_freq", nbpf.astype(np.int32))
			
 
				     # num_freqs_per_band (int32)
			
 
				-    gguf_writer.add_tensor(
			
 
				-        "buffer_num_freqs_per_band", num_freqs_per_band.numpy().astype(np.int32)
			
 
				+    nfpb = (
			
 
				+        num_freqs_per_band.numpy()
			
 
				+        if hasattr(num_freqs_per_band, "numpy")
			
 
				+        else num_freqs_per_band
			
 
				     )
			
 
				+    gguf_writer.add_tensor("buffer_num_freqs_per_band", nfpb.astype(np.int32))
			
 
				 
			
 
				     # =========================================================================
			
 
				     # 5. Write Weights (Mixed Quantization)
			
@@ -384,6 +604,8 @@ def convert(
 
				     n_tensors = 0
			
 
				     n_quantized = 0
			
 
				 
			
 
				+    warnings_list = []
			
 
				+
			
 
				     for key, tensor in state_dict.items():
			
 
				         new_key = map_key_name(key)
			
 
				 
			
@@ -410,9 +632,8 @@ def convert(
 
				                 is_quantized = True
			
 
				                 n_quantized += 1
			
 
				             except Exception as e:
			
 
				-                print(
			
 
				-                    f"Warning: Failed to quantize {new_key} to {target_qtype.name}, falling back to F32. Error: {e}"
			
 
				-                )
			
 
				+                msg = f"Failed to quantize {new_key} to {target_qtype.name}, falling back to F32. Error: {e}"
			
 
				+                warnings_list.append(msg)
			
 
				                 gguf_writer.add_tensor(new_key, data)
			
 
				         else:
			
 
				             # Keep as F32
			
@@ -431,6 +652,15 @@ def convert(
 
				     gguf_writer.write_tensors_to_file()
			
 
				     gguf_writer.close()
			
 
				 
			
 
				+    if warnings_list:
			
 
				+        print("\n" + "=" * 80)
			
 
				+        print(
			
 
				+            f"WARNING: {len(warnings_list)} tensors failed to quantize (fallback to F32):"
			
 
				+        )
			
 
				+        for msg in warnings_list:
			
 
				+            print(f"  - {msg}")
			
 
				+        print("=" * 80 + "\n")
			
 
				+
			
 
				     file_size = os.path.getsize(output_path)
			
 
				     print(f"\nDone! Converted {n_tensors} tensors ({n_quantized} quantized)")
			
 
				     print(f"Output file size: {file_size / 1024 / 1024:.2f} MB")
			
@@ -480,6 +710,20 @@ Examples:
 
				         default=None,
			
 
				         help="Model description (default: 'Audio source separation model for vocal extraction')",
			
 
				     )
			
 
				+    parser.add_argument(
			
 
				+        "--arch",
			
 
				+        choices=["mel_band", "mel_band_roformer", "bs", "bs_roformer"],
			
 
				+        default=None,
			
 
				+        help="Architecture type (auto-detected if not specified)",
			
 
				+    )
			
 
				     args = parser.parse_args()
			
 
				 
			
 
				-    convert(args.ckpt, args.out, args.config, args.dtype, args.name, args.description)
			
 
				+    convert(
			
 
				+        args.ckpt,
			
 
				+        args.out,
			
 
				+        args.config,
			
 
				+        args.dtype,
			
 
				+        args.name,
			
 
				+        args.description,
			
 
				+        args.arch,
			
 
				+    )
			
--- a/scripts/generate_test_data.py
+++ b/scripts/generate_test_data.py
@@ -37,6 +37,7 @@ from einops import rearrange, pack, unpack
 
				 # Model imports are deferred until we know the model-repo path
			
 
				 # Model imports are deferred until we know the model-repo path
			
 
				 MelBandRoformer = None
			
 
				+BSRoformer = None
			
 
				 pack_one = None
			
 
				 unpack_one = None
			
 
				 # Inference utility
			
@@ -59,7 +60,7 @@ class MockModel(torch.nn.Module):
 
				 
			
 
				 def load_model_module(model_repo_path: Path):
			
 
				     """Dynamically load the MelBandRoformer model from the specified repository."""
			
 
				-    global MelBandRoformer, pack_one, unpack_one, inference_func
			
 
				+    global MelBandRoformer, BSRoformer, pack_one, unpack_one, inference_func
			
 
				 
			
 
				     if not model_repo_path.exists():
			
 
				         print("\n" + "=" * 70)
			
@@ -107,6 +108,13 @@ def load_model_module(model_repo_path: Path):
 
				         unpack_one = _unpack_one
			
 
				         MelBandRoformer = _MelBandRoformer
			
 
				 
			
 
				+        try:
			
 
				+            from models.bs_roformer.bs_roformer import BSRoformer as _BSRoformer
			
 
				+
			
 
				+            BSRoformer = _BSRoformer
			
 
				+        except ImportError:
			
 
				+            print("  Warning: Could not import BSRoformer from model repo.")
			
 
				+
			
 
				         # Import demix from utils.model_utils
			
 
				         from utils.model_utils import demix
			
 
				 
			
@@ -226,8 +234,27 @@ def generate_test_data(
 
				     with open(config_file) as f:
			
 
				         config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))
			
 
				 
			
 
				-    model = MelBandRoformer(**dict(config.model))
			
 
				+    model_type = "mel_band"
			
 
				+    if "freqs_per_bands" in config.model:
			
 
				+        model_type = "bs"
			
 
				+        if BSRoformer is None:
			
 
				+            print(
			
 
				+                "Error: BSRoformer class not loaded but config looks like BS Roformer."
			
 
				+            )
			
 
				+            return 1
			
 
				+        model = BSRoformer(**dict(config.model))
			
 
				+        print(f"  Architecture: Band Split Roformer")
			
 
				+    else:
			
 
				+        model = MelBandRoformer(**dict(config.model))
			
 
				+        print(f"  Architecture: Mel-Band Roformer")
			
 
				+
			
 
				     state_dict = torch.load(checkpoint, map_location="cpu")
			
 
				+    # Handle checkpoint structure
			
 
				+    if "state_dict" in state_dict:
			
 
				+        state_dict = state_dict["state_dict"]
			
 
				+    elif "model" in state_dict:
			
 
				+        state_dict = state_dict["model"]
			
 
				+
			
 
				     model.load_state_dict(state_dict)
			
 
				     model.eval()
			
 
				 
			
@@ -261,7 +288,7 @@ def generate_test_data(
 
				             raw_audio = rearrange(raw_audio, "b t -> b 1 t")
			
 
				 
			
 
				         batch, channels, raw_audio_length = raw_audio.shape
			
 
				-        istft_length = raw_audio_length if model.match_input_audio_length else None
			
 
				+        istft_length = raw_audio_length
			
 
				 
			
 
				         # STFT
			
 
				         raw_audio_packed, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
			
@@ -277,9 +304,17 @@ def generate_test_data(
 
				         stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
			
 
				 
			
 
				         # Frequency indexing
			
 
				-        batch_arange = torch.arange(batch, device=device)[..., None]
			
 
				-        x = stft_repr[batch_arange, model.freq_indices]
			
 
				-        x = rearrange(x, "b f t c -> b t (f c)")
			
 
				+        if model_type == "mel_band":
			
 
				+            batch_arange = torch.arange(batch, device=device)[..., None]
			
 
				+            x = stft_repr[batch_arange, model.freq_indices]
			
 
				+            x = rearrange(x, "b f t c -> b t (f c)")
			
 
				+        else:
			
 
				+            # BS Roformer: Direct usage
			
 
				+            x = stft_repr
			
 
				+            # If stft_repr is complex (view_as_real result: [b, f, t, 2])
			
 
				+            # BS model expects: [b, f, t, 2] -> rearrange to [b, t, (f * 2)]
			
 
				+            # Wait, bs_roformer.py: x = rearrange(x, 'b f t c -> b t (f c)')
			
 
				+            x = rearrange(x, "b f t c -> b t (f c)")
			
 
				 
			
 
				         # ===== CAPTURE: BandSplit Input =====
			
 
				         captured["band_split_in"] = x.clone()
			
@@ -304,6 +339,10 @@ def generate_test_data(
 
				             x = freq_transformer(x)
			
 
				             (x,) = unpack(x, ps, "* f d")
			
 
				 
			
 
				+        # BS Roformer: Apply global final_norm after all transformer layers
			
 
				+        if model_type == "bs" and hasattr(model, "final_norm"):
			
 
				+            x = model.final_norm(x)
			
 
				+
			
 
				         # ===== CAPTURE: Before Mask Estimator (= Transformer Output) =====
			
 
				         captured["before_mask_est"] = x.clone()
			
 
				 
			
@@ -325,27 +364,52 @@ def generate_test_data(
 
				 
			
 
				         from einops import repeat
			
 
				 
			
 
				-        scatter_indices = repeat(
			
 
				-            model.freq_indices,
			
 
				-            "f -> b n f t",
			
 
				-            b=batch,
			
 
				-            n=num_stems,
			
 
				-            t=stft_repr.shape[-1],
			
 
				-        )
			
 
				-        stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=num_stems)
			
 
				-        masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(
			
 
				-            2, scatter_indices, masks
			
 
				-        )
			
 
				+        if model_type == "mel_band":
			
 
				+            scatter_indices = repeat(
			
 
				+                model.freq_indices,
			
 
				+                "f -> b n f t",
			
 
				+                b=batch,
			
 
				+                n=num_stems,
			
 
				+                t=stft_repr.shape[-1],
			
 
				+            )
			
 
				+            stft_repr_expanded_stems = repeat(
			
 
				+                stft_repr, "b 1 ... -> b n ...", n=num_stems
			
 
				+            )
			
 
				+            masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(
			
 
				+                2, scatter_indices, masks
			
 
				+            )
			
 
				+
			
 
				+            denom = repeat(model.num_bands_per_freq, "f -> (f r) 1", r=channels)
			
 
				+            masks_averaged = masks_summed / denom.clamp(min=1e-8)
			
 
				+
			
 
				+            stft_repr = stft_repr * masks_averaged
			
 
				 
			
 
				-        denom = repeat(model.num_bands_per_freq, "f -> (f r) 1", r=channels)
			
 
				-        masks_averaged = masks_summed / denom.clamp(min=1e-8)
			
 
				+        else:
			
 
				+            # BS Roformer: Direct mask application
			
 
				+            # masks shape: [b, n, f, t, c] (rearranged above)
			
 
				+            # stft_repr shape: [b, 1, f, t, c] (rearranged above)
			
 
				 
			
 
				-        stft_repr = stft_repr * masks_averaged
			
 
				+            # BS model output masks are often [b, n, f, t] (complex/real?)
			
 
				+            # Wait, bs_roformer.py:
			
 
				+            # masks = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
			
 
				+            # masks = rearrange(masks, 'b n t (f c) -> b n f t c', c = 2)
			
 
				+            # x = x * masks.sum(dim=1) # summation over stems? No, output separate stems.
			
 
				+            # return x * masks
			
 
				+
			
 
				+            # So here: stft_repr * masks is correct.
			
 
				+            stft_repr = stft_repr * masks
			
 
				 
			
 
				         # ISTFT
			
 
				-        stft_repr = rearrange(
			
 
				-            stft_repr, "b n (f s) t -> (b n s) f t", s=model.audio_channels
			
 
				-        )
			
 
				+        if model_type == "mel_band":
			
 
				+            stft_repr = rearrange(
			
 
				+                stft_repr, "b n (f s) t -> (b n s) f t", s=model.audio_channels
			
 
				+            )
			
 
				+        else:
			
 
				+            # BS Roformer: stft_repr is [b, n, (Freq*Stereo), t] (complex)
			
 
				+            # Unpack stereo and flatten batch/stems/stereo for istft
			
 
				+            stft_repr = rearrange(
			
 
				+                stft_repr, "b n (f s) t -> (b n s) f t", s=model.audio_channels
			
 
				+            )
			
 
				 
			
 
				         if getattr(model, "zero_dc", False):
			
 
				             # Zero out DC component
			
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -46,54 +46,82 @@ void MelBandRoformer::LoadWeights(const std::string& path) {
 
				         throw std::runtime_error("Failed to load GGUF file: " + path);
			
 
				     }
			
 
				 
			
 
				-    // 1. Read Hyperparameters
			
 
				-    int kv_idx;
			
 
				+    // 1. Read Architecture first to determine key prefix
			
 
				+    int kv_idx = gguf_find_key(ctx_gguf, "general.architecture");
			
 
				+    if (kv_idx >= 0) {
			
 
				+        architecture_ = gguf_get_val_str(ctx_gguf, kv_idx);
			
 
				+    } else {
			
 
				+        throw std::runtime_error("Key 'general.architecture' not found in GGUF file. Please re-convert the model with the latest script.");
			
 
				+    }
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_n_fft");
			
 
				+    std::cout << "Architecture: " << architecture_ << std::endl;
			
 
				+    
			
 
				+    // Normalization for legacy models (if any) or simplified internal handling
			
 
				+    if (architecture_ == "bs") architecture_ = "bs_roformer";
			
 
				+    if (architecture_ == "mel_band") architecture_ = "mel_band_roformer";
			
 
				+
			
 
				+    std::string kp = architecture_ + "."; // key prefix, e.g. "bs_roformer." or "mel_band_roformer."
			
 
				+
			
 
				+    // Set internal flags based on architecture
			
 
				+    if (architecture_ == "bs_roformer") {
			
 
				+        has_final_norm_ = true;
			
 
				+        transformer_norm_output_ = false;
			
 
				+    } else {
			
 
				+        // mel_band_roformer
			
 
				+        has_final_norm_ = false;
			
 
				+        transformer_norm_output_ = true;
			
 
				+    }
			
 
				+
			
 
				+    // 2. Read Hyperparameters using key prefix
			
 
				+    
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "stft_n_fft").c_str());
			
 
				     if (kv_idx >= 0) n_fft_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_hop_length");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "stft_hop_length").c_str());
			
 
				     if (kv_idx >= 0) hop_length_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_win_length");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "stft_win_length").c_str());
			
 
				     if (kv_idx >= 0) win_length_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				 
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.dim");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "dim").c_str());
			
 
				     if (kv_idx >= 0) dim_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				 
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.num_bands");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "num_bands").c_str());
			
 
				     if (kv_idx >= 0) num_bands_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.depth");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "depth").c_str());
			
 
				     if (kv_idx >= 0) depth_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				 
			
 
				     // New Parameters
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.num_stems");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "num_stems").c_str());
			
 
				     if (kv_idx >= 0) num_stems_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.skip_connection");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "skip_connection").c_str());
			
 
				     if (kv_idx >= 0) skip_connection_ = gguf_get_val_bool(ctx_gguf, kv_idx);
			
 
				 
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_normalized");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "stft_normalized").c_str());
			
 
				     if (kv_idx >= 0) stft_normalized_ = gguf_get_val_bool(ctx_gguf, kv_idx);
			
 
				 
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.zero_dc");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "zero_dc").c_str());
			
 
				     if (kv_idx >= 0) zero_dc_ = gguf_get_val_bool(ctx_gguf, kv_idx);
			
 
				 
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.mask_estimator_depth");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "mask_estimator_depth").c_str());
			
 
				     if (kv_idx >= 0) mask_estimator_depth_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				 
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.sample_rate");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "mlp_expansion_factor").c_str());
			
 
				+    if (kv_idx >= 0) mlp_expansion_factor_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				+
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "sample_rate").c_str());
			
 
				     if (kv_idx >= 0) sample_rate_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				     // Inference defaults (optional, fallback to hardcoded values)
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.default_chunk_size");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "default_chunk_size").c_str());
			
 
				     if (kv_idx >= 0) default_chunk_size_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.default_num_overlap");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "default_num_overlap").c_str());
			
 
				     if (kv_idx >= 0) default_num_overlap_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				     
			
 
				-    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.linear_transformer_depth");
			
 
				+    kv_idx = gguf_find_key(ctx_gguf, (kp + "linear_transformer_depth").c_str());
			
 
				     if (kv_idx >= 0) {
			
 
				         int lin_depth = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
			
 
				         if (lin_depth > 0) {
			
@@ -108,7 +136,6 @@ void MelBandRoformer::LoadWeights(const std::string& path) {
 
				     std::cout << "Inference Defaults: chunk_size=" << default_chunk_size_ 
			
 
				               << ", num_overlap=" << default_num_overlap_ << std::endl;
			
 
				 
			
 
				-
			
 
				     // 2. Allocate backend buffer for ALL tensors
			
 
				     buffer_weights_ = ggml_backend_alloc_ctx_tensors_from_buft(
			
 
				         ctx_weights_, 
			
@@ -171,6 +198,19 @@ void MelBandRoformer::LoadWeights(const std::string& path) {
 
				     int n_tensors = gguf_get_n_tensors(ctx_gguf);
			
 
				     std::cout << "Loaded " << n_tensors << " tensors" << std::endl;
			
 
				     
			
 
				+    // Dynamic MLP detection
			
 
				+    // Try to find mask_est.0.freq.0.mlp.{N}.weight
			
 
				+    mlp_num_layers_ = 0;
			
 
				+    for (int idx = 0; idx <= 20; idx += 2) {  // Check indices 0, 2, 4... up to 10 layers
			
 
				+        std::string probe = "mask_est.0.freq.0.mlp." + std::to_string(idx) + ".weight";
			
 
				+        if (GetWeight(probe) != nullptr) {
			
 
				+            mlp_num_layers_++;
			
 
				+        } else {
			
 
				+            break;
			
 
				+        }
			
 
				+    }
			
 
				+    std::cout << "Detected MLP layers: " << mlp_num_layers_ << std::endl;
			
 
				+
			
 
				     gguf_free(ctx_gguf);
			
 
				 }
			
 
				 
			
@@ -188,6 +228,12 @@ std::vector<int> MelBandRoformer::GetDimInputs() const {
 
				 }
			
 
				 
			
 
				 int MelBandRoformer::GetTotalDimInput() const {
			
 
				+    if (architecture_ == "bs") {
			
 
				+        // BS: All frequencies * stereo * complex
			
 
				+        int n_freq = n_fft_ / 2 + 1;
			
 
				+        return n_freq * 2 * 2;  // freq * stereo * complex
			
 
				+    }
			
 
				+
			
 
				     int total = 0;
			
 
				     for (int i = 0; i < num_bands_; ++i) {
			
 
				         total += num_freqs_per_band_[i] * 4;
			
@@ -432,13 +478,15 @@ ggml_tensor* MelBandRoformer::BuildTransformersGraph(
 
				 
			
 
				         
			
 
				         // Time Transformer Final Norm
			
 
				-        // blk.{l}.time_norm.weight
			
 
				-        std::string time_norm_name = "blk." + std::to_string(layer) + ".time_norm.weight";
			
 
				-        ggml_tensor* time_norm_w = GetWeight(time_norm_name);
			
 
				-        if (!time_norm_w) { std::cerr << "Missing: " << time_norm_name << "\n"; return nullptr; }
			
 
				-        
			
 
				-        x_packed = ggml_rms_norm(ctx, x_packed, 1e-12f);
			
 
				-        x_packed = ggml_mul(ctx, x_packed, time_norm_w);
			
 
				+        // Only if transformer_norm_output_ is true (MelBand)
			
 
				+        if (transformer_norm_output_) {
			
 
				+            std::string time_norm_name = "blk." + std::to_string(layer) + ".time_norm.weight";
			
 
				+            ggml_tensor* time_norm_w = GetWeight(time_norm_name);
			
 
				+            if (!time_norm_w) { std::cerr << "Missing: " << time_norm_name << "\n"; return nullptr; }
			
 
				+            
			
 
				+            x_packed = ggml_rms_norm(ctx, x_packed, 1e-12f);
			
 
				+            x_packed = ggml_mul(ctx, x_packed, time_norm_w);
			
 
				+        }
			
 
				         
			
 
				         x = ggml_reshape_4d(ctx, x_packed, D, T, F, B);
			
 
				         x = ggml_permute(ctx, x, 0, 2, 1, 3);
			
@@ -597,13 +645,15 @@ ggml_tensor* MelBandRoformer::BuildTransformersGraph(
 
				         x_freq_packed = ggml_add(ctx, f_x_resid1, f_ff_block_out);
			
 
				         
			
 
				         // Freq Transformer Final Norm
			
 
				-        // blk.{l}.freq_norm.weight
			
 
				-        std::string freq_norm_name = "blk." + std::to_string(layer) + ".freq_norm.weight";
			
 
				-        ggml_tensor* freq_norm_w = GetWeight(freq_norm_name);
			
 
				-        if (!freq_norm_w) { std::cerr << "Missing: " << freq_norm_name << "\n"; return nullptr; }
			
 
				-        
			
 
				-        x_freq_packed = ggml_rms_norm(ctx, x_freq_packed, 1e-12f);
			
 
				-        x_freq_packed = ggml_mul(ctx, x_freq_packed, freq_norm_w);
			
 
				+        // Only if transformer_norm_output_ is true (MelBand)
			
 
				+        if (transformer_norm_output_) {
			
 
				+            std::string freq_norm_name = "blk." + std::to_string(layer) + ".freq_norm.weight";
			
 
				+            ggml_tensor* freq_norm_w = GetWeight(freq_norm_name);
			
 
				+            if (!freq_norm_w) { std::cerr << "Missing: " << freq_norm_name << "\n"; return nullptr; }
			
 
				+            
			
 
				+            x_freq_packed = ggml_rms_norm(ctx, x_freq_packed, 1e-12f);
			
 
				+            x_freq_packed = ggml_mul(ctx, x_freq_packed, freq_norm_w);
			
 
				+        }
			
 
				         
			
 
				         x = ggml_reshape_4d(ctx, x_freq_packed, D, F, T, B);
			
 
				         
			
@@ -612,6 +662,14 @@ ggml_tensor* MelBandRoformer::BuildTransformersGraph(
 
				         }
			
 
				     }
			
 
				     
			
 
				+    // Global Final Norm (BS Roformer only)
			
 
				+    if (has_final_norm_) {
			
 
				+        ggml_tensor* final_norm_w = GetWeight("final_norm.weight");
			
 
				+        if (!final_norm_w) { std::cerr << "Missing: final_norm.weight\n"; return nullptr; }
			
 
				+        x = ggml_rms_norm(ctx, x, 1e-12f);
			
 
				+        x = ggml_mul(ctx, x, final_norm_w);
			
 
				+    }
			
 
				+    
			
 
				     return x;
			
 
				 }
			
 
				 
			
@@ -631,19 +689,22 @@ ggml_tensor* MelBandRoformer::BuildMaskEstimatorGraph(
 
				     const int NUM_STEMS = num_stems_;
			
 
				     
			
 
				     // Calculate band_out_dims from mask_est.0.freq.{b}.mlp.4.weight shape
			
 
				+    // Calculate band_out_dims from last MLP weight
			
 
				     std::vector<int> band_out_dims(NUM_BANDS);
			
 
				     int total_out_dim = 0;
			
 
				+    
			
 
				+    // Last MLP layer index is (mlp_num_layers_ - 1) * 2
			
 
				+    int last_mlp_idx = (mlp_num_layers_ - 1) * 2;
			
 
				 
			
 
				     for (int b = 0; b < NUM_BANDS; ++b) {
			
 
				-        // mask_est.0.freq.{b}.mlp.4.weight
			
 
				-        // Assuming all stems have same architecture, check stem 0
			
 
				-        std::string w4_name = "mask_est.0.freq." + std::to_string(b) + ".mlp.4.weight";
			
 
				-        ggml_tensor* w4 = GetWeight(w4_name);
			
 
				-        if (!w4) {
			
 
				-            std::cerr << "Missing weight for dim check: " << w4_name << std::endl;
			
 
				+        // mask_est.0.freq.{b}.mlp.{last}.weight
			
 
				+        std::string w_last_name = "mask_est.0.freq." + std::to_string(b) + ".mlp." + std::to_string(last_mlp_idx) + ".weight";
			
 
				+        ggml_tensor* w_last = GetWeight(w_last_name);
			
 
				+        if (!w_last) {
			
 
				+            std::cerr << "Missing weight for dim check: " << w_last_name << std::endl;
			
 
				             return nullptr;
			
 
				         }
			
 
				-        band_out_dims[b] = static_cast<int>(w4->ne[1]) / 2;  // GLU halves the dimension
			
 
				+        band_out_dims[b] = static_cast<int>(w_last->ne[1]) / 2;  // GLU halves the dimension
			
 
				         total_out_dim += band_out_dims[b];
			
 
				     }
			
 
				     
			
@@ -671,38 +732,41 @@ ggml_tensor* MelBandRoformer::BuildMaskEstimatorGraph(
 
				             std::string prefix = "mask_est." + std::to_string(s) + ".freq." + std::to_string(b) + ".mlp.";
			
 
				             
			
 
				             // MLP Layer 0
			
 
				-            ggml_tensor* w0 = GetWeight(prefix + "0.weight");
			
 
				-            ggml_tensor* bias0 = GetWeight(prefix + "0.bias");
			
 
				-            if (!w0 || !bias0) { std::cerr << "Missing mask weights s=" << s << " b=" << b << "\n"; return nullptr; }
			
 
				-            
			
 
				-            ggml_tensor* layer0 = ggml_mul_mat(ctx, w0, band_in);
			
 
				-            layer0 = ggml_add(ctx, layer0, bias0);
			
 
				-            layer0 = ggml_tanh(ctx, layer0);
			
 
				-            
			
 
				-            // MLP Layer 2
			
 
				-            ggml_tensor* w2 = GetWeight(prefix + "2.weight");
			
 
				-            ggml_tensor* bias2 = GetWeight(prefix + "2.bias");
			
 
				-            
			
 
				-            ggml_tensor* layer2 = ggml_mul_mat(ctx, w2, layer0);
			
 
				-            layer2 = ggml_add(ctx, layer2, bias2);
			
 
				-            layer2 = ggml_tanh(ctx, layer2);
			
 
				-            
			
 
				-            // MLP Layer 4
			
 
				-            ggml_tensor* w4 = GetWeight(prefix + "4.weight");
			
 
				-            ggml_tensor* bias4 = GetWeight(prefix + "4.bias");
			
 
				-            
			
 
				-            ggml_tensor* mlp_out = ggml_mul_mat(ctx, w4, layer2);
			
 
				-            mlp_out = ggml_add(ctx, mlp_out, bias4);
			
 
				+            // Dynamic MLP Construction
			
 
				+            ggml_tensor* mlp_current = band_in;
			
 
				+
			
 
				+            for (int layer_idx = 0; layer_idx < mlp_num_layers_; ++layer_idx) {
			
 
				+                int seq_idx = layer_idx * 2; // 0, 2, 4...
			
 
				+                
			
 
				+                std::string w_name = prefix + std::to_string(seq_idx) + ".weight";
			
 
				+                std::string b_name = prefix + std::to_string(seq_idx) + ".bias";
			
 
				+                
			
 
				+                ggml_tensor* w = GetWeight(w_name);
			
 
				+                ggml_tensor* b = GetWeight(b_name);
			
 
				+                
			
 
				+                if (!w || !b) {
			
 
				+                    std::cerr << "Missing mask weights s=" << s << " b=" << b << " l=" << seq_idx << "\n";
			
 
				+                    return nullptr;
			
 
				+                }
			
 
				+                
			
 
				+                mlp_current = ggml_mul_mat(ctx, w, mlp_current);
			
 
				+                mlp_current = ggml_add(ctx, mlp_current, b);
			
 
				+                
			
 
				+                // Activation (Tanh) for all but last layer
			
 
				+                if (layer_idx < mlp_num_layers_ - 1) {
			
 
				+                    mlp_current = ggml_tanh(ctx, mlp_current);
			
 
				+                }
			
 
				+            }
			
 
				             
			
 
				             // GLU
			
 
				             int dim_out = band_out_dims[b];
			
 
				             
			
 
				-            ggml_tensor* glu_a = ggml_view_3d(ctx, mlp_out,
			
 
				+            ggml_tensor* glu_a = ggml_view_3d(ctx, mlp_current,
			
 
				                                               dim_out, n_frames, batch,
			
 
				-                                              mlp_out->nb[1], mlp_out->nb[2], 0);
			
 
				-            ggml_tensor* glu_b = ggml_view_3d(ctx, mlp_out,
			
 
				+                                              mlp_current->nb[1], mlp_current->nb[2], 0);
			
 
				+            ggml_tensor* glu_b = ggml_view_3d(ctx, mlp_current,
			
 
				                                               dim_out, n_frames, batch,
			
 
				-                                              mlp_out->nb[1], mlp_out->nb[2],
			
 
				+                                              mlp_current->nb[1], mlp_current->nb[2],
			
 
				                                               dim_out * sizeof(float));
			
 
				             
			
 
				             glu_a = ggml_cont(ctx, glu_a);
			
--- a/src/model.h
+++ b/src/model.h
@@ -52,6 +52,12 @@ public:
 
				     bool GetSTFTNormalized() const { return stft_normalized_; }
			
 
				     int GetZeroDC() const { return zero_dc_; }
			
 
				     int GetSampleRate() const { return sample_rate_; }
			
 
				+    int GetMlpExpansionFactor() const { return mlp_expansion_factor_; }
			
 
				+    
			
 
				+    // BS Roformer Support
			
 
				+    const std::string& GetArchitecture() const { return architecture_; }
			
 
				+    bool HasFinalNorm() const { return has_final_norm_; }
			
 
				+    bool GetTransformerNormOutput() const { return transformer_norm_output_; }
			
 
				     
			
 
				     // Inference defaults (from GGUF, can be overridden at runtime)
			
 
				     int GetDefaultChunkSize() const { return default_chunk_size_; }
			
@@ -143,6 +149,13 @@ private:
 
				     bool stft_normalized_ = false;
			
 
				     bool zero_dc_ = false;
			
 
				     int mask_estimator_depth_ = 1;
			
 
				+    int mlp_expansion_factor_ = 4;
			
 
				+    
			
 
				+    // BS Roformer Specific
			
 
				+    std::string architecture_ = "mel_band";  // "mel_band" or "bs"
			
 
				+    bool has_final_norm_ = false;            // BS has a global final norm
			
 
				+    bool transformer_norm_output_ = true;    // MelBand=true, BS=false
			
 
				+    int mlp_num_layers_ = 3;                 // Detected from weights (BS=2 for depth=2)
			
 
				     int sample_rate_ = 44100;
			
 
				 
			
 
				     // Inference defaults