1 ay önce · 63b9b68da9
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,7 @@
 
															 build*
														
 
															-tests/data
														
 
															+tests/data
														
 
															+testdata/*
														
 
															+rel
														
 
															+rel/*
														
 
															+ggml/*
														
 
															+libav/*
														
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,6 @@
 
															+[submodule "libav"]
														
 
															+	path = libav
														
 
															+	url = https://github.com/libav/libav
														
 
															+[submodule "ggml"]
														
 
															+	path = ggml
														
 
															+	url = https://github.com/ggml-org/ggml/
														
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,29 +8,30 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
															 # Build Options
														
 
															 #================================================
														
 
															-option(GGML_CUDA "Enable CUDA backend" ON)
														
 
															-option(BSR_BUILD_TESTS "Build tests" OFF)
														
 
															-option(BSR_BUILD_CLI "Build CLI application" ON)
														
 
															+option(BSR_BUILD_CLI "Build CLI application" ON)
														
 
															+option(BSR_BUILD_TESTS "Build tests" OFF)
														
 
															+option(GGML_CUDA "Enable CUDA backend" OFF) # cuda
														
 
															+option(GGML_CUDA_FA "CUDA flash attention" ON)  # leave it on
														
 
															+option(GGML_CUDA_NO_VMM "Disable CUDA VMM" ON)  # not needed
														
 
															+
														
 
															+option(GGML_HIP "Enable HIP (ROCm) backend" OFF)   # rocm
														
 
															+option(GGML_HIP_RCCL "HIP RCCL support" OFF) # not needed
														
 
															+option(GGML_HIP_GRAPH "HIP graph support" OFF) # definitely not needed
														
 
															+option(GGML_HIP_ROCWMMA_FATTN "rocWMMA flash attention" ON) # turn it off if you have ROCm flash attention issues
														
 
															+option(GGML_HIP_NO_VMM "Disable HIP VMM" ON) # VMM breaks support on most cards right now; leave it on
														
 
															+
														
 
															+
														
 
															+
														
 
															+option(GGML_CUDA_FA_ALL_QUANT "CUDA flash attention for all quant types" ON) # leave it on. NOTE(review): upstream ggml appears to spell this GGML_CUDA_FA_ALL_QUANTS - confirm
														
 
															 #================================================
														
 
															 # Dependencies - GGML (Flexible Resolution)
														
 
															 #================================================
														
 
															-# Strategy: Allow ggml to be shared across multiple projects
														
 
															-# 1. Check if ggml target already exists (e.g., from parent project like whisper.cpp)
														
 
															-# 2. If not, try to find ggml via CMAKE_PREFIX_PATH or GGML_DIR
														
 
															-# 3. If not found, use local ggml (submodule or sibling directory)
														
 
															-
														
 
															 if(NOT TARGET ggml)
														
 
															-    # Try to find ggml package first (for system-wide or parent project installation)
														
 
															     find_package(ggml QUIET CONFIG)
														
 
															     if(NOT ggml_FOUND)
														
 
															-        # ggml not found as package, look for source directory
														
 
															-        # Priority 1: GGML_DIR variable (explicitly set by user or parent project)
														
 
															-        # Priority 2: Submodule in ggml/
														
 
															-        # Priority 3: Sibling directory ../ggml
														
 
															-        
														
 
															         if(DEFINED GGML_DIR)
														
 
															             set(GGML_PATH "${GGML_DIR}")
														
 
															             message(STATUS "Using GGML from GGML_DIR: ${GGML_PATH}")
														
@@ -50,7 +51,6 @@ if(NOT TARGET ggml)
 
															             )
														
 
															         endif()
														
 
															-        # Add ggml as subdirectory
														
 
															         add_subdirectory(${GGML_PATH} ggml EXCLUDE_FROM_ALL)
														
 
															     else()
														
 
															         message(STATUS "Using GGML from installed package")
														
@@ -81,8 +81,6 @@ target_include_directories(bs_roformer PRIVATE
 
															 target_link_libraries(bs_roformer PUBLIC ggml)
														
 
															 if(GGML_CUDA AND TARGET ggml-cuda)
														
 
															-    # Fix for CI: Link against CUDA stubs if the driver is not present
														
 
															-    # This prevents errors like "libcuda.so.1 needed by ... not found" during linking
														
 
															     find_package(CUDAToolkit REQUIRED)
														
 
															     if(TARGET CUDA::cuda_driver)
														
 
															         target_link_libraries(bs_roformer PUBLIC CUDA::cuda_driver)
														
@@ -90,12 +88,7 @@ if(GGML_CUDA AND TARGET ggml-cuda)
 
															     endif()
														
 
															 endif()
														
 
															-# Compiler options
														
 
															-if(MSVC)
														
 
															-    target_compile_options(bs_roformer PRIVATE /W3 /utf-8)
														
 
															-else()
														
 
															-    target_compile_options(bs_roformer PRIVATE -Wall -Wextra)
														
 
															-endif()
														
 
															+target_compile_options(bs_roformer PRIVATE -Wall -Wextra)
														
 
															 # OpenMP support
														
 
															 find_package(OpenMP)
														
@@ -157,7 +150,6 @@ endfunction()
 
															 #================================================
														
 
															 if(BSR_BUILD_CLI)
														
 
															-    # audio.cpp implements AudioFile utilities (using dr_wav)
														
 
															     add_executable(bs_roformer-cli 
														
 
															         cli/main.cpp 
														
 
															         src/audio.cpp
														
@@ -166,11 +158,17 @@ if(BSR_BUILD_CLI)
 
															     target_include_directories(bs_roformer-cli PRIVATE 
														
 
															         src 
														
 
															         third_party
														
 
															+        ${CMAKE_SOURCE_DIR}/libav
														
 
															     )
														
 
															-    if(MSVC)
														
 
															-        target_compile_options(bs_roformer-cli PRIVATE /W3 /utf-8)
														
 
															-    endif()
														
 
															+    # Link against libav libraries from submodule and system dependencies
														
 
															+    target_link_libraries(bs_roformer-cli PRIVATE
														
 
															+        ${CMAKE_SOURCE_DIR}/libav/libavformat/libavformat.a
														
 
															+        ${CMAKE_SOURCE_DIR}/libav/libavcodec/libavcodec.a
														
 
															+        ${CMAKE_SOURCE_DIR}/libav/libavresample/libavresample.a
														
 
															+        ${CMAKE_SOURCE_DIR}/libav/libavutil/libavutil.a
														
 
															+        z bz2
														
 
															+    )
														
 
															     bsr_copy_ggml_runtime_dlls(bs_roformer-cli)
														
 
															 endif()
														
@@ -185,4 +183,4 @@ if(BSR_BUILD_TESTS)
 
															     message(STATUS "Tests: ENABLED")
														
 
															 else()
														
 
															     message(STATUS "Tests: DISABLED (use -DBSR_BUILD_TESTS=ON to enable)")
														
 
															-endif()
														
 
															+endif()
														
--- a/cli/main.cpp
+++ b/cli/main.cpp
@@ -6,7 +6,10 @@
 
															 #include <cstdlib>
														
 
															 void print_usage(const char* program_name) {
														
 
															-    std::cerr << "Usage: " << program_name << " <model.gguf> <input.wav> <output.wav> [options]" << std::endl;
														
 
															+    std::cerr << "Usage: " << program_name << " <model.gguf> <input_audio> <output.wav> [options]" << std::endl;
														
 
															+    std::cerr << std::endl;
														
 
															+    std::cerr << "Input audio can be any common format (WAV, MP3, FLAC, OGG, etc.)" << std::endl;
														
 
															+    std::cerr << "Audio is automatically resampled to 44100 Hz if needed." << std::endl;
														
 
															     std::cerr << std::endl;
														
 
															     std::cerr << "Options:" << std::endl;
														
 
															     std::cerr << "  --chunk-size <N>   Chunk size in samples (default: from model, fallback 352800)" << std::endl;
														
@@ -94,31 +97,8 @@ int main(int argc, char* argv[]) {
 
															                   << input_audio.channels << " channels, " 
														
 
															                   << input_audio.sampleRate << " Hz" << std::endl;
														
 
															-        // 1. Check Sample Rate
														
 
															-        int required_sr = engine.GetSampleRate();
														
 
															-        std::cout << "Model expects sample rate: " << required_sr << " Hz" << std::endl;
														
 
															-
														
 
															-        if (input_audio.sampleRate != required_sr) {
														
 
															-            throw std::runtime_error("Input audio sample rate must be " + std::to_string(required_sr) + 
														
 
															-                                     " Hz. Current: " + std::to_string(input_audio.sampleRate));
														
 
															-        }
														
 
															-
														
 
															-        // 2. Check Channels & Auto-Expand Mono
														
 
															-        if (input_audio.channels == 1) {
														
 
															-             std::cout << "[Info] Input is Mono. Expanding to Stereo..." << std::endl;
														
 
															-             std::vector<float> stereo_data(input_audio.samples * 2);
														
 
															-             for(size_t i=0; i<input_audio.samples; ++i) {
														
 
															-                 stereo_data[i*2 + 0] = input_audio.data[i];
														
 
															-                 stereo_data[i*2 + 1] = input_audio.data[i];
														
 
															-             }
														
 
															-             input_audio.data = std::move(stereo_data);
														
 
															-             input_audio.channels = 2;
														
 
															-             input_audio.samples *= 2;
														
 
															-        } else if (input_audio.channels != 2) {
														
 
															-             // We can either reject or try to process first 2 channels? 
														
 
															-             // Ideally reject to be safer, or warn.
														
 
															-             throw std::runtime_error("Input audio must be Stereo (2 channels) or Mono (1 channel). Current: " + std::to_string(input_audio.channels));
														
 
															-        }
														
 
															+        // AudioFile::Load automatically resamples to 44100 Hz and converts to stereo
														
 
															+        // No need for manual sample rate check or mono expansion
														
 
															         std::cout << "Processing with chunk_size=" << chunk_size 
														
 
															                   << ", overlap=" << num_overlap << std::endl;
														
@@ -167,7 +147,7 @@ int main(int argc, char* argv[]) {
 
															             AudioBuffer output_audio_buf;
														
 
															             output_audio_buf.data = std::move(output_stems[i]); // Move to avoid copy
														
 
															             output_audio_buf.channels = 2; // Output is always stereo
														
 
															-            output_audio_buf.sampleRate = required_sr;
														
 
															+            output_audio_buf.sampleRate = 44100;
														
 
															             output_audio_buf.samples = output_audio_buf.data.size();
														
 
															             std::cout << "Saving output stem " << i << ": " << current_output_path << std::endl;
														
--- a/ggml
+++ b/ggml
@@ -0,0 +1 @@
 
															+Subproject commit 57ea0bc119d722d74594196cc5b494a34dd87be4
														
--- a/include/bs_roformer/audio.h
+++ b/include/bs_roformer/audio.h
@@ -16,23 +16,26 @@ struct AudioBuffer {
 
															 /**
														
 
															  * Audio file I/O utilities.
														
 
															- * Supports WAV format (via dr_wav).
														
 
															+ * Supports any common audio format (WAV, MP3, FLAC, OGG, etc.) via FFmpeg/libav.
														
 
															+ * Automatically resamples to target sample rate if needed.
														
 
															  */
														
 
															 class AudioFile {
														
 
															 public:
														
 
															     /**
														
 
															-     * Load audio from a WAV file.
														
 
															-     * @param path Path to the WAV file
														
 
															+     * Load audio from any common audio file format.
														
 
															+     * Audio is automatically resampled to target_sample_rate (44100 Hz by default) and converted to stereo float32.
														
 
															+     * @param path Path to the audio file
														
 
															+     * @param target_sample_rate Target sample rate (default: 44100 Hz)
														
 
															      * @return AudioBuffer containing the loaded audio data
														
 
															-     * @throws std::runtime_error if the file cannot be opened
														
 
															+     * @throws std::runtime_error if the file cannot be opened or decoded
														
 
															      */
														
 
															-    static AudioBuffer Load(const std::string& path);
														
 
															+    static AudioBuffer Load(const std::string& path, int target_sample_rate = 44100);
														
 
															     /**
														
 
															-     * Save audio to a WAV file.
														
 
															+     * Save audio to a WAV file (PCM float32).
														
 
															      * @param path Path to save the WAV file
														
 
															      * @param buffer AudioBuffer containing audio data to save
														
 
															      * @throws std::runtime_error if the file cannot be written
														
 
															      */
														
 
															     static void Save(const std::string& path, const AudioBuffer& buffer);
														
 
															-};
														
 
															+};
														
--- a/include/bs_roformer/inference.h
+++ b/include/bs_roformer/inference.h
@@ -1,14 +1,17 @@
 
															 #pragma once
														
 
															-
														
 
															-#include <vector>
														
 
															-#include <string>
														
 
															-#include <memory>
														
 
															+#include <cstdint>
														
 
															 #include <functional>
														
 
															+#include <memory>
														
 
															+#include <string>
														
 
															+#include <vector>
														
 
															 // Forward declaration
														
 
															 class BSRoformer;
														
 
															 // Forward declaration
														
 
															-namespace ggml { struct context; struct cgraph; }
														
 
															+namespace ggml {
														
 
															+struct context;
														
 
															+struct cgraph;
														
 
															+}
														
 
															 class Inference {
														
 
															 public:
														
@@ -21,11 +24,11 @@ public:
 
															     // Uses overlap-add chunking to handle long files
														
 
															     // Process a full audio track (interleaved stereo float32)
														
 
															     // Returns a vector of stems, where each stem is an interleaved stereo float vector
														
 
															-    std::vector<std::vector<float>> Process(const std::vector<float>& input_audio, 
														
 
															-                               int chunk_size = 352800, 
														
 
															-                               int num_overlap = 2,
														
 
															-                               std::function<void(float)> progress_callback = nullptr,
														
 
															-                               CancelCallback cancel_callback = nullptr);
														
 
															+    std::vector<std::vector<float>> Process(const std::vector<float>& input_audio,
														
 
															+        int chunk_size = 352800,
														
 
															+        int num_overlap = 2,
														
 
															+        std::function<void(float)> progress_callback = nullptr,
														
 
															+        CancelCallback cancel_callback = nullptr);
														
 
															     // Low-level chunk processing (public for testing)
														
 
															     std::vector<std::vector<float>> ProcessChunk(const std::vector<float>& chunk_audio);
														
@@ -39,29 +42,29 @@ public:
 
															     // Static helper for Overlap-Add logic (matches Python exactly)
														
 
															     // model_func: input [samples], output [stems][samples] (interleaved stereo)
														
 
															     using ModelCallback = std::function<std::vector<std::vector<float>>(const std::vector<float>&)>;
														
 
															-    static std::vector<std::vector<float>> ProcessOverlapAdd(const std::vector<float>& input_audio, 
														
 
															-                                                int chunk_size, 
														
 
															-                                                int num_overlap,
														
 
															-                                                ModelCallback model_func,
														
 
															-                                                std::function<void(float)> progress_callback = nullptr,
														
 
															-                                                CancelCallback cancel_callback = nullptr);
														
 
															+    static std::vector<std::vector<float>> ProcessOverlapAdd(const std::vector<float>& input_audio,
														
 
															+        int chunk_size,
														
 
															+        int num_overlap,
														
 
															+        ModelCallback model_func,
														
 
															+        std::function<void(float)> progress_callback = nullptr,
														
 
															+        CancelCallback cancel_callback = nullptr);
														
 
															 private:
														
 
															     // Pipelined Overlap-Add
														
 
															-    std::vector<std::vector<float>> ProcessOverlapAddPipelined(const std::vector<float>& input_audio, 
														
 
															-                                                  int chunk_size, 
														
 
															-                                                  int num_overlap,
														
 
															-                                                  std::function<void(float)> progress_callback,
														
 
															-                                                  CancelCallback cancel_callback);
														
 
															+    std::vector<std::vector<float>> ProcessOverlapAddPipelined(const std::vector<float>& input_audio,
														
 
															+        int chunk_size,
														
 
															+        int num_overlap,
														
 
															+        std::function<void(float)> progress_callback,
														
 
															+        CancelCallback cancel_callback);
														
 
															 private:
														
 
															     std::unique_ptr<BSRoformer> model_;
														
 
															-    
														
 
															+
														
 
															     // Persistent Graph State
														
 
															     struct ggml_context* ctx_ = nullptr;
														
 
															     struct ggml_cgraph* gf_ = nullptr;
														
 
															     struct ggml_gallocr* allocr_ = nullptr;
														
 
															-    
														
 
															+
														
 
															     // Cached Input Tensors (owned by ctx_)
														
 
															     struct ggml_tensor* input_tensor_ = nullptr;
														
 
															     struct ggml_tensor* pos_time_ = nullptr;
														
@@ -78,30 +81,30 @@ private:
 
															     // Pipelined State Data
														
 
															     struct ChunkState {
														
 
															         int id = -1;
														
 
															-        std::vector<float> input_audio;       // Original chunk audio
														
 
															-        std::vector<float> stft_flattened;    // [Prepared Input for GPU]
														
 
															+        std::vector<float> input_audio; // Original chunk audio
														
 
															+        std::vector<float> stft_flattened; // [Prepared Input for GPU]
														
 
															         std::vector<std::vector<float>> stft_outputs; // Kept for reconstruction
														
 
															         int n_frames = 0;
														
 
															-        
														
 
															-        std::vector<float> mask_output;       // Output from GPU
														
 
															-        std::vector<std::vector<float>> final_audio;       // Result after ISTFT [stems][samples]
														
 
															+
														
 
															+        std::vector<float> mask_output; // Output from GPU
														
 
															+        std::vector<std::vector<float>> final_audio; // Result after ISTFT [stems][samples]
														
 
															     };
														
 
															     // Helper to ensure graph is built for specific n_frames
														
 
															     bool EnsureGraph(int n_frames);
														
 
															     void ComputeSTFT(const std::vector<float>& input_audio,
														
 
															-                     std::vector<std::vector<float>>& stft_outputs,
														
 
															-                     int& n_frames);
														
 
															-                     
														
 
															+        std::vector<std::vector<float>>& stft_outputs,
														
 
															+        int& n_frames);
														
 
															+
														
 
															     void PrepareModelInput(const std::vector<std::vector<float>>& stft_outputs,
														
 
															-                           int n_frames,
														
 
															-                           std::vector<float>& model_input_rearranged);
														
 
															+        int n_frames,
														
 
															+        std::vector<float>& model_input_rearranged);
														
 
															     void PostProcessAndISTFT(const std::vector<float>& mask_output,
														
 
															-                             const std::vector<std::vector<float>>& stft_outputs,
														
 
															-                             int n_frames,
														
 
															-                             std::vector<std::vector<float>>& output_audio);
														
 
															+        const std::vector<std::vector<float>>& stft_outputs,
														
 
															+        int n_frames,
														
 
															+        std::vector<std::vector<float>>& output_audio);
														
 
															     // Pipeline Steps
														
 
															     std::shared_ptr<ChunkState> PreProcessChunk(const std::vector<float>& chunk_audio, int id);
														
--- a/libav
+++ b/libav
@@ -0,0 +1 @@
 
															+Subproject commit c4642788e83b0858bca449f9b6e71ddb015dfa5d
														
--- a/scripts/convert_to_gguf.py
+++ b/scripts/convert_to_gguf.py
@@ -6,37 +6,39 @@ Supports quantization: FP32, FP16, Q8_0, Q4_0, Q4_1, Q5_0, Q5_1
 
															 Mixed Quantization: Keeps Norms/Biases as FP32 to avoid CUDA alignment issues.
														
 
															 """
														
 
															-import os
														
 
															 import argparse
														
 
															-import torch
														
 
															+import os
														
 
															+
														
 
															+import gguf
														
 
															+import librosa
														
 
															 import numpy as np
														
 
															+import torch
														
 
															 import yaml
														
 
															-import librosa
														
 
															-from einops import repeat, reduce, rearrange
														
 
															-import gguf
														
 
															-from gguf.quants import quantize, GGMLQuantizationType
														
 
															+from einops import rearrange, reduce, repeat
														
 
															+from gguf.quants import GGMLQuantizationType, quantize
														
 
															+from safetensors.torch import load_file as load_safetensors
														
 
															 def detect_architecture(config_dict):
														
 
															     """
														
 
															     Detect architecture from config.
														
 
															-    Returns: 'bs_roformer' or 'mel_band_roformer'
														
 
															+    Returns: 'bs_roformer', 'bs_roformer_v2', or 'mel_band_roformer'
														
 
															     """
														
 
															-
														
 
															-    # Check structural signatures in 'model' section
														
 
															-    model_config = config_dict.get("model", {})
														
 
															+    model_config = config_dict.get("model", config_dict)
														
 
															     has_freqs = "freqs_per_bands" in model_config
														
 
															+    has_freqs_out = "freqs_per_bands_out" in model_config
														
 
															     has_num_bands = "num_bands" in model_config
														
 
															+    if has_freqs and has_freqs_out:
														
 
															+        return "bs_roformer_v2"
														
 
															     if has_freqs:
														
 
															         return "bs_roformer"
														
 
															     if has_num_bands:
														
 
															         return "mel_band_roformer"
														
 
															-
														
 
															-    # 3. If neither found, fail
														
 
															+    
														
 
															     raise ValueError(
														
 
															-        "Auto-detection failed: Config missing 'freqs_per_bands' (BS) or 'num_bands' (Mel-Band). "
														
 
															+        "Auto-detection failed: Config missing 'freqs_per_bands'/'freqs_per_bands_out' (BS_V2), 'freqs_per_bands' (BS), or 'num_bands' (Mel-Band). "
														
 
															         "Please specify --arch manually."
														
 
															     )
														
@@ -46,6 +48,7 @@ def normalize_arch(arch: str) -> str:
 
															     mapping = {
														
 
															         "bs": "bs_roformer",
														
 
															         "bs_roformer": "bs_roformer",
														
 
															+        "bs_roformer_v2": "bs_roformer_v2",
														
 
															         "mel": "mel_band_roformer",
														
 
															         "mel_band": "mel_band_roformer",
														
 
															         "mel_band_roformer": "mel_band_roformer",
														
@@ -178,7 +181,7 @@ def generate_buffers(hparams, arch="mel_band_roformer"):
 
															         hparams: Model hyperparameters
														
 
															         arch: Architecture name ('bs_roformer' or 'mel_band_roformer')
														
 
															     """
														
 
															-    if arch == "bs_roformer":
														
 
															+    if arch == "bs_roformer" or arch == "bs_roformer_v2":
														
 
															         return generate_buffers_bs(hparams)
														
 
															     # Mel-Band-Roformer Logic
														
@@ -234,9 +237,9 @@ def generate_buffers(hparams, arch="mel_band_roformer"):
 
															     }
														
 
															-# ============================================================================
														
 
															+# ============================================================================ 
														
 
															 # Quantization Helper
														
 
															-# ============================================================================
														
 
															+# ============================================================================ 
														
 
															 def get_target_quantization_type(dtype_str: str) -> GGMLQuantizationType:
														
@@ -297,9 +300,9 @@ def should_quantize(name: str) -> bool:
 
															     return False
														
 
															-# ============================================================================
														
 
															+# ============================================================================ 
														
 
															 # Key Name Mapping
														
 
															-# ============================================================================
														
 
															+# ============================================================================ 
														
 
															 def map_key_name(key: str) -> str:
														
@@ -378,9 +381,9 @@ def map_key_name(key: str) -> str:
 
															     return key.replace(".", "_")
														
 
															-# ============================================================================
														
 
															+# ============================================================================ 
														
 
															 # Main Conversion
														
 
															-# ============================================================================
														
 
															+# ============================================================================ 
														
 
															 def convert(
														
@@ -396,18 +399,21 @@ def convert(
 
															     Convert PyTorch checkpoint to GGUF format.
														
 
															     """
														
 
															     print(f"Loading checkpoint: {ckpt_path}")
														
 
															-    checkpoint = torch.load(ckpt_path, map_location="cpu")
														
 
															-
														
 
															-    if "state_dict" in checkpoint:
														
 
															-        state_dict = checkpoint["state_dict"]
														
 
															-    elif "model" in checkpoint:
														
 
															-        state_dict = checkpoint["model"]
														
 
															+    if ckpt_path.endswith(".safetensors"):
														
 
															+        state_dict = load_safetensors(ckpt_path)
														
 
															     else:
														
 
															-        state_dict = checkpoint
														
 
															+        checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=False)
														
 
															+
														
 
															+        if "state_dict" in checkpoint:
														
 
															+            state_dict = checkpoint["state_dict"]
														
 
															+        elif "model" in checkpoint:
														
 
															+            state_dict = checkpoint["model"]
														
 
															+        else:
														
 
															+            state_dict = checkpoint
														
 
															     print(f"Loading config: {config_path}")
														
 
															     with open(config_path) as f:
														
 
															-        config_dict = yaml.load(f, Loader=yaml.FullLoader)
														
 
															+        config_dict = yaml.safe_load(f)
														
 
															     # Detect architecture
														
 
															     if arch is None:
														
@@ -423,7 +429,7 @@ def convert(
 
															     # Generate buffers
														
 
															     print("Generating buffers (standalone)...")
														
 
															-    buffers = generate_buffers(config_dict["model"], arch=arch)
														
 
															+    buffers = generate_buffers(config_dict, arch=arch)
														
 
															     freq_indices = buffers["freq_indices"]
														
 
															     num_bands_per_freq = buffers["num_bands_per_freq"]
														
 
															     num_freqs_per_band = buffers["num_freqs_per_band"]
														
@@ -439,7 +445,7 @@ def convert(
 
															     print("Writing metadata...")
														
 
															     # General metadata
														
 
															-    model_name = name if name else "Mel-Band-Roformer Separator"
														
 
															+    model_name = name if name else "BSRoformer Separator"
														
 
															     model_description = description if description else "Music source separation model"
														
 
															     gguf_writer.add_name(model_name)
														
 
															     gguf_writer.add_description(model_description)
														
@@ -457,6 +463,11 @@ def convert(
 
															         freqs_tuple = buffers["freqs_per_bands_tuple"]
														
 
															         # Must be list for GGUFWriter
														
 
															         gguf_writer.add_array(f"{arch_name}.freqs_per_bands", list(freqs_tuple))
														
 
															+    
														
 
															+    if arch_name == "bs_roformer_v2":
														
 
															+        gguf_writer.add_array(f"{arch_name}.freqs_per_bands", list(config_dict["freqs_per_bands"]))
														
 
															+        gguf_writer.add_array(f"{arch_name}.freqs_per_bands_out", list(config_dict["freqs_per_bands_out"]))
														
 
															+
														
 
															     # Quantization version (required when quantized)
														
 
															     if target_qtype != GGMLQuantizationType.F32:
														
@@ -476,18 +487,12 @@ def convert(
 
															     # 2. Write Hyperparameters
														
 
															     # =========================================================================
														
 
															     print("Writing hyperparameters...")
														
 
															-    hparams = config_dict["model"]
														
 
															+    hparams = config_dict
														
 
															     # Load state dict directly (no model class dependency)
														
 
															     print(f"Loading checkpoint for architecture: {arch}")
														
 
															-    raw_state_dict = None
														
 
															-    if "state_dict" in checkpoint:
														
 
															-        raw_state_dict = checkpoint["state_dict"]
														
 
															-    elif "model" in checkpoint:
														
 
															-        raw_state_dict = checkpoint["model"]
														
 
															-    else:
														
 
															-        raw_state_dict = checkpoint
														
 
															+    raw_state_dict = state_dict
														
 
															     if raw_state_dict is None:
														
 
															         raise ValueError("Could not find state_dict in checkpoint")
														
@@ -500,10 +505,10 @@ def convert(
 
															         state_dict[k] = v
														
 
															     # Architecture specific parameters
														
 
															-    gguf_writer.add_uint32(f"{arch_name}.dim", hparams["dim"])
														
 
															-    gguf_writer.add_uint32(f"{arch_name}.depth", hparams["depth"])
														
 
															+    gguf_writer.add_uint32(f"{arch_name}.dim", hparams["hidden_size"])
														
 
															+    gguf_writer.add_uint32(f"{arch_name}.depth", hparams["num_hidden_layers"])
														
 
															     # BS uses freqs_per_bands (no explicit num_bands), MelBand uses num_bands
														
 
															-    num_bands = buffers.get("num_bands", hparams.get("num_bands", 60))
														
 
															+    num_bands = buffers.get("num_bands", len(hparams.get("freqs_per_bands", [])))
														
 
															     gguf_writer.add_uint32(f"{arch_name}.num_bands", num_bands)
														
 
															     # STFT parameters
														
@@ -519,24 +524,50 @@ def convert(
 
															         f"{arch_name}.stft_normalized", hparams.get("stft_normalized", False)
														
 
															     )
														
 
															     gguf_writer.add_bool(
														
 
															-        f"{arch_name}.zero_dc", hparams.get("zero_dc", True)
														
 
															-    )  # Defaults to True in reference implementation
														
 
															+        f"{arch_name}.zero_dc", hparams.get("zero_dc", True) # Defaults to True in reference implementation
														
 
															+    )
														
 
															     # Architecture details
														
 
															     gguf_writer.add_uint32(f"{arch_name}.num_stems", hparams.get("num_stems", 1))
														
 
															     gguf_writer.add_bool(f"{arch_name}.stereo", hparams.get("stereo", False))
														
 
															     gguf_writer.add_uint32(
														
 
															-        f"{arch_name}.sample_rate", hparams.get("sample_rate", 44100)
														
 
															+        f"{arch_name}.sample_rate", hparams.get("wave_sample_rate", 44100)
														
 
															     )
														
 
															-    gguf_writer.add_uint32(
														
 
															-        f"{arch_name}.time_transformer_depth",
														
 
															-        hparams.get("time_transformer_depth", 0),
														
 
															-    )
														
 
															-    gguf_writer.add_uint32(
														
 
															-        f"{arch_name}.freq_transformer_depth",
														
 
															-        hparams.get("freq_transformer_depth", 0),
														
 
															-    )
														
 
															+    if arch_name == "bs_roformer_v2":
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.time_transformer_depth",
														
 
															+            hparams.get("time_transformer_depth", 1),
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.freq_transformer_depth",
														
 
															+            hparams.get("freq_transformer_depth", 1),
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.num_key_value_heads", hparams.get("num_key_value_heads", 4)
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.intermediate_size", hparams.get("intermediate_size", 1152)
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.num_input_channels", hparams.get("num_input_channels", 2)
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.band_proj_size", hparams.get("band_proj_size", 256)
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.register_token_num", hparams.get("register_token_num", 4)
														
 
															+        )
														
 
															+    else:
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.time_transformer_depth",
														
 
															+            hparams.get("time_transformer_depth", 0),
														
 
															+        )
														
 
															+        gguf_writer.add_uint32(
														
 
															+            f"{arch_name}.freq_transformer_depth",
														
 
															+            hparams.get("freq_transformer_depth", 0),
														
 
															+        )
														
 
															+
														
 
															     gguf_writer.add_uint32(
														
 
															         f"{arch_name}.linear_transformer_depth",
														
 
															         hparams.get("linear_transformer_depth", 0),
														
@@ -545,8 +576,8 @@ def convert(
 
															     gguf_writer.add_uint32(
														
 
															         f"{arch_name}.mask_estimator_depth", hparams.get("mask_estimator_depth", 1)
														
 
															     )
														
 
															-    gguf_writer.add_uint32(f"{arch_name}.dim_head", hparams.get("dim_head", 64))
														
 
															-    gguf_writer.add_uint32(f"{arch_name}.heads", hparams.get("heads", 8))
														
 
															+    gguf_writer.add_uint32(f"{arch_name}.dim_head", hparams.get("head_dim", 64))
														
 
															+    gguf_writer.add_uint32(f"{arch_name}.heads", hparams.get("num_attention_heads", 8))
														
 
															     gguf_writer.add_uint32(
														
 
															         f"{arch_name}.mlp_expansion_factor", hparams.get("mlp_expansion_factor", 4)
														
 
															     )
														
@@ -563,11 +594,11 @@ def convert(
 
															     audio_config = config_dict.get("audio", {})
														
 
															     # chunk_size: prefer inference.chunk_size, fallback to audio.chunk_size
														
 
															-    default_chunk_size = inference_config.get(
														
 
															-        "chunk_size", audio_config.get("chunk_size", 352800)
														
 
															+    default_chunk_size = hparams.get(
														
 
															+        "wave_chunk_size", 352800
														
 
															     )
														
 
															     # num_overlap: from inference section
														
 
															-    default_num_overlap = inference_config.get("num_overlap", 0)
														
 
															+    default_num_overlap = inference_config.get("num_overlap", 2)
														
 
															     gguf_writer.add_uint32(f"{arch_name}.default_chunk_size", default_chunk_size)
														
 
															     gguf_writer.add_uint32(f"{arch_name}.default_num_overlap", default_num_overlap)
														
@@ -679,7 +710,7 @@ Examples:
 
															     parser.add_argument(
														
 
															         "--ckpt", type=str, required=True, help="Path to PyTorch checkpoint"
														
 
															     )
														
 
															-    parser.add_argument("--config", type=str, required=True, help="Path to YAML config")
														
 
															+    parser.add_argument("--config", type=str, required=True, help="Path to YAML or JSON config")
														
 
															     parser.add_argument("--out", type=str, required=True, help="Output GGUF file path")
														
 
															     parser.add_argument(
														
 
															         "--dtype",
														
@@ -702,7 +733,7 @@ Examples:
 
															         "--name",
														
 
															         type=str,
														
 
															         default=None,
														
 
															-        help="Model name (default: 'Mel-Band-Roformer Vocal Separator')",
														
 
															+        help="Model name (default: 'BSRoformer Vocal Separator')",
														
 
															     )
														
 
															     parser.add_argument(
														
 
															         "--description",
														
@@ -712,7 +743,7 @@ Examples:
 
															     )
														
 
															     parser.add_argument(
														
 
															         "--arch",
														
 
															-        choices=["mel_band", "mel_band_roformer", "bs", "bs_roformer"],
														
 
															+        choices=["mel_band", "mel_band_roformer", "bs", "bs_roformer", "bs_roformer_v2"],
														
 
															         default=None,
														
 
															         help="Architecture type (auto-detected if not specified)",
														
 
															     )
														
--- a/src/audio.cpp
+++ b/src/audio.cpp
@@ -1,31 +1,225 @@
 
															-#define DR_WAV_IMPLEMENTATION
														
 
															-#include "dr_libs/dr_wav.h"
														
 
															 #include "bs_roformer/audio.h"
														
 
															+#ifdef __cplusplus
														
 
															+extern "C" {
														
 
															+#endif
														
 
															+#include <libavformat/avformat.h>
														
 
															+#include <libavcodec/avcodec.h>
														
 
															+#include <libavutil/avutil.h>
														
 
															+#include <libavutil/samplefmt.h>
														
 
															+#include <libavutil/channel_layout.h>
														
 
															+#include <libavutil/frame.h>
														
 
															+#include <libavutil/log.h>
														
 
															+#include <libavutil/mem.h>
														
 
															+#include <libavutil/dict.h>
														
 
															+#ifdef __cplusplus
														
 
															+}
														
 
															+#endif
														
 
															 #include <iostream>
														
 
															+#include <vector>
														
 
															-AudioBuffer AudioFile::Load(const std::string& path) {
														
 
															+#define DR_WAV_IMPLEMENTATION
														
 
															+#include "dr_libs/dr_wav.h"
														
 
															+
														
 
															+static void InitFFmpeg() {
														
 
															+    av_log_set_level(AV_LOG_ERROR);
														
 
															+    av_register_all();
														
 
															+}
														
 
															+
														
 
															+AudioBuffer AudioFile::Load(const std::string& path, int target_sample_rate) {
														
 
															     AudioBuffer buffer;
														
 
															-    drwav_uint64 totalPCMFrames;
														
 
															+    buffer.channels = 0;
														
 
															+    buffer.sampleRate = 0;
														
 
															+    buffer.samples = 0;
														
 
															+
														
 
															+    AVFormatContext* fmt_ctx = nullptr;
														
 
															+    AVCodecContext* codec_ctx = nullptr;
														
 
															+    int audio_stream_index = -1;
														
 
															-    float* pData = drwav_open_file_and_read_pcm_frames_f32(
														
 
															-        path.c_str(), &buffer.channels, &buffer.sampleRate, &totalPCMFrames, NULL);
														
 
															-        
														
 
															-    if (!pData) {
														
 
															+    InitFFmpeg();
														
 
															+    
														
 
															+    if (avformat_open_input(&fmt_ctx, path.c_str(), nullptr, nullptr) < 0) {
														
 
															         throw std::runtime_error("Failed to open audio file: " + path);
														
 
															     }
														
 
															-    buffer.samples = totalPCMFrames * buffer.channels;
														
 
															-    buffer.data.assign(pData, pData + buffer.samples);
														
 
															-    drwav_free(pData, NULL);
														
 
															+    if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Failed to find stream info");
														
 
															+    }
														
 
															+    
														
 
															+    for (unsigned int i = 0; i < fmt_ctx->nb_streams; i++) {
														
 
															+        if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
														
 
															+            audio_stream_index = i;
														
 
															+            break;
														
 
															+        }
														
 
															+    }
														
 
															+    
														
 
															+    if (audio_stream_index == -1) {
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("No audio stream found in file: " + path);
														
 
															+    }
														
 
															+    
														
 
															+    AVCodecParameters* codecpar = fmt_ctx->streams[audio_stream_index]->codecpar;
														
 
															+    const AVCodec* codec = avcodec_find_decoder(codecpar->codec_id);
														
 
															+    if (!codec) {
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Codec not found for audio stream");
														
 
															+    }
														
 
															-    // Validation
														
 
															-    if (buffer.sampleRate != 44100) {
														
 
															-        std::cerr << "Warning: Input sample rate is " << buffer.sampleRate 
														
 
															-                  << " Hz. Model expects 44100 Hz." << std::endl;
														
 
															-        // In a full implementation, we would resample here.
														
 
															-        // For now, we warn.
														
 
															+    codec_ctx = avcodec_alloc_context3(codec);
														
 
															+    if (!codec_ctx) {
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Failed to allocate codec context");
														
 
															     }
														
 
															+    if (avcodec_parameters_to_context(codec_ctx, codecpar) < 0) {
														
 
															+        avcodec_free_context(&codec_ctx);
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Failed to copy codec parameters");
														
 
															+    }
														
 
															+    
														
 
															+    if (avcodec_open2(codec_ctx, codec, nullptr) < 0) {
														
 
															+        avcodec_free_context(&codec_ctx);
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Failed to open codec");
														
 
															+    }
														
 
															+    
														
 
															+    int input_sample_rate = codec_ctx->sample_rate;
														
 
															+    int input_channels = codec_ctx->channels;
														
 
															+    
														
 
															+    std::vector<float> audio_data;
														
 
															+    AVPacket* pkt = av_packet_alloc();
														
 
															+    if (!pkt) {
														
 
															+        avcodec_free_context(&codec_ctx);
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Failed to allocate packet");
														
 
															+    }
														
 
															+    
														
 
															+    AVFrame* decoded_frame = av_frame_alloc();
														
 
															+    if (!decoded_frame) {
														
 
															+        av_packet_free(&pkt);
														
 
															+        avcodec_free_context(&codec_ctx);
														
 
															+        avformat_close_input(&fmt_ctx);
														
 
															+        throw std::runtime_error("Failed to allocate frame");
														
 
															+    }
														
 
															+    
														
 
															+    int ret;
														
 
															+    while ((ret = av_read_frame(fmt_ctx, pkt)) >= 0) {
														
 
															+        if (pkt->stream_index != audio_stream_index) {
														
 
															+            av_packet_unref(pkt);
														
 
															+            continue;
														
 
															+        }
														
 
															+        
														
 
															+        ret = avcodec_send_packet(codec_ctx, pkt);
														
 
															+        if (ret < 0) {
														
 
															+            av_packet_unref(pkt);
														
 
															+            continue;
														
 
															+        }
														
 
															+        
														
 
															+        ret = avcodec_receive_frame(codec_ctx, decoded_frame);
														
 
															+        if (ret >= 0) {
														
 
															+            int nb_samples = decoded_frame->nb_samples;
														
 
															+            int nb_channels = av_get_channel_layout_nb_channels(decoded_frame->channel_layout);
														
 
															+            if (nb_channels <= 0) {
														
 
															+                nb_channels = input_channels;
														
 
															+            }
														
 
															+            
														
 
															+            std::vector<float> channel_data(nb_channels * nb_samples);
														
 
															+            
														
 
															+            int sample_fmt = decoded_frame->format;
														
 
															+            
														
 
															+            if (sample_fmt == AV_SAMPLE_FMT_FLT) {
														
 
															+                float* data = (float*)decoded_frame->data[0];
														
 
															+                for (int i = 0; i < nb_samples * nb_channels; i++) {
														
 
															+                    channel_data[i] = data[i];
														
 
															+                }
														
 
															+            } else if (sample_fmt == AV_SAMPLE_FMT_FLTP) {
														
 
															+                for (int c = 0; c < nb_channels; c++) {
														
 
															+                    float* channel = (float*)decoded_frame->data[c];
														
 
															+                    for (int i = 0; i < nb_samples; i++) {
														
 
															+                        channel_data[c * nb_samples + i] = channel[i];
														
 
															+                    }
														
 
															+                }
														
 
															+            } else if (sample_fmt == AV_SAMPLE_FMT_S16) {
														
 
															+                int16_t* data = (int16_t*)decoded_frame->data[0];
														
 
															+                for (int i = 0; i < nb_samples * nb_channels; i++) {
														
 
															+                    channel_data[i] = data[i] / 32768.0f;
														
 
															+                }
														
 
															+            } else if (sample_fmt == AV_SAMPLE_FMT_S16P) {
														
 
															+                for (int c = 0; c < nb_channels; c++) {
														
 
															+                    int16_t* channel = (int16_t*)decoded_frame->data[c];
														
 
															+                    for (int i = 0; i < nb_samples; i++) {
														
 
															+                        channel_data[c * nb_samples + i] = channel[i] / 32768.0f;
														
 
															+                    }
														
 
															+                }
														
 
															+            } else if (sample_fmt == AV_SAMPLE_FMT_S32) {
														
 
															+                int32_t* data = (int32_t*)decoded_frame->data[0];
														
 
															+                for (int i = 0; i < nb_samples * nb_channels; i++) {
														
 
															+                    channel_data[i] = data[i] / 2147483648.0f;
														
 
															+                }
														
 
															+            } else if (sample_fmt == AV_SAMPLE_FMT_S32P) {
														
 
															+                for (int c = 0; c < nb_channels; c++) {
														
 
															+                    int32_t* channel = (int32_t*)decoded_frame->data[c];
														
 
															+                    for (int i = 0; i < nb_samples; i++) {
														
 
															+                        channel_data[c * nb_samples + i] = channel[i] / 2147483648.0f;
														
 
															+                    }
														
 
															+                }
														
 
															+            } else {
														
 
															+                float* data = (float*)decoded_frame->data[0];
														
 
															+                for (int i = 0; i < nb_samples * nb_channels; i++) {
														
 
															+                    channel_data[i] = data[i];
														
 
															+                }
														
 
															+            }
														
 
															+            
														
 
															+            int resampled_samples = (nb_samples * target_sample_rate) / input_sample_rate;
														
 
															+            std::vector<float> resampled_data(nb_channels * resampled_samples);
														
 
															+            
														
 
															+            for (int c = 0; c < nb_channels; c++) {
														
 
															+                for (int i = 0; i < resampled_samples; i++) {
														
 
															+                    float src_idx = (float)i * input_sample_rate / target_sample_rate;
														
 
															+                    int src_idx_int = (int)src_idx;
														
 
															+                    float frac = src_idx - src_idx_int;
														
 
															+                    
														
 
															+                    if (src_idx_int + 1 < nb_samples) {
														
 
															+                        resampled_data[c * resampled_samples + i] = 
														
 
															+                            channel_data[c * nb_samples + src_idx_int] * (1.0f - frac) +
														
 
															+                            channel_data[c * nb_samples + src_idx_int + 1] * frac;
														
 
															+                    } else if (src_idx_int < nb_samples) {
														
 
															+                        resampled_data[c * resampled_samples + i] = 
														
 
															+                            channel_data[c * nb_samples + src_idx_int];
														
 
															+                    } else {
														
 
															+                        resampled_data[c * resampled_samples + i] = 0.0f;
														
 
															+                    }
														
 
															+                }
														
 
															+            }
														
 
															+            
														
 
															+            if (nb_channels >= 2) {
														
 
															+                for (int i = 0; i < resampled_samples; i++) {
														
 
															+                    audio_data.push_back(resampled_data[0 * resampled_samples + i]);
														
 
															+                    audio_data.push_back(resampled_data[1 * resampled_samples + i]);
														
 
															+                }
														
 
															+            } else {
														
 
															+                for (int i = 0; i < resampled_samples; i++) {
														
 
															+                    audio_data.push_back(resampled_data[0 * resampled_samples + i]);
														
 
															+                    audio_data.push_back(resampled_data[0 * resampled_samples + i]);
														
 
															+                }
														
 
															+            }
														
 
															+            
														
 
															+            av_frame_unref(decoded_frame);
														
 
															+        }
														
 
															+        
														
 
															+        av_packet_unref(pkt);
														
 
															+    }
														
 
															+    
														
 
															+    av_frame_free(&decoded_frame);
														
 
															+    av_packet_free(&pkt);
														
 
															+    avcodec_free_context(&codec_ctx);
														
 
															+    avformat_close_input(&fmt_ctx);
														
 
															+    
														
 
															+    buffer.data = std::move(audio_data);
														
 
															+    buffer.channels = 2;
														
 
															+    buffer.sampleRate = target_sample_rate;
														
 
															+    buffer.samples = buffer.data.size();
														
 
															+    
														
 
															     return buffer;
														
 
															 }
														
@@ -38,7 +232,7 @@ void AudioFile::Save(const std::string& path, const AudioBuffer& buffer) {
 
															     format.bitsPerSample = 32;
														
 
															     drwav wav;
														
 
															-    if (!drwav_init_file_write(&wav, path.c_str(), &format, NULL)) {
														
 
															+    if (!drwav_init_file_write(&wav, path.c_str(), &format, nullptr)) {
														
 
															         throw std::runtime_error("Failed to open file for writing: " + path);
														
 
															     }
														
@@ -46,6 +240,6 @@ void AudioFile::Save(const std::string& path, const AudioBuffer& buffer) {
 
															     drwav_uninit(&wav);
														
 
															     if (framesWritten != buffer.samples / buffer.channels) {
														
 
															-         throw std::runtime_error("Failed to write all samples to " + path);
														
 
															+        throw std::runtime_error("Failed to write all samples to " + path);
														
 
															     }
														
 
															-}
														
 
															+}
														
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -66,6 +66,9 @@ void BSRoformer::LoadWeights(const std::string& path) {
 
															     if (architecture_ == "bs_roformer") {
														
 
															         has_final_norm_ = true;
														
 
															         transformer_norm_output_ = false;
														
 
															+    } else if (architecture_ == "bs_roformer_v2") {
														
 
															+        is_v2_model_ = true;
														
 
															+        // V2-specific logic can be added here if needed
														
 
															     } else {
														
 
															         // mel_band_roformer
														
 
															         has_final_norm_ = false;
														
@@ -113,6 +116,29 @@ void BSRoformer::LoadWeights(const std::string& path) {
 
															     kv_idx = gguf_find_key(ctx_gguf, (kp + "sample_rate").c_str());
														
 
															     if (kv_idx >= 0) sample_rate_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+
														
 
															+    if (is_v2_model_) {
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "time_transformer_depth").c_str());
														
 
															+        if (kv_idx >= 0) time_transformer_depth_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+        
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "freq_transformer_depth").c_str());
														
 
															+        if (kv_idx >= 0) freq_transformer_depth_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "num_key_value_heads").c_str());
														
 
															+        if (kv_idx >= 0) num_key_value_heads_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "intermediate_size").c_str());
														
 
															+        if (kv_idx >= 0) intermediate_size_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "num_input_channels").c_str());
														
 
															+        if (kv_idx >= 0) num_input_channels_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "band_proj_size").c_str());
														
 
															+        if (kv_idx >= 0) band_proj_size_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+
														
 
															+        kv_idx = gguf_find_key(ctx_gguf, (kp + "register_token_num").c_str());
														
 
															+        if (kv_idx >= 0) register_token_num_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
														
 
															+    }
														
 
															     // Inference defaults (optional, fallback to hardcoded values)
														
 
															     kv_idx = gguf_find_key(ctx_gguf, (kp + "default_chunk_size").c_str());
														
@@ -229,7 +255,13 @@ std::vector<int> BSRoformer::GetDimInputs() const {
 
															 }
														
 
															 int BSRoformer::GetTotalDimInput() const {
														
 
															-    if (architecture_ == "bs") {
														
 
															+    if (is_v2_model_) {
														
 
															+        int total = 0;
														
 
															+        for (int i = 0; i < num_bands_; ++i) {
														
 
															+            total += num_freqs_per_band_[i] * 2;
														
 
															+        }
														
 
															+        return total;
														
 
															+    } else if (architecture_ == "bs_roformer") {
														
 
															         // BS: All frequencies * stereo * complex
														
 
															         int n_freq = n_fft_ / 2 + 1;
														
 
															         return n_freq * 2 * 2;  // freq * stereo * complex
														
@@ -242,7 +274,7 @@ int BSRoformer::GetTotalDimInput() const {
 
															     return total;
														
 
															 }
														
 
															-// ========== Graph Building Functions ==========
														
 
															+// ========== Graph Building Functions ========== 
														
 
															 ggml_tensor* BSRoformer::BuildBandSplitGraph(
														
 
															     ggml_context* ctx,
														
@@ -251,6 +283,46 @@ ggml_tensor* BSRoformer::BuildBandSplitGraph(
 
															     int n_frames,
														
 
															     int batch
														
 
															 ) {
														
 
															+    if (is_v2_model_) {
														
 
															+        // V2 model band split
														
 
															+        ggml_tensor* x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, band_proj_size_, num_bands_, n_frames, batch);
														
 
															+
														
 
															+        size_t offset_elements = 0;
														
 
															+        for (int i = 0; i < num_bands_; ++i) {
														
 
															+            int dim_in = num_freqs_per_band_[i] * 2; // V2 uses 2 instead of 4
														
 
															+
														
 
															+            ggml_tensor* band_input = ggml_view_3d(ctx, input,
														
 
															+                                                   dim_in, n_frames, batch,
														
 
															+                                                   input->nb[1], input->nb[2],
														
 
															+                                                   offset_elements * sizeof(float));
														
 
															+            
														
 
															+            std::string norm_name = "band_split." + std::to_string(i) + ".norm.weight";
														
 
															+            ggml_tensor* norm_w = GetWeight(norm_name);
														
 
															+            if (!norm_w) { std::cerr << "Missing weight: " << norm_name << std::endl; return nullptr; }
														
 
															+
														
 
															+            ggml_tensor* normed = ggml_rms_norm(ctx, band_input, 1e-6f);
														
 
															+            normed = ggml_mul(ctx, normed, norm_w);
														
 
															+
														
 
															+            std::string linear_w_name = "band_split." + std::to_string(i) + ".linear.weight";
														
 
															+            std::string linear_b_name = "band_split." + std::to_string(i) + ".linear.bias";
														
 
															+            ggml_tensor* linear_w = GetWeight(linear_w_name);
														
 
															+            ggml_tensor* linear_b = GetWeight(linear_b_name);
														
 
															+            if (!linear_w || !linear_b) { std::cerr << "Missing weights for band " << i << std::endl; return nullptr; }
														
 
															+
														
 
															+            ggml_tensor* projected = ggml_mul_mat(ctx, linear_w, normed);
														
 
															+            projected = ggml_add(ctx, projected, linear_b);
														
 
															+
														
 
															+            ggml_tensor* out_slice = ggml_view_3d(ctx, x,
														
 
															+                                                  band_proj_size_, n_frames, batch,
														
 
															+                                                  x->nb[2], x->nb[3],
														
 
															+                                                  i * x->nb[1]);
														
 
															+            
														
 
															+            ggml_build_forward_expand(gf, ggml_cpy(ctx, projected, out_slice));
														
 
															+            offset_elements += dim_in;
														
 
															+        }
														
 
															+        return x;
														
 
															+    }
														
 
															+
														
 
															     // Following test_10_full_model.cpp implementation
														
 
															     // Input: [total_dim_input, n_frames, batch]
														
 
															     // Output: [dim, num_bands, n_frames, batch]
														
@@ -321,6 +393,9 @@ ggml_tensor* BSRoformer::BuildTransformersGraph(
 
															     int n_frames,
														
 
															     int batch
														
 
															 ) {
														
 
															+    if (is_v2_model_) {
														
 
															+        return BuildTransformersGraphV2(ctx, input, gf, pos_time_exp, pos_freq_exp, n_frames, batch);
														
 
															+    }
														
 
															     // Following test_10_full_model.cpp implementation
														
 
															     // Input: [dim, num_bands, n_frames, batch]
														
@@ -341,7 +416,7 @@ ggml_tensor* BSRoformer::BuildTransformersGraph(
 
															                 x = ggml_add(ctx, x, s);
														
 
															             }
														
 
															         }
														
 
															-        // ========== TIME TRANSFORMER ==========
														
 
															+        // ========== TIME TRANSFORMER ========== 
														
 
															         // Permute: [D, F, T, B] -> [D, T, F, B]
														
 
															         x = ggml_permute(ctx, x, 0, 2, 1, 3);
														
 
															         x = ggml_cont(ctx, x);
														
@@ -493,7 +568,7 @@ ggml_tensor* BSRoformer::BuildTransformersGraph(
 
															         x = ggml_permute(ctx, x, 0, 2, 1, 3);
														
 
															         x = ggml_cont(ctx, x);
														
 
															-        // ========== FREQ TRANSFORMER ==========
														
 
															+        // ========== FREQ TRANSFORMER ========== 
														
 
															         int tb = T * B;
														
 
															         ggml_tensor* x_freq_packed = ggml_reshape_3d(ctx, x, D, F, tb);
														
@@ -558,12 +633,8 @@ ggml_tensor* BSRoformer::BuildTransformersGraph(
 
															         ggml_tensor* fV_fa = fV; // fV is contiguous [DIM_HEAD, F, HEADS, tb]
														
 
															         // float scale is already defined in scope (Time Transformer block) or re-define if shadowed loop?
														
 
															-        // Actually 'scale' was defined inside the Time Transformer loop, so it persists? 
														
 
															-        // No, Freq Transformer is in the same loop logic? 
														
 
															-        // Let's check scope. It's in the same 'layer' loop.
														
 
															-        // But previously I removed the definition line in Time Transformer too? No, I added it back above.
														
 
															-        // Wait, best to redeclare or rely on scope? 
														
 
															-        // Time Transformer code block vs Freq Transformer.
														
 
															+        // Actually 'scale' was defined inside the Time Transformer loop, so it persists?
														
 
															+        // No, Freq Transformer is in the same loop logic?
														
 
															         // Let's just use the value. 
														
 
															         // Re-reading Freq Block:
														
 
															         // Need to be safe. Redefine 'scale' if needed or ensuring it's available.
														
@@ -707,7 +778,7 @@ ggml_tensor* BSRoformer::BuildMaskEstimatorGraph(
 
															         total_out_dim += band_out_dims[b];
														
 
															     }
														
 
															-    ggml_tensor* x = input;  // [D, F, T, B]
														
 
															+    ggml_tensor* x = input;  // [D, F, T, B] 
														
 
															     // Create mask_output tensor: [total_out_dim, num_stems, n_frames, batch]
														
 
															     ggml_tensor* mask_output = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, total_out_dim, NUM_STEMS, n_frames, batch);
														
@@ -800,3 +871,105 @@ ggml_tensor* BSRoformer::BuildMaskEstimatorGraph(
 
															     return mask_check;
														
 
															 }
														
 
															+// Builds the V2 transformer section: for each of depth_ layers, runs time_transformer_depth_ time blocks, then freq_transformer_depth_ freq blocks, and returns the final residual tensor.
														
 
															+ggml_tensor* BSRoformer::BuildTransformersGraphV2(
														
 
															+    ggml_context* ctx,  // ggml context handed to every ggml_* call below
														
 
															+    ggml_tensor* input,  // starting activation; becomes the residual stream x
														
 
															+    ggml_cgraph* gf,  // NOTE(review): not referenced anywhere in this body
														
 
															+    ggml_tensor* pos_time_exp,  // position tensor passed to ggml_rope_ext in the time blocks
														
 
															+    ggml_tensor* pos_freq_exp,  // position tensor passed to ggml_rope_ext in the freq blocks
														
 
															+    int n_frames,  // row count of the Q/K/V views in the time blocks
														
 
															+    int batch  // NOTE(review): not referenced anywhere in this body
														
 
															+) {
														
 
															+    ggml_tensor* x = input;
														
 
															+    // NOTE(review): every GetWeight() result below is used without a null check, whereas BuildBandSplitGraph returns nullptr on a missing weight; confirm these tensors are always present.
														
 
															+    for (int layer = 0; layer < depth_; ++layer) {
														
 
															+        // Time Transformer: norm + attention + feed-forward, weights named blk.<layer>.time_attn.<i>.* and blk.<layer>.time_ff.<i>.*
														
 
															+        for (int time_layer = 0; time_layer < time_transformer_depth_; ++time_layer) {
														
 
															+            std::string time_prefix = "blk." + std::to_string(layer) + ".time_attn." + std::to_string(time_layer);
														
 
															+            ggml_tensor* x_norm = ggml_rms_norm(ctx, x, 1e-6f);
														
 
															+            ggml_tensor* t_attn_norm_w = GetWeight(time_prefix + ".norm.weight");
														
 
															+            x_norm = ggml_mul(ctx, x_norm, t_attn_norm_w);
														
 
															+            // Single projection whose output is sliced into Q, K and V below
														
 
															+            ggml_tensor* t_qkv_w = GetWeight(time_prefix + ".qkv.weight");
														
 
															+            ggml_tensor* qkv_out = ggml_mul_mat(ctx, t_qkv_w, x_norm);
														
 
															+
														
 
															+            // Split Q, K, V: 2-D views of dim_ x n_frames at offsets of 0, dim_ and 2*dim_ floats (assumes F32 qkv_out with Q|K|V packed along axis 0; TODO confirm)
														
 
															+            ggml_tensor* Q = ggml_view_2d(ctx, qkv_out, dim_, n_frames, qkv_out->nb[1], 0);
														
 
															+            ggml_tensor* K = ggml_view_2d(ctx, qkv_out, dim_, n_frames, qkv_out->nb[1], dim_ * sizeof(float));
														
 
															+            ggml_tensor* V = ggml_view_2d(ctx, qkv_out, dim_, n_frames, qkv_out->nb[1], dim_ * 2 * sizeof(float));
														
 
															+            // NOTE(review): these 2-D views address only the first n_frames rows of qkv_out; confirm that is intended if the input carries band/batch dims.
														
 
															+            // RoPE on Q and K only: dim_head_ rotary dims, frequency base 10000, positions from pos_time_exp
														
 
															+            Q = ggml_rope_ext(ctx, Q, pos_time_exp, nullptr, dim_head_, GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
														
 
															+            K = ggml_rope_ext(ctx, K, pos_time_exp, nullptr, dim_head_, GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
														
 
															+
														
 
															+            // Attention: no mask (nullptr), scale 1/sqrt(dim_head_); the two trailing 0.0f are max_bias and logit_softcap per the ggml API
														
 
															+            ggml_tensor* attn = ggml_flash_attn_ext(ctx, Q, K, V, nullptr, 1.0f / sqrtf(dim_head_), 0.0f, 0.0f);
														
 
															+            
														
 
															+            // Output projection, followed by the residual add into x
														
 
															+            ggml_tensor* t_out_w = GetWeight(time_prefix + ".out.weight");
														
 
															+            attn = ggml_mul_mat(ctx, t_out_w, attn);
														
 
															+
														
 
															+            x = ggml_add(ctx, x, attn);
														
 
															+
														
 
															+            // MLP: RMS norm * weight -> in projection -> GELU -> out projection -> residual add (no bias terms are applied here)
														
 
															+            ggml_tensor* x_mlp = ggml_rms_norm(ctx, x, 1e-6f);
														
 
															+            ggml_tensor* mlp_norm_w = GetWeight("blk." + std::to_string(layer) + ".time_ff." + std::to_string(time_layer) + ".norm.weight");
														
 
															+            x_mlp = ggml_mul(ctx, x_mlp, mlp_norm_w);
														
 
															+
														
 
															+            ggml_tensor* mlp_in_w = GetWeight("blk." + std::to_string(layer) + ".time_ff." + std::to_string(time_layer) + ".in.weight");
														
 
															+            x_mlp = ggml_mul_mat(ctx, mlp_in_w, x_mlp);
														
 
															+            x_mlp = ggml_gelu(ctx, x_mlp);
														
 
															+
														
 
															+            ggml_tensor* mlp_out_w = GetWeight("blk." + std::to_string(layer) + ".time_ff." + std::to_string(time_layer) + ".out.weight");
														
 
															+            x_mlp = ggml_mul_mat(ctx, mlp_out_w, x_mlp);
														
 
															+
														
 
															+            x = ggml_add(ctx, x, x_mlp);
														
 
															+        }
														
 
															+
														
 
															+        // Freq Transformer: same block structure as the time blocks, using .freq_attn./.freq_ff. weights, pos_freq_exp, and num_bands_ rows per view
														
 
															+        for (int freq_layer = 0; freq_layer < freq_transformer_depth_; ++freq_layer) {
														
 
															+            std::string freq_prefix = "blk." + std::to_string(layer) + ".freq_attn." + std::to_string(freq_layer);
														
 
															+            ggml_tensor* x_norm = ggml_rms_norm(ctx, x, 1e-6f);
														
 
															+            ggml_tensor* f_attn_norm_w = GetWeight(freq_prefix + ".norm.weight");
														
 
															+            x_norm = ggml_mul(ctx, x_norm, f_attn_norm_w);
														
 
															+
														
 
															+            ggml_tensor* f_qkv_w = GetWeight(freq_prefix + ".qkv.weight");
														
 
															+            ggml_tensor* qkv_out = ggml_mul_mat(ctx, f_qkv_w, x_norm);
														
 
															+
														
 
															+            // Split Q, K, V: 2-D views of dim_ x num_bands_ at the same offsets as in the time block
														
 
															+            ggml_tensor* Q = ggml_view_2d(ctx, qkv_out, dim_, num_bands_, qkv_out->nb[1], 0);
														
 
															+            ggml_tensor* K = ggml_view_2d(ctx, qkv_out, dim_, num_bands_, qkv_out->nb[1], dim_ * sizeof(float));
														
 
															+            ggml_tensor* V = ggml_view_2d(ctx, qkv_out, dim_, num_bands_, qkv_out->nb[1], dim_ * 2 * sizeof(float));
														
 
															+
														
 
															+            // RoPE on Q and K only, with positions taken from pos_freq_exp
														
 
															+            Q = ggml_rope_ext(ctx, Q, pos_freq_exp, nullptr, dim_head_, GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
														
 
															+            K = ggml_rope_ext(ctx, K, pos_freq_exp, nullptr, dim_head_, GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
														
 
															+
														
 
															+            // Attention: same arguments as the time block (no mask, scale 1/sqrt(dim_head_))
														
 
															+            ggml_tensor* attn = ggml_flash_attn_ext(ctx, Q, K, V, nullptr, 1.0f / sqrtf(dim_head_), 0.0f, 0.0f);
														
 
															+            
														
 
															+            // Output projection, followed by the residual add into x
														
 
															+            ggml_tensor* f_out_w = GetWeight(freq_prefix + ".out.weight");
														
 
															+            attn = ggml_mul_mat(ctx, f_out_w, attn);
														
 
															+
														
 
															+            x = ggml_add(ctx, x, attn);
														
 
															+
														
 
															+            // MLP: RMS norm * weight -> in projection -> GELU -> out projection -> residual add
														
 
															+            ggml_tensor* x_mlp = ggml_rms_norm(ctx, x, 1e-6f);
														
 
															+            ggml_tensor* mlp_norm_w = GetWeight("blk." + std::to_string(layer) + ".freq_ff." + std::to_string(freq_layer) + ".norm.weight");
														
 
															+            x_mlp = ggml_mul(ctx, x_mlp, mlp_norm_w);
														
 
															+
														
 
															+            ggml_tensor* mlp_in_w = GetWeight("blk." + std::to_string(layer) + ".freq_ff." + std::to_string(freq_layer) + ".in.weight");
														
 
															+            x_mlp = ggml_mul_mat(ctx, mlp_in_w, x_mlp);
														
 
															+            x_mlp = ggml_gelu(ctx, x_mlp);
														
 
															+
														
 
															+            ggml_tensor* mlp_out_w = GetWeight("blk." + std::to_string(layer) + ".freq_ff." + std::to_string(freq_layer) + ".out.weight");
														
 
															+            x_mlp = ggml_mul_mat(ctx, mlp_out_w, x_mlp);
														
 
															+
														
 
															+            x = ggml_add(ctx, x, x_mlp);
														
 
															+        }
														
 
															+    }
														
 
															+
														
 
															+    return x;
														
 
															+}
														
--- a/src/model.h
+++ b/src/model.h
@@ -58,6 +58,8 @@ public:
 
															     const std::string& GetArchitecture() const { return architecture_; }
														
 
															     bool HasFinalNorm() const { return has_final_norm_; }
														
 
															     bool GetTransformerNormOutput() const { return transformer_norm_output_; }
														
 
															+
														
 
															+    bool IsV2Model() const { return is_v2_model_; }
														
 
															     // Inference defaults (from GGUF, can be overridden at runtime)
														
 
															     int GetDefaultChunkSize() const { return default_chunk_size_; }
														
@@ -158,6 +160,16 @@ private:
 
															     int mlp_num_layers_ = 3;                 // Detected from weights (BS=2 for depth=2)
														
 
															     int sample_rate_ = 44100;
														
 
															+    // V2 Params
														
 
															+    bool is_v2_model_ = false;
														
 
															+    int time_transformer_depth_ = 1;
														
 
															+    int freq_transformer_depth_ = 1;
														
 
															+    int num_key_value_heads_ = 4;
														
 
															+    int intermediate_size_ = 1152;
														
 
															+    int num_input_channels_ = 2;
														
 
															+    int band_proj_size_ = 256;
														
 
															+    int register_token_num_ = 4;
														
 
															+
														
 
															     // Inference defaults
														
 
															     int default_chunk_size_ = 352800;
														
 
															     int default_num_overlap_ = 2;
														
@@ -169,4 +181,14 @@ private:
 
															     // Helper to load GGUF
														
 
															     void LoadWeights(const std::string& path);
														
 
															+    // Builds the V2 transformer section of the graph (time blocks, then freq blocks, per layer); BuildTransformersGraph forwards here when is_v2_model_ is set.
														
 
															+    ggml_tensor* BuildTransformersGraphV2(
														
 
															+        ggml_context* ctx,
														
 
															+        ggml_tensor* input,
														
 
															+        ggml_cgraph* gf,
														
 
															+        ggml_tensor* pos_time_exp,  // positions for RoPE in the time blocks
														
 
															+        ggml_tensor* pos_freq_exp,  // positions for RoPE in the freq blocks
														
 
															+        int n_frames,
														
 
															+        int batch = 1  // optional; callers may omit it
														
 
															+    );
														
 
															 };
	`@@ -0,0 +1 @@`
			`+Subproject commit 57ea0bc119d722d74594196cc5b494a34dd87be4`
	`@@ -0,0 +1 @@`
			`+Subproject commit c4642788e83b0858bca449f9b6e71ddb015dfa5d`