před 4 měsíci · 993ad971b9
--- a/README.md
+++ b/README.md
@@ -2,11 +2,11 @@
 
															 [中文](README.zh.md) | English
														
 
															-High-performance C++ inference implementation for the Mel-Band-Roformer audio source separation model.
														
 
															+High-performance C++ inference implementation for the **BS Roformer** and **Mel-Band-Roformer** audio source separation model.
														
 
															 ## 📖 Introduction
														
 
															-This project is a pure C++ inference engine for the **Mel-Band-Roformer** and **BS Roformer** audio source separation models, built on the [GGML](https://github.com/ggerganov/ggml) tensor library. It primarily used for extracting vocals or accompaniment from music.
														
 
															+This project is a pure C++ inference engine for the **BS Roformer** and **Mel-Band-Roformer** audio source separation models, built on the [GGML](https://github.com/ggerganov/ggml) tensor library. It primarily used for extracting vocals or accompaniment from music.
														
 
															 ### ✨ Key Features
														
--- a/README.zh.md
+++ b/README.zh.md
@@ -2,11 +2,11 @@
 
															 中文 | [English](README.md)
														
 
															-Mel-Band-Roformer 音频源分离模型的高性能 C++ 推理实现。
														
 
															+**BS Roformer** 和 **Mel-Band-Roformer** 音频源分离模型的高性能 C++ 推理实现。
														
 
															 ## 📖 简介
														
 
															-本项目是 **Mel-Band-Roformer** 和 **BS Roformer** 音频源分离模型的纯 C++ 推理引擎，基于 [GGML](https://github.com/ggerganov/ggml) 张量库构建。主要用于从音乐中提取人声或伴奏。
														
 
															+本项目是 **BS Roformer** 和 **Mel-Band-Roformer** 音频源分离模型的纯 C++ 推理引擎，基于 [GGML](https://github.com/ggerganov/ggml) 张量库构建。主要用于从音乐中提取人声或伴奏。
														
 
															 ### ✨ 主要特性
														
--- a/include/bs_roformer/inference.h
+++ b/include/bs_roformer/inference.h
@@ -62,6 +62,10 @@ private:
 
															     struct ggml_tensor* pos_freq_ = nullptr;
														
 
															     struct ggml_tensor* mask_out_tensor_ = nullptr;
														
 
															+    // Cached Host Data (to avoid reallocation)
														
 
															+    std::vector<int32_t> pos_time_data_;
														
 
															+    std::vector<int32_t> pos_freq_data_;
														
 
															+
														
 
															     // Current config state
														
 
															     int cached_n_frames_ = -1;
														
--- a/src/inference.cpp
+++ b/src/inference.cpp
@@ -324,22 +324,36 @@ void Inference::RunInference(std::shared_ptr<ChunkState> state) {
 
															     int n_frames = state->n_frames;
														
 
															     // Prepare position data
														
 
															-    // TODO: Cache these to avoid allocation every frame if size is constant
														
 
															-    std::vector<int32_t> pos_time_data(n_frames * n_bands);
														
 
															-    for(int i=0; i < n_frames * n_bands; ++i) pos_time_data[i] = i % n_frames;
														
 
															+    // Use cached vectors to avoid allocation
														
 
															+    int required_time_size = n_frames * n_bands;
														
 
															+    if (pos_time_data_.size() != required_time_size) {
														
 
															+        pos_time_data_.resize(required_time_size);
														
 
															+        for(int i=0; i < required_time_size; ++i) pos_time_data_[i] = i % n_frames;
														
 
															+    }
														
 
															-    std::vector<int32_t> pos_freq_data(n_bands * n_frames);
														
 
															-    for(int i=0; i < n_bands * n_frames; ++i) pos_freq_data[i] = i % n_bands;
														
 
															+    int required_freq_size = n_bands * n_frames;
														
 
															+    // Note: pos_freq logic (i % n_bands) depends on n_bands (constant) and total size.
														
 
															+    // If n_frames changes, size changes, and values might depend on n_frames?
														
 
															+    // Wait, pos_freq_data[i] = i % n_bands. 
														
 
															+    // This is valid regardless of n_frames as long as size is correct.
														
 
															+    // But we should regenerate if size changes.
														
 
															+    if (pos_freq_data_.size() != required_freq_size) {
														
 
															+        pos_freq_data_.resize(required_freq_size);
														
 
															+        for(int i=0; i < required_freq_size; ++i) pos_freq_data_[i] = i % n_bands;
														
 
															+    }
														
 
															     // 4. Host -> Device
														
 
															     ggml_backend_tensor_set(input_tensor_, state->stft_flattened.data(), 0, ggml_nbytes(input_tensor_));
														
 
															-    ggml_backend_tensor_set(pos_time_, pos_time_data.data(), 0, ggml_nbytes(pos_time_));
														
 
															-    ggml_backend_tensor_set(pos_freq_, pos_freq_data.data(), 0, ggml_nbytes(pos_freq_));
														
 
															+    ggml_backend_tensor_set(pos_time_, pos_time_data_.data(), 0, ggml_nbytes(pos_time_));
														
 
															+    ggml_backend_tensor_set(pos_freq_, pos_freq_data_.data(), 0, ggml_nbytes(pos_freq_));
														
 
															     // 5. Compute
														
 
															     ggml_backend_graph_compute(model_->GetBackend(), gf_);
														
 
															     // 6. Device -> Host
														
 
															+    // Avoid reallocation if size roughly matches? 
														
 
															+    // ggml_nelements(mask_out_tensor_) is fixed for a given n_frames.
														
 
															+    // state->mask_output is a vector. resize handles it (no op if same size).
														
 
															     state->mask_output.resize(ggml_nelements(mask_out_tensor_));
														
 
															     ggml_backend_tensor_get(mask_out_tensor_, state->mask_output.data(), 0, ggml_nbytes(mask_out_tensor_));
														
 
															 }
														
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -58,16 +58,16 @@ void BSRoformer::LoadWeights(const std::string& path) {
 
															     // Normalization for legacy models (if any) or simplified internal handling
														
 
															     if (architecture_ == "bs") architecture_ = "bs_roformer";
														
 
															-    if (architecture_ == "mel_band") architecture_ = "bs_roformer";
														
 
															+    if (architecture_ == "mel_band") architecture_ = "mel_band_roformer";
														
 
															-    std::string kp = architecture_ + "."; // key prefix, e.g. "bs_roformer." or "bs_roformer."
														
 
															+    std::string kp = architecture_ + "."; // key prefix, e.g. "bs_roformer." or "mel_band_roformer."
														
 
															     // Set internal flags based on architecture
														
 
															     if (architecture_ == "bs_roformer") {
														
 
															         has_final_norm_ = true;
														
 
															         transformer_norm_output_ = false;
														
 
															     } else {
														
 
															-        // bs_roformer
														
 
															+        // mel_band_roformer
														
 
															         has_final_norm_ = false;
														
 
															         transformer_norm_output_ = true;
														
 
															     }
														
@@ -201,7 +201,8 @@ void BSRoformer::LoadWeights(const std::string& path) {
 
															     // Dynamic MLP detection
														
 
															     // Try to find mask_est.0.freq.0.mlp.{N}.weight
														
 
															     mlp_num_layers_ = 0;
														
 
															-    for (int idx = 0; idx <= 20; idx += 2) {  // Check indices 0, 2, 4... up to 10 layers
														
 
															+    const int MAX_MLP_LAYERS = 20;
														
 
															+    for (int idx = 0; idx <= MAX_MLP_LAYERS; idx += 2) {  // Check indices 0, 2, 4... up to MAX
														
 
															         std::string probe = "mask_est.0.freq.0.mlp." + std::to_string(idx) + ".weight";
														
 
															         if (GetWeight(probe) != nullptr) {
														
 
															             mlp_num_layers_++;
														
--- a/src/stft.h
+++ b/src/stft.h
@@ -18,6 +18,10 @@
 
															 #include <cstring>
														
 
															 #include <algorithm> // for std::swap
														
 
															+#ifdef USE_OPENMP
														
 
															+#include <omp.h>
														
 
															+#endif
														
 
															+
														
 
															 #ifndef M_PI
														
 
															 #define M_PI 3.14159265358979323846
														
 
															 #endif
														
@@ -105,21 +109,34 @@ inline void fft_radix2(Complex* data, int n, bool inverse = false) {
 
															  * @param input Real input array of size n
														
 
															  * @param output Complex output array of size n/2+1
														
 
															  * @param n Size of input (must be power of 2)
														
 
															+ * @param buffer Temporary buffer of size n (optional, handled internally if null)
														
 
															  */
														
 
															-inline void rfft(const float* input, Complex* output, int n) {
														
 
															+inline void rfft(const float* input, Complex* output, int n, std::vector<Complex>* buffer_ptr = nullptr) {
														
 
															     // Copy to complex buffer
														
 
															-    std::vector<Complex> buffer(n);
														
 
															-    for (int i = 0; i < n; ++i) {
														
 
															-        buffer[i] = Complex(input[i], 0.0f);
														
 
															-    }
														
 
															-    
														
 
															-    // Compute full FFT
														
 
															-    fft_radix2(buffer.data(), n, false);
														
 
															-    
														
 
															-    // Extract first n/2+1 coefficients (one-sided)
														
 
															-    int n_out = n / 2 + 1;
														
 
															-    for (int i = 0; i < n_out; ++i) {
														
 
															-        output[i] = buffer[i];
														
 
															+    // Use provided buffer to avoid allocation
														
 
															+    if (buffer_ptr) {
														
 
															+        if (buffer_ptr->size() < static_cast<size_t>(n)) buffer_ptr->resize(n);
														
 
															+        for (int i = 0; i < n; ++i) {
														
 
															+            (*buffer_ptr)[i] = Complex(input[i], 0.0f);
														
 
															+        }
														
 
															+        fft_radix2(buffer_ptr->data(), n, false);
														
 
															+        
														
 
															+        int n_out = n / 2 + 1;
														
 
															+        for (int i = 0; i < n_out; ++i) {
														
 
															+            output[i] = (*buffer_ptr)[i];
														
 
															+        }
														
 
															+    } else {
														
 
															+        std::vector<Complex> buffer(n);
														
 
															+        for (int i = 0; i < n; ++i) {
														
 
															+            buffer[i] = Complex(input[i], 0.0f);
														
 
															+        }
														
 
															+        
														
 
															+        fft_radix2(buffer.data(), n, false);
														
 
															+        
														
 
															+        int n_out = n / 2 + 1;
														
 
															+        for (int i = 0; i < n_out; ++i) {
														
 
															+            output[i] = buffer[i];
														
 
															+        }
														
 
															     }
														
 
															 }
														
@@ -128,25 +145,37 @@ inline void rfft(const float* input, Complex* output, int n) {
 
															  * @param input Complex input array of size n/2+1
														
 
															  * @param output Real output array of size n
														
 
															  * @param n_out Size of output (must be power of 2)
														
 
															+ * @param buffer Temporary buffer of size n_out (optional)
														
 
															  */
														
 
															-inline void irfft(const Complex* input, float* output, int n_out) {
														
 
															+inline void irfft(const Complex* input, float* output, int n_out, std::vector<Complex>* buffer_ptr = nullptr) {
														
 
															     int n_freq = n_out / 2 + 1;
														
 
															-    // Reconstruct full spectrum (conjugate symmetry)
														
 
															-    std::vector<Complex> buffer(n_out);
														
 
															-    for (int i = 0; i < n_freq; ++i) {
														
 
															-        buffer[i] = input[i];
														
 
															-    }
														
 
															-    for (int i = n_freq; i < n_out; ++i) {
														
 
															-        buffer[i] = std::conj(buffer[n_out - i]);
														
 
															-    }
														
 
															-    
														
 
															-    // Compute inverse FFT
														
 
															-    fft_radix2(buffer.data(), n_out, true);
														
 
															-    
														
 
															-    // Extract real part
														
 
															-    for (int i = 0; i < n_out; ++i) {
														
 
															-        output[i] = buffer[i].real();
														
 
															+    if (buffer_ptr) {
														
 
															+        if (buffer_ptr->size() < static_cast<size_t>(n_out)) buffer_ptr->resize(n_out);
														
 
															+        for (int i = 0; i < n_freq; ++i) {
														
 
															+            (*buffer_ptr)[i] = input[i];
														
 
															+        }
														
 
															+         for (int i = n_freq; i < n_out; ++i) {
														
 
															+            (*buffer_ptr)[i] = std::conj((*buffer_ptr)[n_out - i]);
														
 
															+        }
														
 
															+        fft_radix2(buffer_ptr->data(), n_out, true);
														
 
															+        for (int i = 0; i < n_out; ++i) {
														
 
															+            output[i] = (*buffer_ptr)[i].real();
														
 
															+        }
														
 
															+    } else {
														
 
															+        std::vector<Complex> buffer(n_out);
														
 
															+        for (int i = 0; i < n_freq; ++i) {
														
 
															+            buffer[i] = input[i];
														
 
															+        }
														
 
															+        for (int i = n_freq; i < n_out; ++i) {
														
 
															+            buffer[i] = std::conj(buffer[n_out - i]);
														
 
															+        }
														
 
															+        
														
 
															+        fft_radix2(buffer.data(), n_out, true);
														
 
															+        
														
 
															+        for (int i = 0; i < n_out; ++i) {
														
 
															+            output[i] = buffer[i].real();
														
 
															+        }
														
 
															     }
														
 
															 }
														
@@ -224,42 +253,42 @@ inline void compute_stft(
 
															         std::memcpy(window_padded.data(), window, n_fft * sizeof(float));
														
 
															     }
														
 
															-    // Buffers
														
 
															-    std::vector<float> frame(n_fft);
														
 
															-    std::vector<Complex> fft_out(n_freq);
														
 
															+    // Pre-allocate thread-local buffers
														
 
															+    int max_threads = 1;
														
 
															+    #ifdef USE_OPENMP
														
 
															+    max_threads = omp_get_max_threads();
														
 
															+    #endif
														
 
															+    
														
 
															+    std::vector<std::vector<float>> thread_frames(max_threads, std::vector<float>(n_fft));
														
 
															+    std::vector<std::vector<Complex>> thread_fft_outs(max_threads, std::vector<Complex>(n_freq));
														
 
															+    std::vector<std::vector<Complex>> thread_fft_buffers(max_threads, std::vector<Complex>(n_fft));
														
 
															     // Process each frame
														
 
															     #ifdef USE_OPENMP
														
 
															     #pragma omp parallel for
														
 
															     #endif
														
 
															     for (int f = 0; f < n_frames; ++f) {
														
 
															-        int start = f * hop_length;
														
 
															+        int tid = 0;
														
 
															+        #ifdef USE_OPENMP
														
 
															+        tid = omp_get_thread_num();
														
 
															+        #endif
														
 
															-        // Extract and window frame
														
 
															-        // Need private buffer for frame and fft_out if logical threads share memory?
														
 
															-        // Wait, std::vector inside loop is local to block, so essentially thread-private?
														
 
															-        // YES. Variables declared inside the loop are private to the iteration/thread.
														
 
															+        std::vector<float>& frame = thread_frames[tid];
														
 
															-        // However, we need to be careful about allocating vectors inside a loop in parallel (heap contention).
														
 
															-        // It's better to allocate buffers per thread or use raw arrays.
														
 
															-        // For simplicity and since n_fft is small (2048), stack array or thread_local vector is better.
														
 
															-        // But std::vector inside parallel for is safe but might allocate.
														
 
															-        // n_fft=2048 float is 8KB. 
														
 
															-        std::vector<float> frame(n_fft); // Allocation!
														
 
															-        std::vector<Complex> fft_out(n_freq);
														
 
															+        int start = f * hop_length;
														
 
															         for (int i = 0; i < n_fft; ++i) {
														
 
															             frame[i] = padded[start + i] * window_padded[i];
														
 
															         }
														
 
															-        // Compute FFT
														
 
															-        rfft(frame.data(), fft_out.data(), n_fft);
														
 
															+        // Compute FFT using pre-allocated buffers
														
 
															+        rfft(frame.data(), thread_fft_outs[tid].data(), n_fft, &thread_fft_buffers[tid]);
														
 
															         // Store in output [n_freq, n_frames, 2] format
														
 
															         for (int k = 0; k < n_freq; ++k) {
														
 
															             // Note: Output layout is [Freq, Time, 2]
														
 
															-            output[(k * n_frames + f) * 2 + 0] = fft_out[k].real();
														
 
															-            output[(k * n_frames + f) * 2 + 1] = fft_out[k].imag();
														
 
															+            output[(k * n_frames + f) * 2 + 0] = thread_fft_outs[tid][k].real();
														
 
															+            output[(k * n_frames + f) * 2 + 1] = thread_fft_outs[tid][k].imag();
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -304,24 +333,30 @@ inline void compute_istft(
 
															         std::memcpy(window_padded.data(), window, n_fft * sizeof(float));
														
 
															     }
														
 
															-    // Overlap-add buffers
														
 
															-    // This is tricky for parallelization: race condition on y (overlap-add).
														
 
															-    // We CANNOT parallelize the write to 'y' easily without atomic float add (slow/hard) or reduction.
														
 
															-    // APPROACH:
														
 
															-    // 1. Parallel IFFT: Compute all frames' time-domain signals into a large buffer [n_frames, n_fft].
														
 
															-    // 2. Serial Overlap-Add: Add them up. (Overlap-add is O(N_Frames * N_FFT), same complexity, but memory bound).
														
 
															-    // Serial part might be fast enough if FFT is the heavy lifter.
														
 
															-    // FFT is O(N log N). Overlap add is O(N). FFT dominates.
														
 
															-    
														
 
															     // Step 1: Compute all IFFTs in parallel
														
 
															     std::vector<float> frames_time_domain(n_frames * n_fft);
														
 
															+    // Pre-allocate thread-local buffers
														
 
															+    int max_threads = 1;
														
 
															+    #ifdef USE_OPENMP
														
 
															+    max_threads = omp_get_max_threads();
														
 
															+    #endif
														
 
															+    
														
 
															+    std::vector<std::vector<Complex>> thread_fft_ins(max_threads, std::vector<Complex>(n_freq));
														
 
															+    std::vector<std::vector<float>> thread_frame_outs(max_threads, std::vector<float>(n_fft));
														
 
															+    std::vector<std::vector<Complex>> thread_fft_buffers(max_threads, std::vector<Complex>(n_fft));
														
 
															+    
														
 
															     #ifdef USE_OPENMP
														
 
															     #pragma omp parallel for
														
 
															     #endif
														
 
															     for (int f = 0; f < n_frames; ++f) {
														
 
															-        std::vector<Complex> fft_in(n_freq);
														
 
															-        std::vector<float> frame_out(n_fft);
														
 
															+        int tid = 0;
														
 
															+        #ifdef USE_OPENMP
														
 
															+        tid = omp_get_thread_num();
														
 
															+        #endif
														
 
															+        
														
 
															+        std::vector<Complex>& fft_in = thread_fft_ins[tid];
														
 
															+        std::vector<float>& frame_out = thread_frame_outs[tid];
														
 
															         // Extract complex spectrum
														
 
															         for (int k = 0; k < n_freq; ++k) {
														
@@ -331,7 +366,7 @@ inline void compute_istft(
 
															         }
														
 
															         // IFFT
														
 
															-        irfft(fft_in.data(), frame_out.data(), n_fft);
														
 
															+        irfft(fft_in.data(), frame_out.data(), n_fft, &thread_fft_buffers[tid]);
														
 
															         // Store
														
 
															         std::memcpy(&frames_time_domain[f * n_fft], frame_out.data(), n_fft * sizeof(float));