4 mēneši atpakaļ · e5db284b19
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 
				-build
			
 
				+build*
			
 
				 tests/data
			
--- a/scripts/generate_test_data.py
+++ b/scripts/generate_test_data.py
@@ -300,6 +300,29 @@ def generate_test_data(
 
				             return_complex=True,
			
 
				         )
			
 
				         stft_repr = torch.view_as_real(stft_repr)
			
 
				+
			
 
				+        # ===== CAPTURE: Raw STFT/ISTFT for C++ Verification =====
			
 
				+        # Unpack to [batch, channels, freq, time, 2]
			
 
				+        stft_raw_unpacked = unpack_one(
			
 
				+            stft_repr, batch_audio_channel_packed_shape, "* f t c"
			
 
				+        )
			
 
				+        captured["stft_raw"] = stft_raw_unpacked.clone()
			
 
				+
			
 
				+        # Compute ISTFT directly on this raw STFT (Identity check)
			
 
				+        stft_complex = torch.view_as_complex(stft_repr)
			
 
				+        istft_check = torch.istft(
			
 
				+            stft_complex,
			
 
				+            **model.stft_kwargs,
			
 
				+            window=stft_window,
			
 
				+            return_complex=False,
			
 
				+            length=istft_length,
			
 
				+        )
			
 
				+        istft_check_unpacked = unpack_one(
			
 
				+            istft_check, batch_audio_channel_packed_shape, "* t"
			
 
				+        )
			
 
				+        captured["istft_raw"] = istft_check_unpacked.clone()
			
 
				+        # ========================================================
			
 
				+
			
 
				         stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
			
 
				         stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
			
 
				 
			
--- a/src/inference.cpp
+++ b/src/inference.cpp
@@ -11,6 +11,10 @@
 
				 #include <ggml-backend.h>
			
 
				 #include <chrono>
			
 
				 #include <future>
			
 
				+#include <queue>
			
 
				+#include <thread>
			
 
				+#include <mutex>
			
 
				+#include <condition_variable>
			
 
				 
			
 
				 using Complex = std::complex<float>;
			
 
				 
			
@@ -386,6 +390,63 @@ std::vector<std::vector<float>> Inference::ProcessChunk(const std::vector<float>
 
				 // Pipelined Overlap-Add Logic
			
 
				 // =================================================================================================
			
 
				 
			
 
				+// =================================================================================================
			
 
				+// Thread Safe Queue
			
 
				+// =================================================================================================
			
 
				+
			
 
				/**
				 * Bounded, blocking FIFO shared between pipeline stages.
				 *
				 * - Push() blocks while the queue holds `max_size` items.
				 * - Pop() blocks while the queue is empty.
				 * - Shutdown() wakes every waiter. After shutdown, Push() rejects new items,
				 *   while Pop() keeps returning the items that are already queued and only
				 *   reports failure once the queue has been drained.
				 *
				 * All state is guarded by a single mutex; the two condition variables separate
				 * "space available" (producers) from "item available" (consumers).
				 */
				template <typename T>
				class ThreadSafeQueue {
				public:
				    /**
				     * @param max_size Maximum number of queued items. A value of 0 would make
				     *                 every Push() wait forever, so it is clamped to 1.
				     */
				    explicit ThreadSafeQueue(size_t max_size)
				        : max_size_(max_size == 0 ? 1 : max_size), shutdown_(false) {}

				    /// Wakes any thread still blocked in Push()/Pop() before the members go away.
				    ~ThreadSafeQueue() {
				        Shutdown();
				    }

				    // The mutex and condition variables make the queue non-copyable; say so explicitly.
				    ThreadSafeQueue(const ThreadSafeQueue&) = delete;
				    ThreadSafeQueue& operator=(const ThreadSafeQueue&) = delete;

				    /**
				     * Appends `item`, waiting for free space if the queue is full.
				     * @return true if the item was queued, false if the queue was shut down
				     *         (the item is discarded in that case).
				     */
				    bool Push(T item) {
				        {
				            std::unique_lock<std::mutex> lock(mutex_);
				            cv_push_.wait(lock, [this] { return queue_.size() < max_size_ || shutdown_; });
				            if (shutdown_) return false;
				            queue_.push(std::move(item));
				        }
				        // Notify after releasing the lock so the woken consumer does not
				        // immediately block on the mutex we still hold.
				        cv_pop_.notify_one();
				        return true;
				    }

				    /**
				     * Removes the oldest item into `item`, waiting if the queue is empty.
				     * @return true if an item was delivered, false if the queue is shut down
				     *         and empty (`item` is left untouched).
				     */
				    bool Pop(T& item) {
				        {
				            std::unique_lock<std::mutex> lock(mutex_);
				            cv_pop_.wait(lock, [this] { return !queue_.empty() || shutdown_; });
				            // The wait only ends with an empty queue when shutdown was requested.
				            if (queue_.empty()) return false;
				            item = std::move(queue_.front());
				            queue_.pop();
				        }
				        cv_push_.notify_one();
				        return true;
				    }

				    /// Marks the queue as closed and wakes every blocked producer and consumer.
				    void Shutdown() {
				        {
				            std::lock_guard<std::mutex> lock(mutex_);
				            shutdown_ = true;
				        }
				        cv_push_.notify_all();
				        cv_pop_.notify_all();
				    }

				private:
				    std::queue<T> queue_;
				    size_t max_size_;   // Capacity bound, always >= 1.
				    bool shutdown_;     // Guarded by mutex_.
				    std::mutex mutex_;
				    std::condition_variable cv_push_;  // Signalled when space becomes available.
				    std::condition_variable cv_pop_;   // Signalled when an item becomes available.
				};
			
 
				+
			
 
				+// =================================================================================================
			
 
				+// Pipelined Overlap-Add Logic
			
 
				+// =================================================================================================
			
 
				+
			
 
				+// =================================================================================================
			
 
				+// Pipelined Overlap-Add Logic (Optimized 3-Stage)
			
 
				+// =================================================================================================
			
 
				+
			
 
				 std::vector<std::vector<float>> Inference::ProcessOverlapAddPipelined(const std::vector<float>& input_audio, 
			
 
				                                                          int chunk_size, 
			
 
				                                                          int num_overlap,
			
@@ -443,6 +504,7 @@ std::vector<std::vector<float>> Inference::ProcessOverlapAddPipelined(const std:
 
				     std::vector<std::vector<float>> result; // [stems][samples]
			
 
				     std::vector<float> counter(n_padded_samples * channels, 0.0f);
			
 
				     std::vector<float> window_base = GetWindow(chunk_size, fade_size);
			
 
				+    std::mutex result_mutex; // Protects 'result' and 'counter'
			
 
				     
			
 
				     // lambda to extract chunk 'i'
			
 
				     auto extract_chunk = [&](int i) -> std::vector<float> {
			
@@ -476,11 +538,14 @@ std::vector<std::vector<float>> Inference::ProcessOverlapAddPipelined(const std:
 
				     };
			
 
				 
			
 
				     // lambda to accumulate result 'state' at offset 'i'
			
 
				+    // Now protected by mutex
			
 
				     auto accumulate_result = [&](std::shared_ptr<ChunkState> state, int i) {
			
 
				         if (!state) return;
			
 
				-        const std::vector<std::vector<float>>& chunk_out_stems = state->final_audio; // Now [stems][samples]
			
 
				+        const std::vector<std::vector<float>>& chunk_out_stems = state->final_audio;
			
 
				         if (chunk_out_stems.empty()) return;
			
 
				         
			
 
				+        std::lock_guard<std::mutex> lock(result_mutex);
			
 
				+
			
 
				         // Lazy Initialize result
			
 
				         if (result.empty()) {
			
 
				             int num_stems = chunk_out_stems.size();
			
@@ -505,9 +570,9 @@ std::vector<std::vector<float>> Inference::ProcessOverlapAddPipelined(const std:
 
				             
			
 
				             for (int s = 0; s < num_stems; ++s) {
			
 
				                  if (s >= chunk_out_stems.size()) continue;
			
 
				-                 const auto& stem_chunk = chunk_out_stems[s];
			
 
				-                 result[s][res_idx + 0] += stem_chunk[chk_idx + 0] * w;
			
 
				-                 result[s][res_idx + 1] += stem_chunk[chk_idx + 1] * w;
			
 
				+                 // result[s] is huge, but we access linearly in this block
			
 
				+                 result[s][res_idx + 0] += chunk_out_stems[s][chk_idx + 0] * w;
			
 
				+                 result[s][res_idx + 1] += chunk_out_stems[s][chk_idx + 1] * w;
			
 
				             }
			
 
				             
			
 
				             // Counter is same for all stems, just update once
			
@@ -516,92 +581,69 @@ std::vector<std::vector<float>> Inference::ProcessOverlapAddPipelined(const std:
 
				         }
			
 
				     };
			
 
				 
			
 
				-    // ==========================================================
			
 
				-    // Pipeline Loop
			
 
				-    // ==========================================================
			
 
				-    
			
 
				-    // Future for the NEXT chunk's preprocessing
			
 
				-    std::future<std::shared_ptr<ChunkState>> next_prep_future;
			
 
				-    
			
 
				-    // Future for the PREVIOUS chunk's postprocessing
			
 
				-    std::future<void> prev_post_future;
			
 
				+    // =================================================================================================
			
 
				+    // 3-Stage Pipeline
			
 
				+    // =================================================================================================
			
 
				     
			
 
				-    std::shared_ptr<ChunkState> prev_state = nullptr;
			
 
				+    // Queues
			
 
				+    // Bounded size to prevent running out of memory
			
 
				+    // 3 items buffer is enough to keep GPU busy
			
 
				+    ThreadSafeQueue<std::shared_ptr<ChunkState>> input_queue(3);
			
 
				+    ThreadSafeQueue<std::shared_ptr<ChunkState>> output_queue(3);
			
 
				     
			
 
				-    int i = 0;
			
 
				-    int current_offset = 0;
			
 
				-    
			
 
				-    // Bootstrap: Start PreProcessing first chunk
			
 
				-    {
			
 
				-        std::vector<float> chunk0 = extract_chunk(0);
			
 
				-        // Async launch
			
 
				-        next_prep_future = std::async(std::launch::async, 
			
 
				-            [this](std::vector<float> c, int id) { return this->PreProcessChunk(c, id); }, 
			
 
				-            std::move(chunk0), 0);
			
 
				-    }
			
 
				+    // Structure to hold chunk metadata together
			
 
				+    struct ChunkTask {
			
 
				+        int offset;
			
 
				+        std::shared_ptr<ChunkState> state;
			
 
				+    };
			
 
				     
			
 
				-    while (current_offset < n_padded_samples) {
			
 
				-        // 1. Wait for PRE-processing of CURRENT chunk
			
 
				-        if (next_prep_future.valid()) {
			
 
				-            // This blocks until STFT is done.
			
 
				-            // In steady state, this should be ready or nearly ready while GPU was busy.
			
 
				-        }
			
 
				-        auto current_state = next_prep_future.get();
			
 
				-        
			
 
				-        // 2. Start PRE-processing of NEXT chunk (if exists)
			
 
				-        int next_offset = current_offset + step;
			
 
				-        if (next_offset < n_padded_samples) {
			
 
				-             std::vector<float> chunk_next = extract_chunk(next_offset);
			
 
				-             next_prep_future = std::async(std::launch::async, 
			
 
				-                [this](std::vector<float> c, int id) { return this->PreProcessChunk(c, id); }, 
			
 
				-                std::move(chunk_next), next_offset);
			
 
				-        } else {
			
 
				-            // No more next chunks
			
 
				-        }
			
 
				-        
			
 
				-        // 3. Run Inference on CURRENT chunk (GPU Sync)
			
 
				-        // This blocks heavily.
			
 
				-        RunInference(current_state);
			
 
				-        
			
 
				-        // 4. Wait for POST-processing of PREVIOUS chunk
			
 
				-        if (prev_post_future.valid()) {
			
 
				-            prev_post_future.get();
			
 
				+    // 1. Preprocessor Thread
			
 
				+    auto preproccessor = std::thread([&]() {
			
 
				+        int current_offset = 0;
			
 
				+        while (current_offset < n_padded_samples) {
			
 
				+            std::vector<float> chunk = extract_chunk(current_offset);
			
 
				+            
			
 
				+            auto state = PreProcessChunk(chunk, current_offset); 
			
 
				+            
			
 
				+            input_queue.Push(state);
			
 
				+            current_offset += step;
			
 
				+        }
			
 
				+        input_queue.Shutdown();
			
 
				+    });
			
 
				+    
			
 
				+    // 3. Postprocessor Thread
			
 
				+    auto postprocessor = std::thread([&]() {
			
 
				+        std::shared_ptr<ChunkState> state;
			
 
				+        while (output_queue.Pop(state)) {
			
 
				+            // This does ISTFT (CPU intensive)
			
 
				+            PostProcessChunk(state);
			
 
				+            
			
 
				+            // Accumulate (Memory bandwidth intensive + Mutex)
			
 
				+            accumulate_result(state, state->id); // state->id holds offset
			
 
				+            
			
 
				+            if (progress_callback) {
			
 
				+                float progress = (float)std::min(state->id + step, n_padded_samples) / n_padded_samples;
			
 
				+                progress_callback(progress);
			
 
				+            }
			
 
				         }
			
 
				+    });
			
 
				+    
			
 
				+    // 2. Main Thread (Inference Loop)
			
 
				+    std::shared_ptr<ChunkState> state;
			
 
				+    while (true) {
			
 
				+        bool ok = input_queue.Pop(state);
			
 
				+        if (!ok) break; // Input queue shutdown and empty
			
 
				         
			
 
				-        // 5. Accumulate PREVIOUS chunk result (Serial, fast)
			
 
				-        // Note: PostProcessChunk fills 'final_audio', but doesn't accumulate to 'result'.
			
 
				-        // We do accumulation here on main thread to avoid races on 'result' buffer.
			
 
				-        if (prev_state) {
			
 
				-            int prev_offset = current_offset - step;
			
 
				-            accumulate_result(prev_state, prev_offset);
			
 
				-            prev_state = nullptr; // Free memory
			
 
				-        }
			
 
				+        // This does GGML Inference (GPU intensive, Blocking)
			
 
				+        RunInference(state);
			
 
				         
			
 
				-        // 6. Start POST-processing of CURRENT chunk
			
 
				-        prev_state = current_state;
			
 
				-        // Use shared_ptr copy
			
 
				-        prev_post_future = std::async(std::launch::async, 
			
 
				-            [this](std::shared_ptr<ChunkState> s) { this->PostProcessChunk(s); }, 
			
 
				-            prev_state);
			
 
				-            
			
 
				-        // Advance
			
 
				-        current_offset += step;
			
 
				-
			
 
				-        if (progress_callback) {
			
 
				-            float progress = (float)std::min(current_offset, n_padded_samples) / n_padded_samples;
			
 
				-            progress_callback(progress);
			
 
				-        }
			
 
				+        output_queue.Push(state);
			
 
				     }
			
 
				     
			
 
				-    // Drain Pipeline
			
 
				-    // Wait for last post-process
			
 
				-    if (prev_post_future.valid()) {
			
 
				-        prev_post_future.get();
			
 
				-    }
			
 
				-    if (prev_state) {
			
 
				-        int prev_offset = current_offset - step;
			
 
				-        accumulate_result(prev_state, prev_offset);
			
 
				-    }
			
 
				+    // Wait for threads
			
 
				+    output_queue.Shutdown();
			
 
				+    if (preproccessor.joinable()) preproccessor.join();
			
 
				+    if (postprocessor.joinable()) postprocessor.join();
			
 
				     
			
 
				     // Normalize and Crop
			
 
				     // result is [stems][samples]
			
--- a/src/stft.h
+++ b/src/stft.h
@@ -1,22 +1,22 @@
 
				 #pragma once
			
 
				 /**
			
 
				- * stft.h - STFT/ISTFT implementation
			
 
				+ * stft.h - STFT/ISTFT implementation (Optimized)
			
 
				  * 
			
 
				  * Implements:
			
 
				- * - Hann window generation
			
 
				+ * - Table-based Hann window generation
			
 
				+ * - Table-based Radix-2 FFT (Twiddle factors & Bit-reversal)
			
 
				+ * - Reusable per-thread scratch buffers (STFTBuffer)
			
 
				  * - Center padding (reflect mode)
			
 
				  * - Frame extraction
			
 
				- * - Radix-2 Cooley-Tukey FFT
			
 
				- * - Real-to-complex FFT (rfft)
			
 
				- * - Inverse FFT (irfft)
			
 
				- * - Full STFT/ISTFT matching torch.stft/torch.istft
			
 
				  */
			
 
				 
			
 
				 #include <cmath>
			
 
				 #include <vector>
			
 
				 #include <complex>
			
 
				 #include <cstring>
			
 
				-#include <algorithm> // for std::swap
			
 
				+#include <algorithm>
			
 
				+#include <memory>
			
 
				+#include <mutex>
			
 
				 
			
 
				 #ifdef USE_OPENMP
			
 
				 #include <omp.h>
			
@@ -28,19 +28,44 @@
 
				 
			
 
				 namespace stft {
			
 
				 
			
 
				-// Complex number type
			
 
				 using Complex = std::complex<float>;
			
 
				 
			
 
				 //=============================================================================
			
 
				-// Window Functions
			
 
				+// Memory Pooling
			
 
				 //=============================================================================
			
 
				 
			
 
				 /**
			
 
				- * Generate Hann window matching torch.hann_window()
			
 
				- * PyTorch uses periodic=True by default for STFT compatibility
			
 
				- * Periodic formula: 0.5 * (1 - cos(2*pi*n / N))
			
 
				- * Symmetric formula: 0.5 * (1 - cos(2*pi*n / (N-1)))
			
 
				+ * Reusable scratch buffers (the caller creates one per worker thread) to avoid frequent allocations in STFT/ISTFT loops.
			
 
				  */
			
 
				struct STFTBuffer {
				    // FFT buffers (n_fft complex values each after Resize)
				    std::vector<Complex> fft_in;
				    std::vector<Complex> fft_out;
				    std::vector<Complex> fft_scratch;

				    // Frame buffers (n_fft samples each after Resize)
				    std::vector<float> frame_in;
				    std::vector<float> frame_out;

				    // Window buffers
				    std::vector<float> window_padded;  // n_fft samples after Resize
				    std::vector<float> padded_audio;   // grow-only, see Resize

				    /**
				     * Sizes every per-frame buffer to exactly `n_fft` elements and grows
				     * `padded_audio` to at least `padded_len` elements.
				     *
				     * @param n_fft      FFT size; negative values are treated as 0 instead of
				     *                   wrapping around to a huge unsigned size.
				     * @param padded_len Minimum size of `padded_audio`; values <= 0 leave it
				     *                   untouched. The buffer is never shrunk.
				     */
				    void Resize(int n_fft, int padded_len = 0) {
				        const size_t fft_len = n_fft > 0 ? static_cast<size_t>(n_fft) : 0;

				        // vector::resize is already a no-op when the size matches.
				        fft_in.resize(fft_len);
				        fft_out.resize(fft_len);
				        fft_scratch.resize(fft_len);
				        frame_in.resize(fft_len);
				        frame_out.resize(fft_len);
				        window_padded.resize(fft_len);

				        if (padded_len > 0) {
				            const size_t wanted = static_cast<size_t>(padded_len);
				            if (padded_audio.size() < wanted) padded_audio.resize(wanted);
				        }
				    }
				};
			
 
				+
			
 
				+//=============================================================================
			
 
				+// Window Functions
			
 
				+//=============================================================================
			
 
				+
			
 
				 inline void hann_window(float* out, int size, bool periodic = true) {
			
 
				     int divisor = periodic ? size : (size - 1);
			
 
				     for (int i = 0; i < size; ++i) {
			
@@ -49,153 +74,151 @@ inline void hann_window(float* out, int size, bool periodic = true) {
 
				 }
			
 
				 
			
 
				 //=============================================================================
			
 
				-// FFT Implementation (Cooley-Tukey Radix-2)
			
 
				+// FFT Implementation (Table-based Cooley-Tukey Radix-2)
			
 
				 //=============================================================================
			
 
				 
			
 
				-/**
			
 
				- * Bit-reversal permutation for radix-2 FFT
			
 
				- */
			
 
				-inline void bit_reverse(Complex* data, int n) {
			
 
				-    int j = 0;
			
 
				-    for (int i = 0; i < n - 1; ++i) {
			
 
				-        if (i < j) {
			
 
				-            std::swap(data[i], data[j]);
			
 
				-        }
			
 
				-        int m = n >> 1;
			
 
				-        while (j >= m && m > 0) {
			
 
				-            j -= m;
			
 
				-            m >>= 1;
			
 
				/**
				 * In-place radix-2 Cooley-Tukey FFT with precomputed bit-reversal and twiddle tables.
				 *
				 * The transform size is fixed at construction. The butterfly schedule is radix-2,
				 * so `n` is expected to be a power of two; other sizes are not validated here.
				 * Forward()/Inverse() are const and only read the tables, so one instance can be
				 * used from several threads as long as each thread passes its own data buffer.
				 */
				class TableFFT {
				public:
				    /**
				     * Returns the shared instance for `n_fft`, creating it on first use.
				     *
				     * Instances are cached per size and never destroyed or replaced, so a returned
				     * reference stays valid even when other threads request a different size.
				     * The cache grows by one entry per distinct size requested.
				     */
				    static TableFFT& GetInstance(int n_fft) {
				        static std::mutex mtx;
				        static std::vector<std::unique_ptr<TableFFT>> cache;

				        std::lock_guard<std::mutex> lock(mtx);
				        // Only a handful of sizes are expected, so a linear scan is sufficient.
				        for (const auto& entry : cache) {
				            if (entry->n_ == n_fft) return *entry;
				        }
				        cache.push_back(std::make_unique<TableFFT>(n_fft));
				        return *cache.back();
				    }

				    /// Builds the lookup tables for transforms of size `n`.
				    TableFFT(int n) : n_(n) {
				        Precomputetables();
				    }

				    /// Forward DFT of `data[0..n)`, in place, unnormalized.
				    void Forward(Complex* data) const {
				        BitReverse(data);
				        Compute(data, false);
				    }

				    /// Inverse DFT of `data[0..n)`, in place, scaled by 1/n.
				    void Inverse(Complex* data) const {
				        if (n_ <= 0) return;  // Nothing to transform; also avoids dividing by zero.

				        BitReverse(data);
				        Compute(data, true);

				        // Normalize
				        float inv_n = 1.0f / n_;
				        for (int i = 0; i < n_; ++i) {
				            data[i] *= inv_n;
				        }
				    }

				private:
				    int n_;
				    std::vector<int> bit_reverse_indices_;  // Swap partner for index i (or i itself).
				    std::vector<Complex> twiddles_fwd_;     // exp(-2*pi*i*k/n), k = 0..n/2-1
				    std::vector<Complex> twiddles_inv_;     // Complex conjugates of twiddles_fwd_.

				    void Precomputetables() {
				        // A non-positive size has no tables; this also keeps the n_ - 1 index below valid.
				        if (n_ <= 0) return;

				        // 1. Bit Reverse
				        // j tracks the bit-reversed counterpart of i. Only the larger partner of
				        // each pair is recorded so BitReverse() swaps every pair exactly once.
				        bit_reverse_indices_.resize(n_);
				        int j = 0;
				        for (int i = 0; i < n_ - 1; ++i) {
				            bit_reverse_indices_[i] = (i < j) ? j : i; // Store swap target
				            int m = n_ >> 1;
				            while (j >= m && m > 0) {
				                j -= m;
				                m >>= 1;
				            }
				            j += m;
				        }
				        bit_reverse_indices_[n_ - 1] = n_ - 1;

				        // 2. Twiddles
				        // Store W_N^k for k = 0..N/2-1. A stage of length `len` then reads every
				        // (N/len)-th entry, so one table serves all stages.
				        twiddles_fwd_.resize(n_ / 2);
				        twiddles_inv_.resize(n_ / 2);

				        // Same value as the M_PI macro, without depending on that non-standard macro.
				        const float pi = static_cast<float>(3.14159265358979323846);
				        for (int k = 0; k < n_ / 2; ++k) {
				            float angle = -2.0f * pi * k / n_;
				            twiddles_fwd_[k] = Complex(std::cos(angle), std::sin(angle));
				            twiddles_inv_[k] = std::conj(twiddles_fwd_[k]);
				        }
				    }

				    /// Applies the bit-reversal permutation using the precomputed swap table.
				    void BitReverse(Complex* data) const {
				        for (int i = 0; i < n_; ++i) {
				            int j = bit_reverse_indices_[i];
				            if (i < j) {
				                std::swap(data[i], data[j]);
				            }
				        }
				    }

				    /// Runs the butterfly stages (len = 2, 4, ..., n) on bit-reversed input.
				    void Compute(Complex* data, bool inverse) const {
				        const auto& twiddles = inverse ? twiddles_inv_ : twiddles_fwd_;

				        for (int len = 2; len <= n_; len <<= 1) {
				            int half_len = len >> 1;
				            int step = n_ / len;  // Stride into the size-n twiddle table for this stage.

				            for (int i = 0; i < n_; i += len) {
				                for (int j = 0; j < half_len; ++j) {
				                    Complex w = twiddles[j * step];
				                    Complex u = data[i + j];
				                    Complex t = w * data[i + j + half_len];
				                    data[i + j] = u + t;
				                    data[i + j + half_len] = u - t;
				                }
				            }
				        }
				    }
				};
			
 
				+
			
 
				+
			
 
				+//=============================================================================
			
 
				+// STFT Wrapper (Optimized)
			
 
				+//=============================================================================
			
 
				+
			
 
				+inline void rfft(const float* input, Complex* output, int n, STFTBuffer& buffer) {
			
 
				+    // 1. Copy to complex buffer
			
 
				+    for (int i = 0; i < n; ++i) {
			
 
				+        buffer.fft_scratch[i] = Complex(input[i], 0.0f);
			
 
				+    }
			
 
				+    
			
 
				+    // 2. FFT
			
 
				+    TableFFT::GetInstance(n).Forward(buffer.fft_scratch.data());
			
 
				+    
			
 
				+    // 3. Copy first N/2 + 1
			
 
				+    int n_out = n / 2 + 1;
			
 
				+    for (int i = 0; i < n_out; ++i) {
			
 
				+        output[i] = buffer.fft_scratch[i];
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * Complex-to-real inverse FFT (irfft) matching torch.fft.irfft
			
 
				- * @param input Complex input array of size n/2+1
			
 
				- * @param output Real output array of size n
			
 
				- * @param n_out Size of output (must be power of 2)
			
 
				- * @param buffer Temporary buffer of size n_out (optional)
			
 
				- */
			
 
				-inline void irfft(const Complex* input, float* output, int n_out, std::vector<Complex>* buffer_ptr = nullptr) {
			
 
				+inline void irfft(const Complex* input, float* output, int n_out, STFTBuffer& buffer) {
			
 
				     int n_freq = n_out / 2 + 1;
			
 
				     
			
 
				-    if (buffer_ptr) {
			
 
				-        if (buffer_ptr->size() < static_cast<size_t>(n_out)) buffer_ptr->resize(n_out);
			
 
				-        for (int i = 0; i < n_freq; ++i) {
			
 
				-            (*buffer_ptr)[i] = input[i];
			
 
				-        }
			
 
				-         for (int i = n_freq; i < n_out; ++i) {
			
 
				-            (*buffer_ptr)[i] = std::conj((*buffer_ptr)[n_out - i]);
			
 
				-        }
			
 
				-        fft_radix2(buffer_ptr->data(), n_out, true);
			
 
				-        for (int i = 0; i < n_out; ++i) {
			
 
				-            output[i] = (*buffer_ptr)[i].real();
			
 
				-        }
			
 
				-    } else {
			
 
				-        std::vector<Complex> buffer(n_out);
			
 
				-        for (int i = 0; i < n_freq; ++i) {
			
 
				-            buffer[i] = input[i];
			
 
				-        }
			
 
				-        for (int i = n_freq; i < n_out; ++i) {
			
 
				-            buffer[i] = std::conj(buffer[n_out - i]);
			
 
				-        }
			
 
				-        
			
 
				-        fft_radix2(buffer.data(), n_out, true);
			
 
				-        
			
 
				-        for (int i = 0; i < n_out; ++i) {
			
 
				-            output[i] = buffer[i].real();
			
 
				-        }
			
 
				+    // 1. Reconstruct full spectrum
			
 
				+    for (int i = 0; i < n_freq; ++i) {
			
 
				+        buffer.fft_scratch[i] = input[i];
			
 
				+    }
			
 
				+    for (int i = n_freq; i < n_out; ++i) {
			
 
				+        buffer.fft_scratch[i] = std::conj(buffer.fft_scratch[n_out - i]);
			
 
				+    }
			
 
				+    
			
 
				+    // 2. IFFT
			
 
				+    TableFFT::GetInstance(n_out).Inverse(buffer.fft_scratch.data());
			
 
				+    
			
 
				+    // 3. Real part
			
 
				+    for (int i = 0; i < n_out; ++i) {
			
 
				+        output[i] = buffer.fft_scratch[i].real();
			
 
				     }
			
 
				 }
			
 
				 
			
 
				-//=============================================================================
			
 
				-// STFT Implementation
			
 
				-//=============================================================================
			
 
				-
			
 
				-/**
			
 
				- * Short-Time Fourier Transform matching torch.stft
			
 
				- * 
			
 
				- * @param audio Input audio [n_samples]
			
 
				- * @param n_samples Number of samples
			
 
				- * @param n_fft FFT size
			
 
				- * @param hop_length Hop between frames
			
 
				- * @param win_length Window length
			
 
				- * @param window Window function [win_length]
			
 
				- * @param center If true, pad signal on both sides
			
 
				- * @param output Output complex spectrogram [n_freq, n_frames, 2] (real, imag pairs)
			
 
				- * @param n_frames_out Output parameter: number of frames
			
 
				- */
			
 
				 inline void compute_stft(
			
 
				     const float* audio,
			
 
				     int n_samples,
			
@@ -211,21 +234,33 @@ inline void compute_stft(
 
				     int pad_amount = center ? n_fft / 2 : 0;
			
 
				     int padded_len = n_samples + 2 * pad_amount;
			
 
				     
			
 
				-    std::vector<float> padded(padded_len);
			
 
				+    // Calculate number of frames
			
 
				+    // PyTorch formula: (L - N) / H + 1
			
 
				+    int n_frames = 1 + (padded_len - n_fft) / hop_length;
			
 
				+    if (n_frames < 0) n_frames = 0;
			
 
				+    *n_frames_out = n_frames;
			
 
				+    
			
 
				+    // Prepare the padded copy of the input. It holds the WHOLE signal, so it is allocated
			
 
				+    // once per call on the heap rather than inside the per-thread scratch buffers:
			
 
				+    // duplicating a full-length copy of the audio for every worker thread would be wasteful.
			
 
				     
			
 
				+    std::vector<float> padded(padded_len);
			
 
				     if (center) {
			
 
				         // Reflect padding
			
 
				-        // Left pad (reflect)
			
 
				         for (int i = 0; i < pad_amount; ++i) {
			
 
				             int src_idx = pad_amount - i;
			
 
				             if (src_idx >= n_samples) src_idx = n_samples - 1;
			
 
				             padded[i] = audio[src_idx];
			
 
				         }
			
 
				-        // Center (copy)
			
 
				         if (n_samples > 0) {
			
 
				             std::memcpy(padded.data() + pad_amount, audio, n_samples * sizeof(float));
			
 
				         }
			
 
				-        // Right pad (reflect)
			
 
				         for (int i = 0; i < pad_amount; ++i) {
			
 
				             int src_idx = n_samples - 2 - i;
			
 
				             if (src_idx < 0) src_idx = 0;
			
@@ -234,17 +269,10 @@ inline void compute_stft(
 
				     } else {
			
 
				         std::memcpy(padded.data(), audio, n_samples * sizeof(float));
			
 
				     }
			
 
				-    
			
 
				-    // Calculate number of frames
			
 
				-    // PyTorch formula: (L - N) / H + 1
			
 
				-    int n_frames = 1 + (padded_len - n_fft) / hop_length;
			
 
				-    if (n_frames < 0) n_frames = 0;
			
 
				-    *n_frames_out = n_frames;
			
 
				-    
			
 
				-    // Number of output frequency bins
			
 
				+
			
 
				     int n_freq = n_fft / 2 + 1;
			
 
				     
			
 
				-    // Prepare padded window if win_length < n_fft
			
 
				+    // Prepare window (Single copy)
			
 
				     std::vector<float> window_padded(n_fft, 0.0f);
			
 
				     if (win_length < n_fft) {
			
 
				         int left = (n_fft - win_length) / 2;
			
@@ -253,16 +281,14 @@ inline void compute_stft(
 
				         std::memcpy(window_padded.data(), window, n_fft * sizeof(float));
			
 
				     }
			
 
				     
			
 
				-    // Pre-allocate thread-local buffers
			
 
				+    // Prepare thread buffers
			
 
				     int max_threads = 1;
			
 
				     #ifdef USE_OPENMP
			
 
				     max_threads = omp_get_max_threads();
			
 
				     #endif
			
 
				-    
			
 
				-    std::vector<std::vector<float>> thread_frames(max_threads, std::vector<float>(n_fft));
			
 
				-    std::vector<std::vector<Complex>> thread_fft_outs(max_threads, std::vector<Complex>(n_freq));
			
 
				-    std::vector<std::vector<Complex>> thread_fft_buffers(max_threads, std::vector<Complex>(n_fft));
			
 
				-    
			
 
				+    std::vector<STFTBuffer> thread_buffers(max_threads);
			
 
				+    for(auto& buf : thread_buffers) buf.Resize(n_fft);
			
 
				+
			
 
				     // Process each frame
			
 
				     #ifdef USE_OPENMP
			
 
				     #pragma omp parallel for
			
@@ -272,41 +298,29 @@ inline void compute_stft(
 
				         #ifdef USE_OPENMP
			
 
				         tid = omp_get_thread_num();
			
 
				         #endif
			
 
				-        
			
 
				-        std::vector<float>& frame = thread_frames[tid];
			
 
				-        
			
 
				+        STFTBuffer& buffer = thread_buffers[tid];
			
 
				+
			
 
				+        std::vector<float>& frame = buffer.frame_in;
			
 
				         int start = f * hop_length;
			
 
				         
			
 
				         for (int i = 0; i < n_fft; ++i) {
			
 
				             frame[i] = padded[start + i] * window_padded[i];
			
 
				         }
			
 
				         
			
 
				-        // Compute FFT using pre-allocated buffers
			
 
				-        rfft(frame.data(), thread_fft_outs[tid].data(), n_fft, &thread_fft_buffers[tid]);
			
 
				+        // Compute FFT into this thread's scratch buffer (buffer.fft_out).
			
 
				+        // The complex bins are staged there first; the loop below then copies
			
 
				+        // them into output[(k * n_frames + f) * 2 + {0, 1}].
			
 
				+        
			
 
				+        rfft(frame.data(), buffer.fft_out.data(), n_fft, buffer);
			
 
				         
			
 
				-        // Store in output [n_freq, n_frames, 2] format
			
 
				+        // Write to output
			
 
				         for (int k = 0; k < n_freq; ++k) {
			
 
				-            // Note: Output layout is [Freq, Time, 2]
			
 
				-            output[(k * n_frames + f) * 2 + 0] = thread_fft_outs[tid][k].real();
			
 
				-            output[(k * n_frames + f) * 2 + 1] = thread_fft_outs[tid][k].imag();
			
 
				+            output[(k * n_frames + f) * 2 + 0] = buffer.fft_out[k].real();
			
 
				+            output[(k * n_frames + f) * 2 + 1] = buffer.fft_out[k].imag();
			
 
				         }
			
 
				     }
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				- * Inverse Short-Time Fourier Transform matching torch.istft
			
 
				- * 
			
 
				- * @param stft_data Input complex spectrogram [n_freq, n_frames, 2]
			
 
				- * @param n_freq Number of frequency bins
			
 
				- * @param n_frames Number of frames
			
 
				- * @param n_fft FFT size
			
 
				- * @param hop_length Hop between frames
			
 
				- * @param win_length Window length
			
 
				- * @param window Window function [win_length]
			
 
				- * @param center If true, signal was centered
			
 
				- * @param length Expected output length (or 0 for auto)
			
 
				- * @param output Output audio
			
 
				- */
			
 
				 inline void compute_istft(
			
 
				     const float* stft_data,
			
 
				     int n_freq,
			
@@ -333,18 +347,16 @@ inline void compute_istft(
 
				         std::memcpy(window_padded.data(), window, n_fft * sizeof(float));
			
 
				     }
			
 
				     
			
 
				-    // Step 1: Compute all IFFTs in parallel
			
 
				-    std::vector<float> frames_time_domain(n_frames * n_fft);
			
 
				-    
			
 
				-    // Pre-allocate thread-local buffers
			
 
				+    // Prepare thread buffers
			
 
				     int max_threads = 1;
			
 
				     #ifdef USE_OPENMP
			
 
				     max_threads = omp_get_max_threads();
			
 
				     #endif
			
 
				-    
			
 
				-    std::vector<std::vector<Complex>> thread_fft_ins(max_threads, std::vector<Complex>(n_freq));
			
 
				-    std::vector<std::vector<float>> thread_frame_outs(max_threads, std::vector<float>(n_fft));
			
 
				-    std::vector<std::vector<Complex>> thread_fft_buffers(max_threads, std::vector<Complex>(n_fft));
			
 
				+    std::vector<STFTBuffer> thread_buffers(max_threads);
			
 
				+    for(auto& buf : thread_buffers) buf.Resize(n_fft);
			
 
				+
			
 
				+    // Step 1: Compute all IFFTs in parallel
			
 
				+    std::vector<float> frames_time_domain(n_frames * n_fft);
			
 
				     
			
 
				     #ifdef USE_OPENMP
			
 
				     #pragma omp parallel for
			
@@ -354,9 +366,10 @@ inline void compute_istft(
 
				         #ifdef USE_OPENMP
			
 
				         tid = omp_get_thread_num();
			
 
				         #endif
			
 
				+        STFTBuffer& buffer = thread_buffers[tid];
			
 
				         
			
 
				-        std::vector<Complex>& fft_in = thread_fft_ins[tid];
			
 
				-        std::vector<float>& frame_out = thread_frame_outs[tid];
			
 
				+        std::vector<Complex>& fft_in = buffer.fft_in;
			
 
				+        std::vector<float>& frame_out = buffer.frame_out;
			
 
				         
			
 
				         // Extract complex spectrum
			
 
				         for (int k = 0; k < n_freq; ++k) {
			
@@ -366,7 +379,7 @@ inline void compute_istft(
 
				         }
			
 
				         
			
 
				         // IFFT
			
 
				-        irfft(fft_in.data(), frame_out.data(), n_fft, &thread_fft_buffers[tid]);
			
 
				+        irfft(fft_in.data(), frame_out.data(), n_fft, buffer);
			
 
				         
			
 
				         // Store
			
 
				         std::memcpy(&frames_time_domain[f * n_fft], frame_out.data(), n_fft * sizeof(float));
			
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -45,3 +45,4 @@ mbr_add_test(test_component_mask)
 
				 # Integration tests
			
 
				 mbr_add_test(test_inference)
			
 
				 mbr_add_test(test_chunking_logic)
			
 
				+mbr_add_test(test_stft_consistency)
			
--- a/tests/test_stft_consistency.cpp
+++ b/tests/test_stft_consistency.cpp
@@ -0,0 +1,145 @@
 
				+#include "test_common.h"
			
 
				+#include "../src/stft.h"
			
 
				+#include "../src/model.h"
			
 
				+
			
 
				+/**
				+ * Golden-data consistency test for stft::compute_stft and stft::compute_istft.
				+ *
				+ * Reads n_fft / hop_length / win_length from the model file, loads the
				+ * "input_audio", "stft_raw" and "istft_raw" tensors from the test data
				+ * directory, and compares the C++ results per (batch, channel) against them
				+ * with CompareAndReport.
				+ *
				+ * @return 0 when every comparison passed; 1 when the model cannot be loaded,
				+ *         the parameters or tensor shapes are unusable, or any comparison failed.
				+ */
				+int main(int argc, char** argv) {
				+    (void)argc;  // command-line arguments are not used
				+    (void)argv;
				+
				+    std::cout << "Test: STFT/ISTFT Consistency with PyTorch" << std::endl;
				+
				+    // 1. Load Model to get parameters
				+    std::string model_path = GetModelPath();
				+    std::cout << "Loading model params from: " << model_path << std::endl;
				+
				+    // We only need the model to read parameters (n_fft, etc.) from GGUF.
				+    BSRoformer model;
				+    try {
				+        model.Initialize(model_path);
				+    } catch (const std::exception& e) {
				+        std::cerr << "Failed to load model: " << e.what() << std::endl;
				+        std::cerr << "Ensure MBR_MODEL_PATH is set correctly or bs_roformer.gguf exists." << std::endl;
				+        return 1;
				+    }
				+
				+    const int n_fft = model.GetNFFT();
				+    const int hop_length = model.GetHopLength();
				+    const int win_length = model.GetWinLength();
				+
				+    std::cout << "STFT Params: n_fft=" << n_fft << ", hop_length=" << hop_length << ", win_length=" << win_length << std::endl;
				+
				+    // hop_length is used as a divisor and the other two values size buffers,
				+    // so non-positive parameters cannot be tested meaningfully.
				+    if (n_fft <= 0 || hop_length <= 0 || win_length <= 0) {
				+        std::cerr << "Invalid STFT parameters read from model" << std::endl;
				+        LOG_FAIL();
				+        return 1;
				+    }
				+
				+    // 2. Load Data
				+    std::string data_dir = GetTestDataDir();
				+    std::cout << "Loading test data from: " << data_dir << std::endl;
				+
				+    GoldenTensor input_audio(data_dir, "input_audio"); // [batch, channels, samples]
				+    GoldenTensor expected_stft(data_dir, "stft_raw"); // [batch, channels, freq, time, 2]
				+    GoldenTensor expected_istft(data_dir, "istft_raw"); // [batch, channels, samples]
				+
				+    TEST_ASSERT_LOAD(input_audio, "input_audio");
				+    TEST_ASSERT_LOAD(expected_stft, "stft_raw");
				+    TEST_ASSERT_LOAD(expected_istft, "istft_raw");
				+
				+    input_audio.PrintShape("Input Audio");
				+    expected_stft.PrintShape("Expected STFT");
				+    expected_istft.PrintShape("Expected ISTFT");
				+
				+    const int batch = input_audio.shape[0];
				+    const int channels = input_audio.shape[1];
				+    const int n_samples = input_audio.shape[2];
				+
				+    const int n_freq = n_fft / 2 + 1;
				+    const int expected_n_frames = expected_stft.shape[3];
				+
				+    // The offsets used below treat the golden STFT/ISTFT buffers as
				+    // [batch, channels, ...] arrays whose leading dimensions agree with the
				+    // input audio. Mismatched dimensions would read past the end of the data
				+    // instead of failing cleanly, so they are rejected here.
				+    // NOTE(review): assumes stft_raw has rank 5 and istft_raw rank 3, as the
				+    // comments at the load site state - confirm against GoldenTensor.
				+    const bool dims_positive =
				+        batch > 0 && channels > 0 && n_samples > 0 && expected_n_frames > 0;
				+    const bool stft_shape_ok =
				+        static_cast<long long>(expected_stft.shape[0]) == batch &&
				+        static_cast<long long>(expected_stft.shape[1]) == channels &&
				+        static_cast<long long>(expected_stft.shape[2]) == n_freq;
				+    const bool istft_shape_ok =
				+        static_cast<long long>(expected_istft.shape[0]) == batch &&
				+        static_cast<long long>(expected_istft.shape[1]) == channels &&
				+        static_cast<long long>(expected_istft.shape[2]) == n_samples;
				+    if (!dims_positive || !stft_shape_ok || !istft_shape_ok) {
				+        std::cerr << "Golden tensor shapes are inconsistent with each other or with n_fft" << std::endl;
				+        LOG_FAIL();
				+        return 1;
				+    }
				+
				+    // Element count of one channel's golden spectrogram, computed in size_t so
				+    // the product cannot overflow int.
				+    const size_t channel_stft_size =
				+        static_cast<size_t>(n_freq) * static_cast<size_t>(expected_n_frames) * 2;
				+
				+    // compute_stft derives its own frame count from n_samples and writes that
				+    // many frames before the count can be compared with the golden value, so
				+    // the output buffer is sized for a conservative upper bound as well.
				+    // NOTE(review): the bound assumes at most n_fft samples of padding per
				+    // side (torch-style centering pads n_fft / 2) - confirm against stft.h.
				+    const size_t frames_bound =
				+        1 + (static_cast<size_t>(n_samples) + static_cast<size_t>(n_fft)) /
				+                static_cast<size_t>(hop_length);
				+    const size_t frames_golden = static_cast<size_t>(expected_n_frames) + 10;
				+    const size_t frame_capacity = frames_bound > frames_golden ? frames_bound : frames_golden;
				+
				+    // 3. Prepare Window
				+    std::vector<float> window(win_length);
				+    stft::hann_window(window.data(), win_length);
				+
				+    bool all_passed = true;
				+
				+    // 4. Test STFT
				+    std::cout << "\n=== Testing STFT ===" << std::endl;
				+
				+    for (int b = 0; b < batch; ++b) {
				+        for (int c = 0; c < channels; ++c) {
				+            // Extract input channel
				+            std::vector<float> in_channel(n_samples);
				+            for (int i = 0; i < n_samples; ++i) {
				+                // input_audio.npy is F-contiguous [1, 2, 220500] => [220500, 2] in memory (interleaved)
				+                // Layout: L0, R0, L1, R1, ...
				+                // Index = (sample_idx * channels + channel_idx)
				+                size_t idx = ((size_t)b * n_samples + i) * channels + c;
				+                in_channel[i] = input_audio.data[idx];
				+            }
				+
				+            // Diagnostic: print the first few input values, never more than exist.
				+            const int n_preview = n_samples < 5 ? n_samples : 5;
				+            std::cout << "  Input[" << b << "," << c << "] first 5: ";
				+            for (int i = 0; i < n_preview; ++i) std::cout << in_channel[i] << " ";
				+            std::cout << std::endl;
				+
				+            int n_frames_calc = 0;
				+            // C++ output is [n_freq, n_frames, 2]
				+            std::vector<float> out_stft(static_cast<size_t>(n_freq) * frame_capacity * 2);
				+
				+            stft::compute_stft(
				+                in_channel.data(), n_samples, n_fft, hop_length, win_length,
				+                window.data(), true, out_stft.data(), &n_frames_calc
				+            );
				+
				+            if (n_frames_calc != expected_n_frames) {
				+                std::cerr << "  [Batch " << b << " Ch " << c << "] Frame mismatch: calc=" << n_frames_calc << ", expected=" << expected_n_frames << std::endl;
				+                all_passed = false;
				+                continue;
				+            }
				+
				+            // Compare against this channel's slice of the golden STFT
				+            const size_t offset =
				+                (static_cast<size_t>(b) * channels + c) * channel_stft_size;
				+
				+            std::string name = "STFT_B" + std::to_string(b) + "_Ch" + std::to_string(c);
				+            if (!CompareAndReport(name,
				+                                  expected_stft.data + offset, channel_stft_size,
				+                                  out_stft.data(), channel_stft_size, 1e-3f, 1e-2f)) {
				+                all_passed = false;
				+            }
				+        }
				+    }
				+
				+    // 5. Test ISTFT
				+    std::cout << "\n=== Testing ISTFT ===" << std::endl;
				+
				+    for (int b = 0; b < batch; ++b) {
				+        for (int c = 0; c < channels; ++c) {
				+            const size_t channel_index = static_cast<size_t>(b) * channels + c;
				+            const size_t offset = channel_index * channel_stft_size;
				+
				+            // Input: expected_stft.data + offset
				+            // Buffer slightly larger than the requested output length
				+            std::vector<float> out_audio(static_cast<size_t>(n_samples) + static_cast<size_t>(n_fft));
				+
				+            // We pass n_samples as expected length
				+            stft::compute_istft(
				+                expected_stft.data + offset,
				+                n_freq, expected_n_frames, n_fft, hop_length, win_length,
				+                window.data(), true, n_samples, out_audio.data()
				+            );
				+
				+            // Verify against expected_istft
				+            const size_t audio_offset = channel_index * static_cast<size_t>(n_samples);
				+
				+            std::string name = "ISTFT_B" + std::to_string(b) + "_Ch" + std::to_string(c);
				+            if (!CompareAndReport(name,
				+                                  expected_istft.data + audio_offset, n_samples,
				+                                  out_audio.data(), n_samples, 1e-4f, 1e-3f)) {
				+                all_passed = false;
				+            }
				+        }
				+    }
				+
				+    if (all_passed) {
				+        LOG_PASS();
				+        return 0;
				+    } else {
				+        LOG_FAIL();
				+        return 1;
				+    }
				+}