沉默の金 5 mesiacov pred
commit
b8a5c45ae9

+ 391 - 0
.github/workflows/build.yml

@@ -0,0 +1,391 @@
+name: CI
+
+on:
+  push:
+    branches: [main, master]
+    paths:
+      - '**/*.cpp'
+      - '**/*.h'
+      - '**/*.hpp'
+      - '**/CMakeLists.txt'
+      - '.github/workflows/**'
+  pull_request:
+    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+
+# Cancel superseded runs. The group key is built from the workflow name plus:
+#   - `github.ref` when `github.head_ref` is non-empty (pull request runs), so
+#     a newer run for the same PR ref cancels the one still in progress;
+#   - `github.run_id` otherwise, which is unique per run, so those runs never
+#     share a group and are never cancelled by each other.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  # HuggingFace model info
+  HF_MODEL_REPO: GaboxR67/MelBandRoformers
+  HF_CHECKPOINT_PATH: melbandroformers/vocals/voc_fv6.ckpt
+  HF_CONFIG_PATH: melbandroformers/vocals/voc_gabox.yaml
+  # Music-Source-Separation-Training repo
+  MSST_REPO: https://github.com/ZFTurbo/Music-Source-Separation-Training.git
+
+jobs:
+  # ===========================================================================
+  # Prepare: Generate test data (runs once, shared via artifacts)
+  # ===========================================================================
+  prepare-test-data:
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+          
+      - name: Clone MSST Repository
+        run: git clone --depth 1 ${{ env.MSST_REPO }} msst
+        
+      - name: Install Dependencies
+        run: |
+          pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install -r msst/requirements.txt
+          pip install huggingface_hub scipy soundfile gguf librosa ml_collections einops pyyaml
+          
+      - name: Download Model from HuggingFace
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          python -c "
+          from huggingface_hub import hf_hub_download
+          import os
+          token = os.environ.get('HF_TOKEN') or None
+          hf_hub_download('${{ env.HF_MODEL_REPO }}', '${{ env.HF_CHECKPOINT_PATH }}', 
+                          local_dir='./model', token=token)
+          hf_hub_download('${{ env.HF_MODEL_REPO }}', '${{ env.HF_CONFIG_PATH }}',
+                          local_dir='./model', token=token)
+          "
+          
+      # Synthesize a 5 s stereo clip (a two-tone sine per channel) and write it
+      # to test_audio.wav, which the following step passes as --audio.
+      - name: Generate Test Audio
+        run: |
+          python -c "
+          import numpy as np
+          import scipy.io.wavfile as wav
+          sr = 44100
+          duration = 5.0
+          # Sample instants k / sr. endpoint=False keeps the spacing at exactly
+          # 1 / sr; including the endpoint would stretch it to duration / (N - 1)
+          # and slightly detune every tone relative to the declared sample rate.
+          t = np.linspace(0, duration, int(sr * duration), endpoint=False)
+          # Create a more complex test signal
+          left = (np.sin(2 * np.pi * 440 * t) + 0.5 * np.sin(2 * np.pi * 880 * t)) * 0.3
+          right = (np.sin(2 * np.pi * 660 * t) + 0.5 * np.sin(2 * np.pi * 1320 * t)) * 0.3
+          stereo = np.stack([left, right], axis=1).astype(np.float32)
+          wav.write('test_audio.wav', sr, stereo)
+          print(f'Generated test audio: {len(t)} samples, {duration}s')
+          "
+          
+      - name: Generate Test Data
+        run: |
+          python scripts/generate_test_data.py \
+            --model-repo msst \
+            --audio test_audio.wav \
+            --checkpoint model/${{ env.HF_CHECKPOINT_PATH }} \
+            --config model/${{ env.HF_CONFIG_PATH }} \
+            --output test_data
+            
+      - name: Convert Model to GGUF
+        run: |
+          python scripts/convert_to_gguf.py \
+            --ckpt model/${{ env.HF_CHECKPOINT_PATH }} \
+            --config model/${{ env.HF_CONFIG_PATH }} \
+            --out model.gguf \
+            --dtype fp16
+            
+      - name: Upload Test Data Artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-data
+          path: |
+            test_data/
+            model.gguf
+            test_audio.wav
+          retention-days: 1
+
+  # ===========================================================================
+  # Build Matrix: Core Platforms + Vulkan
+  # ===========================================================================
+  build:
+    needs: prepare-test-data
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # Tier 1: Core Platforms (CPU)
+          - { name: linux-x64-cpu,    os: ubuntu-22.04,     backend: cpu,    test: true  }
+          - { name: linux-arm64-cpu,  os: ubuntu-22.04-arm, backend: cpu,    test: true  }
+          - { name: macos-arm64,      os: macos-latest,     backend: cpu,    test: true  }
+          - { name: macos-x64,        os: macos-15-intel,   backend: cpu,    test: true  }
+          - { name: windows-x64-msvc, os: windows-2025,     backend: cpu,    test: true  }
+          # Tier 2: Vulkan Backend
+          - { name: linux-vulkan,     os: ubuntu-24.04,     backend: vulkan, test: true  }
+          - { name: windows-vulkan,   os: windows-2025,     backend: vulkan, test: true  }
+          
+    runs-on: ${{ matrix.os }}
+    
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      
+      - name: Download Test Data
+        uses: actions/download-artifact@v4
+        with:
+          name: test-data
+          
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          
+      - name: Install Python Dependencies
+        run: pip install numpy scipy
+          
+      # ----- Linux Dependencies -----
+      - name: Install Dependencies (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake
+          
+      - name: Install Vulkan SDK (Linux)
+        if: matrix.backend == 'vulkan' && runner.os == 'Linux'
+        run: |
+          sudo apt-get install -y libvulkan-dev glslc mesa-vulkan-drivers
+          
+      # ----- macOS Dependencies -----  
+      - name: Install Dependencies (macOS)
+        if: runner.os == 'macOS'
+        run: brew install cmake
+        
+      # ----- Windows Dependencies -----
+      - name: Install Dependencies (Windows)
+        if: runner.os == 'Windows'
+        run: choco install ninja -y
+        
+      - name: Install Vulkan SDK (Windows)
+        if: matrix.backend == 'vulkan' && runner.os == 'Windows'
+        run: |
+          $VK_VERSION = "1.4.313.2"
+          curl.exe -o VulkanSDK.exe -L "https://sdk.lunarg.com/sdk/download/${VK_VERSION}/windows/vulkansdk-windows-X64-${VK_VERSION}.exe"
+          Start-Process -FilePath .\VulkanSDK.exe -ArgumentList "--accept-licenses --default-answer --confirm-command install" -Wait
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${VK_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${VK_VERSION}\bin"
+          
+      # ----- Configure -----
+      - name: Configure (Unix)
+        if: runner.os != 'Windows'
+        run: |
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_CUDA=OFF \
+            -DGGML_VULKAN=${{ matrix.backend == 'vulkan' && 'ON' || 'OFF' }} \
+            -DMBR_BUILD_TESTS=ON \
+            -DMBR_BUILD_CLI=ON
+            
+      - name: Configure (Windows)
+        if: runner.os == 'Windows'
+        run: |
+          cmake -B build -G "Ninja Multi-Config" `
+            -DGGML_CUDA=OFF `
+            -DGGML_VULKAN=${{ matrix.backend == 'vulkan' && 'ON' || 'OFF' }} `
+            -DMBR_BUILD_TESTS=ON `
+            -DMBR_BUILD_CLI=ON
+            
+      # ----- Build -----
+      - name: Build (Unix)
+        if: runner.os != 'Windows'
+        run: cmake --build build --config Release -j $(nproc 2>/dev/null || sysctl -n hw.logicalcpu)
+        
+      - name: Build (Windows)
+        if: runner.os == 'Windows'
+        run: cmake --build build --config Release -j $env:NUMBER_OF_PROCESSORS
+        
+      # ----- Unit Tests -----
+      - name: Run Unit Tests
+        if: matrix.test
+        env:
+          MBR_MODEL_PATH: ${{ github.workspace }}/model.gguf
+          MBR_TEST_DATA_DIR: ${{ github.workspace }}/test_data
+        run: ctest --test-dir build -C Release --output-on-failure --timeout 300
+        
+      # ----- CLI Tests -----
+      - name: Test CLI
+        if: matrix.test
+        shell: bash
+        env:
+          MBR_MODEL_PATH: ${{ github.workspace }}/model.gguf
+        run: |
+          echo "=== CLI Test Suite ==="
+          
+          # Determine CLI path based on OS
+          if [[ "$RUNNER_OS" == "Windows" ]]; then
+            CLI="./build/bin/Release/mel_band_roformer-cli.exe"
+          else
+            CLI="./build/mel_band_roformer-cli"
+          fi
+          
+          # 1. Test --help
+          echo "[1/4] Testing --help..."
+          $CLI --help
+          
+          # 2. Test with missing arguments (should fail)
+          echo "[2/4] Testing error handling..."
+          if $CLI 2>/dev/null; then
+            echo "ERROR: CLI should fail without arguments"
+            exit 1
+          fi
+          
+          # 3. Generate test audio (short 2-second clip)
+          echo "[3/4] Generating test audio..."
+          python3 -c "
+          import numpy as np
+          import scipy.io.wavfile as wav
+          sr = 44100
+          t = np.linspace(0, 2.0, sr * 2)
+          stereo = np.stack([np.sin(2*np.pi*440*t), np.sin(2*np.pi*880*t)], axis=1).astype(np.float32) * 0.5
+          wav.write('cli_test_input.wav', sr, stereo)
+          "
+          
+          # 4. Run full inference
+          echo "[4/4] Running inference..."
+          $CLI "$MBR_MODEL_PATH" cli_test_input.wav cli_test_output.wav --chunk-size 88200 --overlap 2
+          
+          # Verify output exists and has reasonable size
+          if [[ ! -f cli_test_output.wav ]]; then
+            echo "ERROR: Output file not created"
+            exit 1
+          fi
+          
+          OUTPUT_SIZE=$(stat -c%s cli_test_output.wav 2>/dev/null || stat -f%z cli_test_output.wav)
+          if [[ $OUTPUT_SIZE -lt 1000 ]]; then
+            echo "ERROR: Output file too small: $OUTPUT_SIZE bytes"
+            exit 1
+          fi
+          
+          echo "=== CLI Tests Passed ==="
+          
+      # ----- Upload Artifacts -----
+      - name: Upload Build Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-${{ matrix.name }}
+          path: |
+            build/bin/
+            build/lib*/
+            build/*.dll
+            build/*.so
+            build/*.dylib
+            build/mel_band_roformer-cli*
+            build/Release/
+          retention-days: 7
+
+  # ===========================================================================
+  # CUDA Build: Linux (Compile Only - No GPU for testing)
+  # ===========================================================================
+  build-cuda-linux:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+      # Runs before checkout on purpose: git is installed by this step, and
+      # actions/checkout needs git on PATH to fetch submodules (its REST API
+      # download fallback does not support the `submodules` input).
+      - name: Install Dependencies
+        run: |
+          apt-get update
+          apt-get install -y cmake build-essential ninja-build git
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # Compile-only configuration: CUDA backend on, tests off.
+      - name: Configure
+        run: |
+          cmake -B build -G Ninja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_CUDA=ON \
+            -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89" \
+            -DMBR_BUILD_TESTS=OFF \
+            -DMBR_BUILD_CLI=ON
+
+      - name: Build
+        run: cmake --build build --config Release -j $(nproc)
+
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-linux-cuda
+          path: |
+            build/bin/
+            build/lib*/
+            build/*.so
+          retention-days: 7
+
+  # ===========================================================================
+  # CUDA Build: Windows (Compile Only - No GPU for testing)
+  # ===========================================================================
+  build-cuda-windows:
+    runs-on: windows-2022
+
+    env:
+      # Quoted so YAML keeps it a string. Must stay in sync with the installer
+      # URL and component names used in "Install CUDA Toolkit".
+      CUDA_VERSION: '12.4'
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # Silent install of only the toolkit components named on the installer
+      # command line, using NVIDIA's network installer. Download and install
+      # failures abort the step instead of surfacing later as a confusing
+      # configure error.
+      - name: Install CUDA Toolkit
+        run: |
+          $cudaRoot = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${env:CUDA_VERSION}"
+          curl.exe --fail -o cuda_installer.exe -L "https://developer.download.nvidia.com/compute/cuda/12.4.0/network_installers/cuda_12.4.0_windows_network.exe"
+          if ($LASTEXITCODE -ne 0) { throw "CUDA installer download failed (curl exit code $LASTEXITCODE)" }
+          $installer = Start-Process -FilePath .\cuda_installer.exe -ArgumentList "-s nvcc_12.4 cudart_12.4 cublas_12.4 cufft_12.4" -Wait -NoNewWindow -PassThru
+          if ($installer.ExitCode -ne 0) { throw "CUDA installer exited with code $($installer.ExitCode)" }
+          Add-Content $env:GITHUB_ENV "CUDA_PATH=$cudaRoot"
+          Add-Content $env:GITHUB_PATH "$cudaRoot\bin"
+
+      - name: Install Ninja
+        run: choco install ninja -y
+
+      - name: Configure
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -B build -G "Ninja Multi-Config" ^
+            -DGGML_CUDA=ON ^
+            -DCMAKE_CUDA_ARCHITECTURES="75;80;86;89" ^
+            -DMBR_BUILD_TESTS=OFF ^
+            -DMBR_BUILD_CLI=ON
+
+      # Each step runs in its own shell, so the environment set up by
+      # vcvarsall in "Configure" is gone here. Re-establish it so the
+      # Ninja-driven compiler invocations see the MSVC INCLUDE/LIB/PATH settings.
+      - name: Build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake --build build --config Release -j %NUMBER_OF_PROCESSORS%
+
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-windows-cuda
+          path: |
+            build/bin/
+            build/Release/
+            build/*.dll
+          retention-days: 7

+ 206 - 0
.github/workflows/convert-model.yml

@@ -0,0 +1,206 @@
+name: Convert Model to GGUF
+
+on:
+  workflow_dispatch:
+    inputs:
+      hf_repo:
+        description: 'HuggingFace 仓库名称 (例如: GaboxR67/MelBandRoformers)'
+        required: true
+        type: string
+      checkpoint_path:
+        description: '权重文件路径 (相对于仓库, 例如: melbandroformers/vocals/voc_fv6.ckpt)'
+        required: true
+        type: string
+      config_path:
+        description: '配置文件路径 (相对于仓库, 例如: melbandroformers/vocals/voc_gabox.yaml)'
+        required: true
+        type: string
+      model_name:
+        description: '输出模型名称 (用于文件命名, 例如: voc_fv6)'
+        required: false
+        type: string
+        default: 'model'
+      quantization_types:
+        description: '要转换的量化类型 (用逗号分隔, 留空则转换全部)'
+        required: false
+        type: string
+        default: 'fp32,fp16,q8_0,q4_0,q4_1,q5_0,q5_1'
+
+env:
+  SUPPORTED_QUANT_TYPES: 'fp32,fp16,q8_0,q4_0,q4_1,q5_0,q5_1'
+
+jobs:
+  convert-to-all-quant-types:
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+          
+      - name: Install Dependencies
+        run: |
+          pip install torch --index-url https://download.pytorch.org/whl/cpu
+          pip install huggingface_hub gguf librosa einops pyyaml numpy
+          
+      # Downloads the checkpoint and its config into ./model. The dispatch
+      # inputs reach Python through environment variables rather than being
+      # interpolated into the script text, so a quote or newline in an input
+      # cannot break or alter the code that runs.
+      - name: Download Model from HuggingFace
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_REPO: ${{ inputs.hf_repo }}
+          CHECKPOINT_PATH: ${{ inputs.checkpoint_path }}
+          CONFIG_PATH: ${{ inputs.config_path }}
+        run: |
+          python -c "
+          from huggingface_hub import hf_hub_download
+          import os
+
+          # An unset or empty secret becomes None (anonymous access)
+          token = os.environ.get('HF_TOKEN') or None
+          repo = os.environ['HF_REPO']
+
+          print(f'Downloading from {repo}...')
+
+          # Download checkpoint
+          ckpt_path = hf_hub_download(
+              repo,
+              os.environ['CHECKPOINT_PATH'],
+              local_dir='./model',
+              token=token
+          )
+          print(f'  Checkpoint: {ckpt_path}')
+
+          # Download config
+          config_path = hf_hub_download(
+              repo,
+              os.environ['CONFIG_PATH'],
+              local_dir='./model',
+              token=token
+          )
+          print(f'  Config: {config_path}')
+          "
+          
+      - name: Show Model Info
+        run: |
+          echo "=== Model Configuration ==="
+          cat model/${{ inputs.config_path }}
+          echo ""
+          echo "=== Checkpoint File Size ==="
+          ls -lh model/${{ inputs.checkpoint_path }}
+          
+      - name: Create Output Directory
+        run: mkdir -p output
+        
+      # Runs scripts/convert_to_gguf.py once per requested quantization type and
+      # writes output/<model_name>-<TYPE>.gguf for each. Dispatch inputs are
+      # passed through the environment instead of being interpolated into the
+      # script text, so shell metacharacters in an input are treated as data.
+      - name: Convert to All Quantization Types
+        shell: bash
+        env:
+          CHECKPOINT_PATH: ${{ inputs.checkpoint_path }}
+          CONFIG_PATH: ${{ inputs.config_path }}
+          MODEL_NAME: ${{ inputs.model_name }}
+          QUANT_TYPES: ${{ inputs.quantization_types }}
+        run: |
+          set -eo pipefail
+
+          CHECKPOINT="model/${CHECKPOINT_PATH}"
+          CONFIG="model/${CONFIG_PATH}"
+
+          # If no types specified, use all supported types
+          # (SUPPORTED_QUANT_TYPES comes from the workflow-level env)
+          if [ -z "$QUANT_TYPES" ]; then
+            QUANT_TYPES="$SUPPORTED_QUANT_TYPES"
+          fi
+
+          echo "=== Converting Model: $MODEL_NAME ==="
+          echo "Checkpoint: $CHECKPOINT"
+          echo "Config: $CONFIG"
+          echo "Quantization Types: $QUANT_TYPES"
+          echo ""
+
+          # Convert comma-separated to array
+          IFS=',' read -ra TYPES <<< "$QUANT_TYPES"
+
+          for qtype in "${TYPES[@]}"; do
+            # Trim whitespace
+            qtype=$(echo "$qtype" | xargs)
+
+            # Tolerate stray commas ("fp16,,q8_0" or a trailing comma)
+            if [ -z "$qtype" ]; then
+              continue
+            fi
+
+            # Fail early on a type outside the supported list
+            case ",${SUPPORTED_QUANT_TYPES}," in
+              *",${qtype},"*) ;;
+              *)
+                echo "ERROR: unsupported quantization type: $qtype" >&2
+                echo "Supported types: $SUPPORTED_QUANT_TYPES" >&2
+                exit 1
+                ;;
+            esac
+
+            # Generate output filename following GGUF naming convention
+            # Format: {model_name}-{quantization}.gguf (quantization upper-cased)
+            OUTPUT_FILE="output/${MODEL_NAME}-${qtype^^}.gguf"
+
+            echo ">>> Converting to $qtype -> $OUTPUT_FILE"
+
+            python scripts/convert_to_gguf.py \
+              --ckpt "$CHECKPOINT" \
+              --config "$CONFIG" \
+              --out "$OUTPUT_FILE" \
+              --dtype "$qtype"
+
+            # Show file size
+            SIZE=$(ls -lh "$OUTPUT_FILE" | awk '{print $5}')
+            echo "    Output size: $SIZE"
+            echo ""
+          done
+
+          echo "=== Conversion Complete ==="
+          ls -lh output/
+          
+      # Writes output/conversion_report.json and appends a markdown table of the
+      # converted files to the job summary.
+      - name: Generate Summary Report
+        # An explicit `shell: bash` runs the script with pipefail, so a Python
+        # failure is not masked by the exit status of `tee`.
+        shell: bash
+        env:
+          MODEL_NAME: ${{ inputs.model_name }}
+          HF_REPO: ${{ inputs.hf_repo }}
+          CHECKPOINT_PATH: ${{ inputs.checkpoint_path }}
+          CONFIG_PATH: ${{ inputs.config_path }}
+        run: |
+          python -c "
+          import os
+          import json
+
+          output_dir = 'output'
+          # Inputs arrive through the environment instead of being pasted into
+          # this script, so quotes in an input cannot break or alter the code.
+          model_name = os.environ['MODEL_NAME']
+
+          report = {
+              'model_name': model_name,
+              'hf_repo': os.environ['HF_REPO'],
+              'checkpoint': os.environ['CHECKPOINT_PATH'],
+              'config': os.environ['CONFIG_PATH'],
+              'files': []
+          }
+
+          total_size = 0
+          for f in sorted(os.listdir(output_dir)):
+              if f.endswith('.gguf'):
+                  path = os.path.join(output_dir, f)
+                  size_bytes = os.path.getsize(path)
+                  total_size += size_bytes
+
+                  # Extract quant type from filename
+                  quant_type = f.replace(f'{model_name}-', '').replace('.gguf', '')
+
+                  report['files'].append({
+                      'filename': f,
+                      'quant_type': quant_type,
+                      'size_bytes': size_bytes,
+                      'size_mb': round(size_bytes / 1024 / 1024, 2)
+                  })
+
+          report['total_size_bytes'] = total_size
+          report['total_size_mb'] = round(total_size / 1024 / 1024, 2)
+
+          # Write JSON report
+          with open('output/conversion_report.json', 'w') as f:
+              json.dump(report, f, indent=2)
+
+          # Print markdown table for GitHub Actions summary
+          print('## Conversion Report')
+          print('')
+          print(f'**Model**: {model_name}')
+          print(f'**Source**: [{report[\"hf_repo\"]}](https://huggingface.co/{report[\"hf_repo\"]})')
+          print('')
+          print('| Quantization | Filename | Size |')
+          print('|--------------|----------|------|')
+          for f in report['files']:
+              print(f'| {f[\"quant_type\"]} | {f[\"filename\"]} | {f[\"size_mb\"]} MB |')
+          print('')
+          print(f'**Total Size**: {report[\"total_size_mb\"]} MB')
+          " | tee -a "$GITHUB_STEP_SUMMARY"
+          
+      - name: Upload Converted Models
+        uses: actions/upload-artifact@v4
+        with:
+          name: gguf-models-${{ inputs.model_name }}
+          path: |
+            output/*.gguf
+            output/conversion_report.json
+          retention-days: 30
+          compression-level: 0  # GGUF is already compressed for quantized types

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+build

+ 160 - 0
CMakeLists.txt

@@ -0,0 +1,160 @@
+cmake_minimum_required(VERSION 3.16)
+project(mel_band_roformer_cpp VERSION 1.0.0 LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+#================================================
+# Build Options
+#================================================
+
+option(GGML_CUDA "Enable CUDA backend" ON)
+option(MBR_BUILD_TESTS "Build tests" OFF)
+option(MBR_BUILD_CLI "Build CLI application" ON)
+
+#================================================
+# Dependencies - GGML (Flexible Resolution)
+#================================================
+
+# Strategy: Allow ggml to be shared across multiple projects
+# 1. Check if ggml target already exists (e.g., from parent project like whisper.cpp)
+# 2. If not, try to find ggml via CMAKE_PREFIX_PATH or GGML_DIR
+# 3. If not found, use local ggml (submodule or sibling directory)
+
+if(NOT TARGET ggml)
+    # Try to find ggml package first (for system-wide or parent project installation)
+    find_package(ggml QUIET CONFIG)
+    
+    if(NOT ggml_FOUND)
+        # ggml not found as package, look for source directory
+        # Priority 1: GGML_DIR variable (explicitly set by user or parent project)
+        # Priority 2: Submodule in ggml/
+        # Priority 3: Sibling directory ../ggml
+        
+        if(DEFINED GGML_DIR)
+            set(GGML_PATH "${GGML_DIR}")
+            message(STATUS "Using GGML from GGML_DIR: ${GGML_PATH}")
+        elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ggml/CMakeLists.txt")
+            set(GGML_PATH "${CMAKE_CURRENT_SOURCE_DIR}/ggml")
+            message(STATUS "Using GGML from submodule: ${GGML_PATH}")
+        elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../ggml/CMakeLists.txt")
+            set(GGML_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../ggml")
+            message(STATUS "Using GGML from sibling directory: ${GGML_PATH}")
+        else()
+            message(FATAL_ERROR 
+                "ggml not found. Please either:\n"
+                "  1. Add ggml as submodule: git submodule add https://github.com/ggerganov/ggml.git\n"
+                "  2. Clone ggml to parent directory: cd .. && git clone https://github.com/ggerganov/ggml.git\n"
+                "  3. Set GGML_DIR to point to ggml source: cmake -DGGML_DIR=/path/to/ggml\n"
+                "  4. Let parent project provide ggml target"
+            )
+        endif()
+        
+        # Add ggml as subdirectory
+        add_subdirectory(${GGML_PATH} ggml EXCLUDE_FROM_ALL)
+    else()
+        message(STATUS "Using GGML from installed package")
+    endif()
+else()
+    message(STATUS "Using GGML target from parent project")
+endif()
+
+#================================================
+# Core Library
+#================================================
+
+set(MBR_SOURCES
+    src/model.cpp
+    src/utils.cpp
+    src/inference.cpp
+)
+
+add_library(mel_band_roformer STATIC ${MBR_SOURCES})
+
+target_include_directories(mel_band_roformer PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:include>
+)
+target_include_directories(mel_band_roformer PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+target_link_libraries(mel_band_roformer PUBLIC ggml)
+if(GGML_CUDA AND TARGET ggml-cuda)
+    target_link_libraries(mel_band_roformer PUBLIC ggml-cuda)
+endif()
+
+# Compiler options
+if(MSVC)
+    target_compile_options(mel_band_roformer PRIVATE /W3 /utf-8)
+else()
+    target_compile_options(mel_band_roformer PRIVATE -Wall -Wextra)
+endif()
+
+# OpenMP support
+find_package(OpenMP)
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(mel_band_roformer PUBLIC OpenMP::OpenMP_CXX)
+    target_compile_definitions(mel_band_roformer PUBLIC USE_OPENMP)
+    message(STATUS "OpenMP: ENABLED")
+else()
+    message(STATUS "OpenMP: NOT FOUND")
+endif()
+
+#================================================
+# DLL Copy Helper (Windows)
+#================================================
+
+# mbr_copy_ggml_dlls(<target_name>)
+#
+# On Windows, adds POST_BUILD commands that copy the ggml runtime libraries
+# next to <target_name>'s output file so the executable can load them.
+# Does nothing on other platforms.
+function(mbr_copy_ggml_dlls target_name)
+    if(NOT WIN32)
+        return()
+    endif()
+
+    # Candidate ggml targets; only those that exist in this build are used.
+    # ggml-vulkan is listed alongside ggml-cuda so Vulkan builds get their
+    # backend library copied as well.
+    set(GGML_DLL_TARGETS ggml ggml-base ggml-cpu ggml-cuda ggml-vulkan)
+
+    # Target types that never produce a DLL. $<TARGET_FILE:...> is an error
+    # for INTERFACE libraries, and copying a static .lib is pointless.
+    set(non_runtime_types STATIC_LIBRARY INTERFACE_LIBRARY OBJECT_LIBRARY)
+
+    foreach(dll_target ${GGML_DLL_TARGETS})
+        if(NOT TARGET ${dll_target})
+            continue()
+        endif()
+
+        get_target_property(dll_target_type ${dll_target} TYPE)
+        if(dll_target_type IN_LIST non_runtime_types)
+            continue()
+        endif()
+
+        add_custom_command(TARGET ${target_name} POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                $<TARGET_FILE:${dll_target}>
+                $<TARGET_FILE_DIR:${target_name}>
+            COMMENT "Copying ${dll_target}.dll for ${target_name}"
+        )
+    endforeach()
+endfunction()
+
+#================================================
+# CLI Application
+#================================================
+
+if(MBR_BUILD_CLI)
+    # audio.cpp implements AudioFile utilities (using dr_wav)
+    add_executable(mel_band_roformer-cli 
+        cli/main.cpp 
+        src/audio.cpp
+    )
+    target_link_libraries(mel_band_roformer-cli PRIVATE mel_band_roformer)
+    target_include_directories(mel_band_roformer-cli PRIVATE 
+        src 
+        third_party
+    )
+    
+    if(MSVC)
+        target_compile_options(mel_band_roformer-cli PRIVATE /W3 /utf-8)
+    endif()
+    
+    mbr_copy_ggml_dlls(mel_band_roformer-cli)
+endif()
+
+#================================================
+# Tests (Optional)
+#================================================
+
+if(MBR_BUILD_TESTS)
+    enable_testing()
+    add_subdirectory(tests)
+    message(STATUS "Tests: ENABLED")
+else()
+    message(STATUS "Tests: DISABLED (use -DMBR_BUILD_TESTS=ON to enable)")
+endif()

+ 97 - 0
GGML_DEPENDENCY.md

@@ -0,0 +1,97 @@
+# GGML Dependency Management
+
+This project uses a flexible GGML dependency resolution strategy that supports both standalone builds and integration into larger projects.
+
+## Dependency Resolution Strategy
+
+The CMake configuration resolves GGML dependencies in the following priority order:
+
+1. **Existing Target** - If `ggml` target already exists (e.g., from parent project like `whisper.cpp`)
+2. **Installed Package** - Search for GGML via `find_package()`
+3. **Explicit Path** - Use the source path specified by the `GGML_DIR` variable (if set)
+4. **Submodule** - Use the `ggml/` subdirectory (if it exists)
+5. **Sibling Directory** - Use `../ggml` (if it exists)
+
+## Usage Scenarios
+
+### Scenario 1: Standalone Build
+
+Clone GGML as submodule:
+```bash
+cd MelBandRoformer.cpp
+git submodule add https://github.com/ggerganov/ggml.git
+git submodule update --init --recursive
+cmake -B build -DGGML_CUDA=ON
+```
+
+Or use sibling directory:
+```bash
+cd ..
+git clone https://github.com/ggerganov/ggml.git
+cd MelBandRoformer.cpp
+cmake -B build -DGGML_CUDA=ON
+```
+
+### Scenario 2: Shared GGML with whisper.cpp
+
+When both projects need GGML, let whisper.cpp provide it:
+
+```cmake
+# Parent Project CMakeLists.txt
+cmake_minimum_required(VERSION 3.16)
+project(MyParentProject)
+
+# Build whisper.cpp first (includes ggml)
+add_subdirectory(whisper.cpp)
+
+# Build MelBandRoformer - it will reuse whisper.cpp's ggml target
+add_subdirectory(MelBandRoformer.cpp)
+```
+
+Or use explicit path:
+```bash
+cmake -B build -DGGML_DIR=/path/to/shared/ggml
+```
+
+### Scenario 3: Custom GGML Location
+
+```bash
+cmake -B build -DGGML_DIR=/custom/path/to/ggml
+```
+
+## Project Structure Examples
+
+**Option A: Submodule** (Recommended)
+```
+MelBandRoformer.cpp/
+├── ggml/                    # Git submodule
+├── src/
+├── tests/
+└── CMakeLists.txt
+```
+
+**Option B: Sibling Directory**
+```
+parent/
+├── ggml/                    # Shared GGML
+├── MelBandRoformer.cpp/
+└── whisper.cpp/             # Also uses ../ggml
+```
+
+**Option C: Parent Project**
+```
+MyProject/
+├── external/
+│   └── ggml/
+├── whisper.cpp/
+├── MelBandRoformer.cpp/
+└── CMakeLists.txt          # Defines ggml target
+```
+
+## Benefits
+
+- ✅ **Standalone**: Works independently without parent project
+- ✅ **Reusable**: Shares GGML across multiple projects
+- ✅ **Flexible**: Supports multiple directory layouts
+- ✅ **Build Time**: Avoids duplicate GGML compilation
+- ✅ **Disk Space**: Single GGML copy for multiple projects

+ 314 - 0
README.md

@@ -0,0 +1,314 @@
+# MelBandRoformer.cpp
+
+High-performance C++ inference implementation for the Mel-Band-Roformer audio source separation model.
+
+## 📖 Introduction
+
+This project is a pure C++ inference engine for the Mel-Band-Roformer audio source separation model, built on the [GGML](https://github.com/ggerganov/ggml) tensor library. In principle it supports most Mel-Band-Roformer models, and it is primarily used for extracting vocals or accompaniment from music.
+
+### ✨ Key Features
+
+- 🚀 **High-Performance Inference**: Supports CPU/GPU (CUDA, Vulkan) acceleration
+- 📦 **GGUF Model Format**: Unified model file format for easy distribution
+- 🎚️ **Multiple Quantization Support**: FP32/FP16/Q8_0/Q4_0/Q4_1/Q5_0/Q5_1
+- 🔧 **Easy Deployment**: Only requires executable and GGML library
+- 🎵 **Complete Audio Pipeline**: Built-in STFT/ISTFT and audio I/O
+- ⚡ **Pipeline Optimization**: CPU preprocessing and GPU inference run in parallel
+
+---
+
+## 🚀 Quick Start
+
+### Download
+
+- **Pre-built Binaries**: Download executables for your platform from the [Releases](../../releases) page
+- **GGUF Models**: Download pre-converted model files from [MelBandRoformer-GGUF](https://huggingface.co/MelBandRoformer-GGUF)
+
+### Command Line Usage
+
+```bash
+./mel_band_roformer-cli <model.gguf> <input.wav> <output.wav> [options]
+
+Options:
+  --chunk-size <N>   Chunk size (in samples), defaults to model value
+  --overlap <N>      Number of overlaps, defaults to model value
+  --help, -h         Show help message
+```
+
+**Parameter Description:**
+
+| Parameter | Description |
+|-----------|-------------|
+| `--chunk-size` | Number of audio samples to process at once. Larger values require more VRAM but may improve processing efficiency. Default is typically 352800 (~8 seconds @44100Hz). |
+| `--overlap` | Number of overlaps between chunks. Increasing this value can **improve output quality** as it helps reduce artifacts when reassembling chunks, but will increase inference time. Recommended value is 2-4. |
+
+**Examples:**
+
+```bash
+# Basic usage (using model defaults)
+./mel_band_roformer-cli model.gguf song.wav vocals.wav
+
+# Custom chunking parameters
+./mel_band_roformer-cli model.gguf song.wav vocals.wav --chunk-size 352800 --overlap 2
+
+# High quality mode (increase overlap to reduce artifacts)
+./mel_band_roformer-cli model.gguf song.wav vocals.wav --overlap 4
+```
+
+> **Note**: Input audio must be sampled at **44100 Hz**. Both stereo and mono input are supported; mono input is automatically expanded to stereo.
+
+---
+
+## 🔧 Building from Source
+
+### Prerequisites
+
+- CMake >= 3.16
+- C++17 compatible compiler (MSVC 2019+, GCC 9+, Clang 10+)
+- GGML source code (submodule or local directory)
+
+### Getting GGML Dependency
+
+The project supports multiple ways to obtain GGML:
+
+```bash
+# Option 1: Git Submodule (Recommended)
+git submodule add https://github.com/ggerganov/ggml.git
+git submodule update --init --recursive
+
+# Option 2: Sibling Directory
+cd ..
+git clone https://github.com/ggerganov/ggml.git
+
+# Option 3: Explicit Path
+cmake -B build -DGGML_DIR=/path/to/ggml
+```
+
+See [GGML_DEPENDENCY.md](GGML_DEPENDENCY.md) for details.
+
+### Build Commands
+
+```bash
+# CPU Build
+cmake -B build
+cmake --build build --config Release --parallel
+
+# CUDA Acceleration (Recommended)
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release --parallel
+
+# Enable Tests
+cmake -B build -DGGML_CUDA=ON -DMBR_BUILD_TESTS=ON
+cmake --build build --config Release --parallel
+```
+
+### CMake Options
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `GGML_CUDA` | `ON` | Enable CUDA backend |
+| `MBR_BUILD_CLI` | `ON` | Build command line tool |
+| `MBR_BUILD_TESTS` | `OFF` | Build test suite |
+
+---
+
+## 📦 Model Conversion
+
+If you need to convert models yourself, use `convert_to_gguf.py` to convert PyTorch weights to GGUF format.
+
+**Install Dependencies:**
+
+```bash
+pip install torch numpy pyyaml librosa einops gguf
+```
+
+**Conversion Command:**
+
+```bash
+python scripts/convert_to_gguf.py \
+    --ckpt model.ckpt \
+    --config config.yaml \
+    --out model.gguf \
+    --dtype q8_0
+```
+
+### Supported Quantization Types
+
+| Type | Precision | Size | Recommended Use |
+|------|-----------|------|-----------------|
+| `fp32` | Highest | 100% | Debugging/Baseline |
+| `fp16` | High | 50% | High precision needs |
+| `q8_0` | Good | 25% | **Recommended** (balance of precision and performance) |
+| `q5_1` | Medium | 18% | Resource constrained |
+| `q4_0` | Lower | 12.5% | Extreme compression |
+
+> **Note**: The conversion script currently does not support K-Quant types (Q4_K, Q5_K, etc.). The gguf-py library has not yet implemented K-Quant quantization (it only supports reading/dequantization), and most models do not satisfy the K-Quant requirement that the tensor dimension (`dim`) be divisible by 256.
+
+---
+
+## 💻 C++ API
+
+```cpp
+#include <mel_band_roformer/inference.h>
+#include <mel_band_roformer/audio.h>
+
+// 1. Load audio file
+AudioBuffer input = AudioFile::Load("input.wav");
+
+// 2. Initialize inference engine
+Inference engine("model.gguf");
+
+// 3. Get model's recommended inference parameters
+int chunk_size = engine.GetDefaultChunkSize();   // e.g., 352800
+int num_overlap = engine.GetDefaultNumOverlap(); // e.g., 2
+
+// 4. Run inference (with progress callback)
+auto stems = engine.Process(input.data, chunk_size, num_overlap,
+    [](float progress) {
+        std::cout << "Progress: " << int(progress * 100) << "%" << std::endl;
+    });
+
+// 5. Save result
+AudioBuffer output{stems[0], 2, 44100, stems[0].size()};
+AudioFile::Save("vocals.wav", output);
+```
+
+---
+
+## 🏗️ Project Architecture
+
+```
+MelBandRoformer.cpp/
+├── include/
+│   └── mel_band_roformer/
+│       ├── inference.h        # Inference Engine API
+│       └── audio.h            # Audio I/O API
+├── src/
+│   ├── model.h/cpp            # Model weight loading & graph building (internal)
+│   ├── inference.cpp          # Core inference logic (STFT → Network → ISTFT)
+│   ├── stft.h                 # STFT/ISTFT implementation (Radix-2 FFT)
+│   ├── audio.cpp              # Audio read/write implementation (dr_wav)
+│   └── utils.h/cpp            # NPY loading, tensor comparison tools
+├── third_party/
+│   └── dr_libs/dr_wav.h       # dr_libs audio library
+├── cli/
+│   └── main.cpp               # Command line tool
+├── scripts/
+│   ├── convert_to_gguf.py      # PyTorch → GGUF conversion tool
+│   ├── generate_test_data.py   # Test data generation script
+│   └── generate_test_audio.py  # CI test audio generation (no external files needed)
+├── tests/                     # Unit test suite
+├── models/                    # Model file directory
+└── CMakeLists.txt             # Build configuration
+```
+
+---
+
+## 📐 Core Module Details
+
+### 1. Model Loading (`model.h/cpp`)
+
+The `MelBandRoformer` class is responsible for:
+
+- **GGUF Weight Loading**: Parsing hyperparameters and tensors from file
+- **Buffer Generation**: `freq_indices`, `num_bands_per_freq`, etc.
+- **Computation Graph Building**:
+  - `BuildBandSplitGraph()` - Band split layer
+  - `BuildTransformersGraph()` - Time-frequency Transformer stacking
+  - `BuildMaskEstimatorGraph()` - Mask estimator
+
+### 2. Inference Engine (`inference.cpp`)
+
+The `Inference` class implements the complete audio processing pipeline:
+
+```
+Input Audio → Chunking → STFT → Neural Network → Mask Application → ISTFT → Overlap-Add → Output
+```
+
+**Key Methods**:
+
+| Method | Function |
+|--------|----------|
+| `Process()` | Process complete audio (auto-chunking) |
+| `ProcessChunk()` | Process a single audio chunk |
+| `ComputeSTFT()` | Short-Time Fourier Transform |
+| `PostProcessAndISTFT()` | Mask application and inverse transform |
+
+**Pipeline Optimization**:
+
+```
+Chunk N:   [CPU Preprocess] → [GPU Inference] → [CPU Postprocess]
+Chunk N+1:                   [CPU Preprocess] → [GPU Inference] → [CPU Postprocess]
+                              ↑ Parallel execution
+```
+
+### 3. STFT Implementation (`stft.h`)
+
+Pure C++ implementation, numerically aligned with PyTorch `torch.stft/istft`:
+
+- **Radix-2 Cooley-Tukey FFT**: Efficient O(N log N) implementation
+- **Hann Window**: Periodic window function
+- **Center Padding**: Reflect mode padding
+- **OpenMP Parallelization**: Frame-level parallel acceleration
+
+### 4. Audio I/O (`audio.h/cpp`)
+
+Lightweight audio processing based on [dr_libs](https://github.com/mackron/dr_libs):
+
+- Read: WAV file → `float32` interleaved format
+- Write: `float32` interleaved format → WAV file
+
+---
+
+## 🧪 Testing
+
+### Running Tests
+
+```bash
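+# Set environment variables (bash/zsh)
+export MBR_MODEL_PATH="models/model.gguf"
+export MBR_TEST_DATA_DIR="test_data"
+
+# PowerShell equivalent:
+#   $env:MBR_MODEL_PATH = "models/model.gguf"
+#   $env:MBR_TEST_DATA_DIR = "test_data"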
+
+# Run all tests
+ctest --test-dir build -C Release
+
+# Run specific test
+ctest --test-dir build -C Release -R test_inference
+```
+
+### Test Suite
+
+| Test File | Verification Content |
+|-----------|---------------------|
+| `test_audio` | Audio read/write functionality |
+| `test_component_stft` | STFT/ISTFT numerical precision |
+| `test_component_bandsplit` | Band split layer |
+| `test_component_layers` | Transformer layers |
+| `test_component_mask` | Mask estimator |
+| `test_inference` | End-to-end inference |
+| `test_chunking_logic` | Chunking overlap-add logic |
+
+### Generating Test Data
+
+First clone [Music-Source-Separation-Training](https://github.com/ZFTurbo/Music-Source-Separation-Training) and install its dependencies:
+
+```bash
+git clone https://github.com/ZFTurbo/Music-Source-Separation-Training.git
+cd Music-Source-Separation-Training
+pip install -r requirements.txt
+cd ..
+
+python scripts/generate_test_data.py \
+    --model-repo "Music-Source-Separation-Training" \
+    --audio "test.wav" \
+    --checkpoint "model.ckpt" \
+    --output "test_data"
+```
+
+---
+
+## Acknowledgements
+
+- [ggerganov/ggml](https://github.com/ggerganov/ggml) - Efficient tensor library
+- [ZFTurbo/Music-Source-Separation-Training](https://github.com/ZFTurbo/Music-Source-Separation-Training) - PyTorch reference implementation
+- [dr_libs](https://github.com/mackron/dr_libs) - Lightweight audio library

+ 314 - 0
README.zh.md

@@ -0,0 +1,314 @@
+# MelBandRoformer.cpp
+
+Mel-Band-Roformer 音频源分离模型的高性能 C++ 推理实现。
+
+## 📖 简介
+
+本项目是 Mel-Band-Roformer 音频源分离模型的纯 C++ 推理引擎,基于 [GGML](https://github.com/ggerganov/ggml) 张量库构建。理论上支持大部分 Mel-Band-Roformer 模型,主要用于从音乐中提取人声或伴奏。
+
+### ✨ 主要特性
+
+- 🚀 **高性能推理**:支持 CPU/GPU (CUDA、Vulkan) 加速
+- 📦 **GGUF 模型格式**:统一的模型文件格式,易于分发
+- 🎚️ **多种量化支持**:FP32/FP16/Q8_0/Q4_0/Q4_1/Q5_0/Q5_1
+- 🔧 **易于部署**:仅需可执行文件和 GGML 库
+- 🎵 **完整音频流程**:内置 STFT/ISTFT 和音频 I/O
+- ⚡ **流水线优化**:CPU 预处理与 GPU 推理并行执行
+
+---
+
+## 🚀 快速开始
+
+### 下载
+
+- **预构建程序**:在 [Releases](../../releases) 页面下载对应平台的可执行文件
+- **GGUF 模型**:在 [MelBandRoformer-GGUF](https://huggingface.co/MelBandRoformer-GGUF) 下载预转换的模型文件
+
+### 命令行使用
+
+```bash
+./mel_band_roformer-cli <模型.gguf> <输入.wav> <输出.wav> [选项]
+
+选项:
+  --chunk-size <N>   分块大小(采样点数),默认从模型读取
+  --overlap <N>      重叠数量,默认从模型读取
+  --help, -h         显示帮助信息
+```
+
+**参数说明:**
+
+| 参数 | 说明 |
+|------|------|
+| `--chunk-size` | 每次处理的音频采样点数。较大的值需要更多显存,但可能提高处理效率。默认值通常为 352800(约 8 秒 @44100Hz)。 |
+| `--overlap` | 分块之间的重叠数量。增加此值可以**提高输出质量**,因为它有助于减少重新组合数据块时产生的伪影(artifacts),但会延长推理时间。建议值为 2-4。 |
+
+**示例:**
+
+```bash
+# 基本用法(使用模型默认参数)
+./mel_band_roformer-cli model.gguf song.wav vocals.wav
+
+# 自定义分块参数
+./mel_band_roformer-cli model.gguf song.wav vocals.wav --chunk-size 352800 --overlap 2
+
+# 高质量模式(增加重叠,减少伪影)
+./mel_band_roformer-cli model.gguf song.wav vocals.wav --overlap 4
+```
+
+> **注意**:输入音频的采样率必须为 **44100 Hz**。支持立体声和单声道输入,单声道输入会自动扩展为立体声。
+
+---
+
+## 🔧 从源码构建
+
+### 前置条件
+
+- CMake >= 3.16
+- C++17 兼容编译器(MSVC 2019+, GCC 9+, Clang 10+)
+- GGML 源码(submodule 或本地目录)
+
+### 获取 GGML 依赖
+
+项目支持多种 GGML 获取方式:
+
+```bash
+# 方式一:Git Submodule(推荐)
+git submodule add https://github.com/ggerganov/ggml.git
+git submodule update --init --recursive
+
+# 方式二:兄弟目录
+cd ..
+git clone https://github.com/ggerganov/ggml.git
+
+# 方式三:显式指定路径
+cmake -B build -DGGML_DIR=/path/to/ggml
+```
+
+详见 [GGML_DEPENDENCY.md](GGML_DEPENDENCY.md)。
+
+### 编译命令
+
+```bash
+# CPU 构建
+cmake -B build
+cmake --build build --config Release --parallel
+
+# CUDA 加速(推荐)
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release --parallel
+
+# 启用测试
+cmake -B build -DGGML_CUDA=ON -DMBR_BUILD_TESTS=ON
+cmake --build build --config Release --parallel
+```
+
+### CMake 选项
+
+| 选项 | 默认值 | 说明 |
+|------|--------|------|
+| `GGML_CUDA` | `ON` | 启用 CUDA 后端 |
+| `MBR_BUILD_CLI` | `ON` | 构建命令行工具 |
+| `MBR_BUILD_TESTS` | `OFF` | 构建测试套件 |
+
+---
+
+## 📦 模型转换
+
+如果需要自行转换模型,可使用 `convert_to_gguf.py` 将 PyTorch 权重转换为 GGUF 格式。
+
+**依赖安装:**
+
+```bash
+pip install torch numpy pyyaml librosa einops gguf
+```
+
+**转换命令:**
+
+```bash
+python scripts/convert_to_gguf.py \
+    --ckpt model.ckpt \
+    --config config.yaml \
+    --out model.gguf \
+    --dtype q8_0
+```
+
+### 支持的量化类型
+
+| 类型 | 精度 | 体积 | 推荐场景 |
+|------|------|------|----------|
+| `fp32` | 最高 | 100% | 调试/基准 |
+| `fp16` | 高 | 50% | 高精度需求 |
+| `q8_0` | 较高 | 25% | **推荐**(平衡精度与性能) |
+| `q5_1` | 中等 | 18% | 资源受限 |
+| `q4_0` | 较低 | 12.5% | 极限压缩 |
+
+> **注意**:目前转换脚本不支持 K-Quant 类型(Q4_K、Q5_K 等)。主要原因是 gguf-py 库尚未实现 K-Quant 的量化功能(仅支持读取/反量化),并且大部分模型不满足 K-Quant 要求的张量维度(`dim`)能被 256 整除的条件。
+
+---
+
+## 💻 C++ API
+
+```cpp
+#include <mel_band_roformer/inference.h>
+#include <mel_band_roformer/audio.h>
+
+// 1. 加载音频文件
+AudioBuffer input = AudioFile::Load("input.wav");
+
+// 2. 初始化推理引擎
+Inference engine("model.gguf");
+
+// 3. 获取模型推荐的推理参数
+int chunk_size = engine.GetDefaultChunkSize();   // 如 352800
+int num_overlap = engine.GetDefaultNumOverlap(); // 如 2
+
+// 4. 执行推理(带进度回调)
+auto stems = engine.Process(input.data, chunk_size, num_overlap,
+    [](float progress) {
+        std::cout << "Progress: " << int(progress * 100) << "%" << std::endl;
+    });
+
+// 5. 保存结果
+AudioBuffer output{stems[0], 2, 44100, stems[0].size()};
+AudioFile::Save("vocals.wav", output);
+```
+
+---
+
+## 🏗️ 项目架构
+
+```
+MelBandRoformer.cpp/
+├── include/
+│   └── mel_band_roformer/
+│       ├── inference.h        # 推理引擎 API
+│       └── audio.h            # 音频 I/O API
+├── src/
+│   ├── model.h/cpp            # 模型权重加载与图构建(内部)
+│   ├── inference.cpp          # 核心推理逻辑(STFT → 网络 → ISTFT)
+│   ├── stft.h                 # STFT/ISTFT 实现(Radix-2 FFT)
+│   ├── audio.cpp              # 音频读写实现(dr_wav)
+│   └── utils.h/cpp            # NPY 加载、张量对比工具
+├── third_party/
+│   └── dr_libs/dr_wav.h       # dr_libs 音频库
+├── cli/
+│   └── main.cpp               # 命令行工具
+├── scripts/
+│   ├── convert_to_gguf.py      # PyTorch → GGUF 转换工具
+│   ├── generate_test_data.py   # 测试数据生成脚本
+│   └── generate_test_audio.py  # CI 测试音频生成(无需外部文件)
+├── tests/                     # 单元测试套件
+├── models/                    # 模型文件目录
+└── CMakeLists.txt             # 构建配置
+```
+
+---
+
+## 📐 核心模块详解
+
+### 1. 模型加载 (`model.h/cpp`)
+
+`MelBandRoformer` 类负责:
+
+- **GGUF 权重加载**:从文件解析超参数和张量
+- **缓冲区生成**:`freq_indices`、`num_bands_per_freq` 等
+- **计算图构建**:
+  - `BuildBandSplitGraph()` - 频带分割层
+  - `BuildTransformersGraph()` - 时频 Transformer 堆叠
+  - `BuildMaskEstimatorGraph()` - 掩码估计器
+
+### 2. 推理引擎 (`inference.cpp`)
+
+`Inference` 类实现完整的音频处理流程:
+
+```
+输入音频 → 分块(Chunking) → STFT → 神经网络 → 掩码应用 → ISTFT → 重叠相加 → 输出
+```
+
+**关键方法**:
+
+| 方法 | 功能 |
+|------|------|
+| `Process()` | 处理完整音频(自动分块) |
+| `ProcessChunk()` | 处理单个音频块 |
+| `ComputeSTFT()` | 短时傅里叶变换 |
+| `PostProcessAndISTFT()` | 掩码应用与逆变换 |
+
+**流水线优化**:
+
+```
+Chunk N:   [CPU预处理] → [GPU推理] → [CPU后处理]
+Chunk N+1:              [CPU预处理] → [GPU推理] → [CPU后处理]
+                         ↑ 并行执行
+```
+
+### 3. STFT 实现 (`stft.h`)
+
+纯 C++ 实现,与 PyTorch `torch.stft/istft` 数值对齐:
+
+- **Radix-2 Cooley-Tukey FFT**:高效 O(N log N) 实现
+- **Hann 窗口**:周期性窗函数
+- **中心填充**:反射模式 (reflect padding)
+- **OpenMP 并行**:帧级并行加速
+
+### 4. 音频 I/O (`audio.h/cpp`)
+
+基于 [dr_libs](https://github.com/mackron/dr_libs) 的轻量级音频处理:
+
+- 读取:WAV 文件 → `float32` 交错格式
+- 写入:`float32` 交错格式 → WAV 文件
+
+---
+
+## 🧪 测试
+
+### 运行测试
+
+```bash
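+# 设置环境变量(bash/zsh)
+export MBR_MODEL_PATH="models/model.gguf"
+export MBR_TEST_DATA_DIR="test_data"
+
+# PowerShell 等效写法:
+#   $env:MBR_MODEL_PATH = "models/model.gguf"
+#   $env:MBR_TEST_DATA_DIR = "test_data"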
+
+# 运行所有测试
+ctest --test-dir build -C Release
+
+# 运行特定测试
+ctest --test-dir build -C Release -R test_inference
+```
+
+### 测试套件
+
+| 测试文件 | 验证内容 |
+|----------|----------|
+| `test_audio` | 音频读写功能 |
+| `test_component_stft` | STFT/ISTFT 数值精度 |
+| `test_component_bandsplit` | 频带分割层 |
+| `test_component_layers` | Transformer 层 |
+| `test_component_mask` | 掩码估计器 |
+| `test_inference` | 端到端推理 |
+| `test_chunking_logic` | 分块重叠相加逻辑 |
+
+### 生成测试数据
+
+需要先克隆 [Music-Source-Separation-Training](https://github.com/ZFTurbo/Music-Source-Separation-Training) 并安装其依赖:
+
+```bash
+git clone https://github.com/ZFTurbo/Music-Source-Separation-Training.git
+cd Music-Source-Separation-Training
+pip install -r requirements.txt
+cd ..
+
+python scripts/generate_test_data.py \
+    --model-repo "Music-Source-Separation-Training" \
+    --audio "test.wav" \
+    --checkpoint "model.ckpt" \
+    --output "test_data"
+```
+
+---
+
+## 致谢
+
+- [ggerganov/ggml](https://github.com/ggerganov/ggml) - 高效张量库
+- [ZFTurbo/Music-Source-Separation-Training](https://github.com/ZFTurbo/Music-Source-Separation-Training) - PyTorch 参考实现
+- [dr_libs](https://github.com/mackron/dr_libs) - 轻量级音频库

+ 170 - 0
cli/main.cpp

@@ -0,0 +1,170 @@
+#include "mel_band_roformer/inference.h"
+#include "mel_band_roformer/audio.h"
+#include <iostream>
+#include <string>
+#include <chrono>
+#include <cstdlib>
+
+// Writes the command-line synopsis and the list of supported options to
+// standard error. `program_name` is echoed verbatim in the "Usage:" line.
+void print_usage(const char* program_name) {
+    std::ostream& err = std::cerr;
+    err << "Usage: " << program_name
+        << " <model.gguf> <input.wav> <output.wav> [options]" << std::endl;
+    err << std::endl;
+    err << "Options:" << std::endl
+        << "  --chunk-size <N>   Chunk size in samples (default: from model, fallback 352800)" << std::endl
+        << "  --overlap <N>      Number of overlaps for crossfade (default: from model, fallback 2)" << std::endl
+        << "  --help, -h         Show this help message" << std::endl;
+}
+
+/**
+ * CLI entry point: runs source separation on a WAV file with a GGUF model.
+ *
+ * Usage: <program> <model.gguf> <input.wav> <output.wav> [--chunk-size N] [--overlap N]
+ *
+ * When the model returns more than one stem, each stem is written to
+ * "<output>_stem_<i><ext>"; a single stem is written to <output.wav> as given.
+ *
+ * @return 0 on success or when help was requested; 1 on a usage error or when
+ *         any std::exception is thrown while loading, processing or saving.
+ */
+int main(int argc, char* argv[]) {
+    // Default values (will be overridden by model defaults if not explicitly set)
+    int chunk_size = -1;  // -1 means use model default
+    int num_overlap = -1; // -1 means use model default
+    bool chunk_size_set = false;
+    bool num_overlap_set = false;
+
+    // Check for help flag first so that "--help" works even without the
+    // three positional arguments.
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg == "--help" || arg == "-h") {
+            print_usage(argv[0]);
+            return 0;
+        }
+    }
+
+    if (argc < 4) {
+        print_usage(argv[0]);
+        return 1;
+    }
+
+    std::string model_path = argv[1];
+    std::string input_path = argv[2];
+    std::string output_path = argv[3];
+
+    // Strict integer parsing: the whole argument must be a valid int.
+    // Unlike std::atoi this rejects trailing garbage ("12abc") and
+    // out-of-range values instead of silently truncating / invoking UB.
+    auto parse_int = [](const char* text, int& out) -> bool {
+        try {
+            const std::string s(text);
+            size_t consumed = 0;
+            const int value = std::stoi(s, &consumed);
+            if (consumed != s.size()) {
+                return false;
+            }
+            out = value;
+            return true;
+        } catch (const std::exception&) {
+            // std::invalid_argument or std::out_of_range
+            return false;
+        }
+    };
+
+    // Parse optional arguments
+    for (int i = 4; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg == "--chunk-size" || arg == "--overlap") {
+            // A known option given as the last argument has no value; report
+            // that explicitly instead of calling it an unknown option.
+            if (i + 1 >= argc) {
+                std::cerr << "Error: missing value for " << arg << std::endl;
+                print_usage(argv[0]);
+                return 1;
+            }
+            const char* value = argv[++i];
+            if (arg == "--chunk-size") {
+                if (!parse_int(value, chunk_size) || chunk_size <= 0) {
+                    std::cerr << "Error: chunk-size must be a positive integer" << std::endl;
+                    return 1;
+                }
+                chunk_size_set = true;
+            } else {
+                if (!parse_int(value, num_overlap) || num_overlap < 1) {
+                    std::cerr << "Error: overlap must be at least 1" << std::endl;
+                    return 1;
+                }
+                num_overlap_set = true;
+            }
+        } else {
+            std::cerr << "Unknown option: " << arg << std::endl;
+            print_usage(argv[0]);
+            return 1;
+        }
+    }
+
+    try {
+        std::cout << "Initializing MelBandRoformer..." << std::endl;
+
+        Inference engine(model_path);
+
+        // Use model defaults if not explicitly set by user
+        if (!chunk_size_set) {
+            chunk_size = engine.GetDefaultChunkSize();
+        }
+        if (!num_overlap_set) {
+            num_overlap = engine.GetDefaultNumOverlap();
+        }
+
+        std::cout << "Loading audio: " << input_path << std::endl;
+        AudioBuffer input_audio = AudioFile::Load(input_path);
+
+        std::cout << "Audio loaded: " << input_audio.samples << " samples, "
+                  << input_audio.channels << " channels, "
+                  << input_audio.sampleRate << " Hz" << std::endl;
+
+        // 1. Check Sample Rate
+        if (input_audio.sampleRate != 44100) {
+            throw std::runtime_error("Input audio sample rate must be 44100 Hz. Current: " + std::to_string(input_audio.sampleRate));
+        }
+
+        // 2. Check Channels & Auto-Expand Mono
+        if (input_audio.channels == 1) {
+            std::cout << "[Info] Input is Mono. Expanding to Stereo..." << std::endl;
+            std::vector<float> stereo_data(input_audio.samples * 2);
+            for (size_t i = 0; i < input_audio.samples; ++i) {
+                // Duplicate the mono sample into both interleaved channels.
+                stereo_data[i * 2 + 0] = input_audio.data[i];
+                stereo_data[i * 2 + 1] = input_audio.data[i];
+            }
+            input_audio.data = std::move(stereo_data);
+            input_audio.channels = 2;
+            // Keep the buffer self-consistent: `samples` counts interleaved
+            // values (frames * channels), which doubled with the expansion.
+            input_audio.samples = input_audio.data.size();
+        } else if (input_audio.channels != 2) {
+            // Anything other than mono/stereo is rejected rather than
+            // silently processing only the first two channels.
+            throw std::runtime_error("Input audio must be Stereo (2 channels) or Mono (1 channel). Current: " + std::to_string(input_audio.channels));
+        }
+
+        std::cout << "Processing with chunk_size=" << chunk_size
+                  << ", overlap=" << num_overlap << std::endl;
+        auto process_start = std::chrono::high_resolution_clock::now();
+
+        // Progress Bar Callback: redraws a 50-column bar on the current line.
+        auto progress_callback = [](float progress) {
+            const int barWidth = 50;
+            const int pos = static_cast<int>(barWidth * progress);
+            std::cout << "[";
+            for (int i = 0; i < barWidth; ++i) {
+                if (i < pos) std::cout << "=";
+                else if (i == pos) std::cout << ">";
+                else std::cout << " ";
+            }
+            std::cout << "] " << int(progress * 100.0) << " %\r";
+            std::cout.flush();
+        };
+
+        std::vector<std::vector<float>> output_stems = engine.Process(input_audio.data, chunk_size, num_overlap, progress_callback);
+
+        // Clear progress line
+        std::cout << std::string(70, ' ') << "\r";
+
+        auto process_end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> diff = process_end - process_start;
+        std::cout << "Processed in " << diff.count() << " seconds." << std::endl;
+
+        const int num_stems = static_cast<int>(output_stems.size());
+        std::cout << "Model returned " << num_stems << " stems." << std::endl;
+
+        // Locate the file extension once. A '.' only counts as the start of an
+        // extension when it comes after the last path separator; otherwise a
+        // path such as "out.dir/vocals" would get the suffix inserted into
+        // the directory name.
+        const size_t dot_pos = output_path.find_last_of('.');
+        const size_t sep_pos = output_path.find_last_of("/\\");
+        const bool has_extension =
+            dot_pos != std::string::npos &&
+            (sep_pos == std::string::npos || dot_pos > sep_pos);
+
+        for (int i = 0; i < num_stems; ++i) {
+            // Prepare output filename
+            std::string current_output_path = output_path;
+            if (num_stems > 1) {
+                const std::string suffix = "_stem_" + std::to_string(i);
+                if (has_extension) {
+                    // Insert _stem_i before extension
+                    current_output_path = output_path.substr(0, dot_pos) + suffix + output_path.substr(dot_pos);
+                } else {
+                    current_output_path = output_path + suffix;
+                }
+            }
+
+            // Prepare AudioBuffer. The stem is not needed after saving, so its
+            // storage is moved rather than copied (size is read first).
+            AudioBuffer output_audio_buf;
+            output_audio_buf.samples = output_stems[i].size();
+            output_audio_buf.data = std::move(output_stems[i]);
+            output_audio_buf.channels = 2; // Output is always stereo
+            output_audio_buf.sampleRate = 44100;
+
+            std::cout << "Saving output stem " << i << ": " << current_output_path << std::endl;
+            AudioFile::Save(current_output_path, output_audio_buf);
+        }
+
+        std::cout << "Done!" << std::endl;
+
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}

+ 38 - 0
include/mel_band_roformer/audio.h

@@ -0,0 +1,38 @@
+#pragma once
+#include <vector>
+#include <string>
+#include <stdexcept>
+
+/**
+ * Audio buffer structure for storing audio data.
+ * Data is stored in interleaved format (L, R, L, R, ...) for stereo.
+ *
+ * The struct is a plain aggregate: the numeric members have no default
+ * initializers, so a default-constructed AudioBuffer must have every field
+ * assigned before it is read. Member order matters for brace-initialization.
+ */
+struct AudioBuffer {
+    std::vector<float> data;  // Interleaved samples
+    unsigned int channels;    // Number of interleaved channels (1 = mono, 2 = stereo)
+    unsigned int sampleRate;  // Sample rate in Hz
+    size_t samples;           // Total samples (frames * channels)
+};
+
+/**
+ * Audio file I/O utilities.
+ * Supports WAV format (via dr_wav).
+ *
+ * All members are static; the class is used as a namespace-like holder and
+ * is never instantiated by the code visible here.
+ */
+class AudioFile {
+public:
+    /**
+     * Load audio from a WAV file.
+     * @param path Path to the WAV file
+     * @return AudioBuffer containing the loaded audio data
+     * @throws std::runtime_error if the file cannot be opened
+     */
+    static AudioBuffer Load(const std::string& path);
+    
+    /**
+     * Save audio to a WAV file.
+     * @param path Path to save the WAV file
+     * @param buffer AudioBuffer containing audio data to save
+     *               (presumably channels/sampleRate/samples must all be set
+     *               consistently with data -- TODO confirm in audio.cpp)
+     * @throws std::runtime_error if the file cannot be written
+     */
+    static void Save(const std::string& path, const AudioBuffer& buffer);
+};

+ 99 - 0
include/mel_band_roformer/inference.h

@@ -0,0 +1,99 @@
+#pragma once
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <functional>
+// Forward declaration
+class MelBandRoformer;
+
+// Forward declaration
+namespace ggml { struct context; struct cgraph; }
+
+// Inference engine for a Mel-Band-Roformer model stored in a GGUF file.
+// Input and output audio are interleaved stereo float32 vectors; long inputs
+// are handled by overlap-add chunking (see Process).
+class Inference {
+public:
+    // Creates an engine for the model file at `model_path`.
+    // NOTE(review): failure behavior (throwing vs. deferred error) is defined
+    // in the implementation, which is not visible here -- confirm.
+    Inference(const std::string& model_path);
+    ~Inference();
+
+    // Process a full audio track (interleaved stereo float32)
+    // Uses overlap-add chunking to handle long files
+    // Returns a vector of stems, where each stem is an interleaved stereo float vector
+    // `progress_callback`, when provided, receives a float progress value
+    // (callers in this project treat it as a 0..1 fraction).
+    std::vector<std::vector<float>> Process(const std::vector<float>& input_audio, 
+                               int chunk_size = 352800, 
+                               int num_overlap = 2,
+                               std::function<void(float)> progress_callback = nullptr);
+
+    // Low-level chunk processing (public for testing)
+    std::vector<std::vector<float>> ProcessChunk(const std::vector<float>& chunk_audio);
+
+    // Get model's recommended inference defaults
+    int GetDefaultChunkSize() const;
+    int GetDefaultNumOverlap() const;
+
+    // Static helper for Overlap-Add logic (matches Python exactly)
+    // model_func: input [samples], output [stems][samples] (interleaved stereo)
+    using ModelCallback = std::function<std::vector<std::vector<float>>(const std::vector<float>&)>;
+    static std::vector<std::vector<float>> ProcessOverlapAdd(const std::vector<float>& input_audio, 
+                                                int chunk_size, 
+                                                int num_overlap,
+                                                ModelCallback model_func,
+                                                std::function<void(float)> progress_callback = nullptr); // Optional progress callback
+
+private:
+    // Pipelined Overlap-Add
+    // Takes the same audio/chunking parameters as ProcessOverlapAdd but no
+    // model callback.
+    std::vector<std::vector<float>> ProcessOverlapAddPipelined(const std::vector<float>& input_audio, 
+                                                  int chunk_size, 
+                                                  int num_overlap,
+                                                  std::function<void(float)> progress_callback);
+
+private:
+    std::unique_ptr<MelBandRoformer> model_;
+    
+    // Persistent Graph State
+    // Raw pointers, all initially null.
+    // NOTE(review): presumably released in ~Inference() -- confirm.
+    struct ggml_context* ctx_ = nullptr;
+    struct ggml_cgraph* gf_ = nullptr;
+    struct ggml_gallocr* allocr_ = nullptr;
+    
+    // Cached Input Tensors (owned by ctx_)
+    struct ggml_tensor* input_tensor_ = nullptr;
+    struct ggml_tensor* pos_time_ = nullptr;
+    struct ggml_tensor* pos_freq_ = nullptr;
+    struct ggml_tensor* mask_out_tensor_ = nullptr;
+
+    // Current config state
+    // -1 until a frame count has been cached (presumably the n_frames the
+    // current graph was built for -- TODO confirm against EnsureGraph).
+    int cached_n_frames_ = -1;
+
+    // Pipelined State Data
+    // Per-chunk working set passed between the pipeline steps below.
+    struct ChunkState {
+        int id = -1;
+        std::vector<float> input_audio;       // Original chunk audio
+        std::vector<float> stft_flattened;    // [Prepared Input for GPU]
+        std::vector<std::vector<float>> stft_outputs; // Kept for reconstruction
+        int n_frames = 0;
+        
+        std::vector<float> mask_output;       // Output from GPU
+        std::vector<std::vector<float>> final_audio;       // Result after ISTFT [stems][samples]
+    };
+
+    // Helper to ensure graph is built for specific n_frames
+    // NOTE(review): the meaning of the bool result is not visible here
+    // (presumably success/failure) -- confirm.
+    bool EnsureGraph(int n_frames);
+
+    // Computes the STFT of `input_audio`; results are written to the
+    // `stft_outputs` and `n_frames` out-parameters.
+    void ComputeSTFT(const std::vector<float>& input_audio,
+                     std::vector<std::vector<float>>& stft_outputs,
+                     int& n_frames);
+                     
+    // Rearranges the STFT outputs into the model's input layout, written to
+    // the `model_input_rearranged` out-parameter.
+    void PrepareModelInput(const std::vector<std::vector<float>>& stft_outputs,
+                           int n_frames,
+                           std::vector<float>& model_input_rearranged);
+
+    // Post-processes `mask_output` together with the saved STFT outputs and
+    // runs the inverse STFT; the result is written to `output_audio`.
+    void PostProcessAndISTFT(const std::vector<float>& mask_output,
+                             const std::vector<std::vector<float>>& stft_outputs,
+                             int n_frames,
+                             std::vector<std::vector<float>>& output_audio);
+
+    // Pipeline Steps
+    // Each step operates on a shared ChunkState; PreProcessChunk creates it.
+    std::shared_ptr<ChunkState> PreProcessChunk(const std::vector<float>& chunk_audio, int id);
+    void RunInference(std::shared_ptr<ChunkState> state);
+    void PostProcessChunk(std::shared_ptr<ChunkState> state);
+};

+ 469 - 0
scripts/convert_to_gguf.py

@@ -0,0 +1,469 @@
+#!/usr/bin/env python3
+"""
+Convert Mel-Band-Roformer PyTorch checkpoint to GGUF format.
+
+Supports quantization: FP32, FP16, Q8_0, Q4_0, Q4_1, Q5_0, Q5_1
+Mixed Quantization: Keeps Norms/Biases as FP32 to avoid CUDA alignment issues.
+"""
+
+import os
+import argparse
+import torch
+import numpy as np
+import yaml
+import librosa
+from einops import repeat, reduce, rearrange
+import gguf
+from gguf.quants import quantize, GGMLQuantizationType
+
+
+def generate_buffers(hparams):
+    """
+    Generate the buffers (freq_indices, num_bands_per_freq, etc.)
+    mimicking the logic in MelBandRoformer.__init__.
+    """
+    num_bands = hparams["num_bands"]
+    sample_rate = hparams.get("sample_rate", 44100)
+    stft_n_fft = hparams.get("stft_n_fft", 2048)
+    stereo = hparams.get("stereo", False)
+
+    # 1. Calculate number of frequencies
+    freqs = stft_n_fft // 2 + 1
+
+    # 2. Create Mel Filter Bank
+    mel_filter_bank_numpy = librosa.filters.mel(
+        sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands
+    )
+    mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy)
+
+    # 3. Ensure edge values are positive (required for mask generation)
+    # The exact value doesn't matter as long as it's > 0
+    mel_filter_bank[0, 0] = max(mel_filter_bank[0, 0].item(), 1e-6)
+    mel_filter_bank[-1, -1] = max(mel_filter_bank[-1, -1].item(), 1e-6)
+
+    # 4. Create Masks
+    freqs_per_band = mel_filter_bank > 0
+    assert freqs_per_band.any(dim=0).all(), (
+        "all frequencies need to be covered by all bands"
+    )
+
+    # 5. Generate Indices
+    repeated_freq_indices = repeat(torch.arange(freqs), "f -> b f", b=num_bands)
+    freq_indices = repeated_freq_indices[freqs_per_band]
+
+    if stereo:
+        freq_indices = repeat(freq_indices, "f -> f s", s=2)
+        # s=0 -> 2*f, s=1 -> 2*f+1
+        freq_indices = freq_indices * 2 + torch.arange(2)
+        freq_indices = rearrange(freq_indices, "f s -> (f s)")
+
+    # 6. Aggregate Counts
+    num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum")
+    num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum")
+
+    return {
+        "freq_indices": freq_indices,
+        "num_freqs_per_band": num_freqs_per_band,
+        "num_bands_per_freq": num_bands_per_freq,
+        "freqs_per_band": freqs_per_band,  # Kept if needed, though usually not saved
+    }
+
+
+# ============================================================================
+# Quantization Helper
+# ============================================================================
+
+
+def get_target_quantization_type(dtype_str: str) -> GGMLQuantizationType:
+    mapping = {
+        "f32": GGMLQuantizationType.F32,
+        "fp32": GGMLQuantizationType.F32,
+        "f16": GGMLQuantizationType.F16,
+        "fp16": GGMLQuantizationType.F16,
+        "q8_0": GGMLQuantizationType.Q8_0,
+        "q4_0": GGMLQuantizationType.Q4_0,
+        "q4_1": GGMLQuantizationType.Q4_1,
+        "q5_0": GGMLQuantizationType.Q5_0,
+        "q5_1": GGMLQuantizationType.Q5_1,
+    }
+    return mapping.get(dtype_str.lower(), GGMLQuantizationType.F32)
+
+
+def get_file_type_id(qtype: GGMLQuantizationType) -> int:
+    # See GGUF spec: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+    mapping = {
+        GGMLQuantizationType.F32: 0,
+        GGMLQuantizationType.F16: 1,
+        GGMLQuantizationType.Q4_0: 2,
+        GGMLQuantizationType.Q4_1: 3,
+        # 4 is Q4_1_O (deprecated/legacy?)
+        # 5 is Q4_0_O ?
+        # 6 is Q4_1_O ?
+        GGMLQuantizationType.Q8_0: 7,
+        GGMLQuantizationType.Q5_0: 8,
+        GGMLQuantizationType.Q5_1: 9,
+        GGMLQuantizationType.Q2_K: 10,
+        GGMLQuantizationType.Q3_K: 11,
+        GGMLQuantizationType.Q4_K: 12,
+        GGMLQuantizationType.Q5_K: 13,
+        GGMLQuantizationType.Q6_K: 14,
+        # IQ2_XXS etc might have IDs but let's stick to these for now
+    }
+    return mapping.get(qtype, 0)  # Default to ALL_F32 if unknown
+
+
+def should_quantize(name: str) -> bool:
+    """
+    Determine if a tensor should be quantized.
+    Keep norms and biases as FP32 to avoid CUDA alignment issues.
+    """
+    # Biases are always small and sensitive
+    if "bias" in name:
+        return False
+
+    # Norm weights (gamma) must be F32 to avoid mixed-type mul issues in CUDA
+    if "norm.weight" in name:
+        return False
+
+    # Quantize all other "weight" matrices (Linear, Conv, Embedding if any)
+    if "weight" in name:
+        return True
+
+    return False
+
+
+# ============================================================================
+# Key Name Mapping
+# ============================================================================
+
+
+def map_key_name(key: str) -> str:
+    """
+    Map PyTorch state_dict keys to GGUF format (blk.{bid}.*).
+    Standardizes suffixes: gamma -> weight, beta -> bias.
+    """
+
+    def standardize_suffix(param_name: str) -> str:
+        if param_name == "gamma":
+            return "weight"
+        if param_name == "beta":
+            return "bias"
+        return param_name
+
+    parts = key.split(".")
+    suffix = standardize_suffix(parts[-1])
+
+    # Transformer Layers
+    if key.startswith("layers."):
+        layer_idx = parts[1]
+        tf_idx = parts[2]  # 0=Time, 1=Freq
+        type_str = "time" if tf_idx == "0" else "freq"
+
+        # Final Norm: layers.0.0.norm.gamma
+        if len(parts) >= 5 and parts[3] == "norm":
+            return f"blk.{layer_idx}.{type_str}_norm.{suffix}"
+
+        # Sub-layers (Attention=0, FF=1)
+        if len(parts) >= 6 and parts[3] == "layers":
+            block_sub_idx = parts[5]
+
+            if block_sub_idx == "0":  # Attention
+                if len(parts) > 6:
+                    sub_name = parts[6]
+                    if sub_name == "norm":
+                        return f"blk.{layer_idx}.{type_str}_attn_norm.{suffix}"
+                    if sub_name == "to_qkv":
+                        return f"blk.{layer_idx}.{type_str}_attn_qkv.{suffix}"
+                    if sub_name == "to_out":
+                        return f"blk.{layer_idx}.{type_str}_attn_out.{suffix}"
+                    if sub_name == "to_gates":
+                        return f"blk.{layer_idx}.{type_str}_attn_gate.{suffix}"
+
+            elif block_sub_idx == "1":  # FeedForward
+                if len(parts) >= 8 and parts[6] == "net":
+                    net_idx = parts[7]
+                    if net_idx == "0":
+                        return f"blk.{layer_idx}.{type_str}_ff_norm.{suffix}"
+                    if net_idx == "1":
+                        return f"blk.{layer_idx}.{type_str}_ff_in.{suffix}"
+                    if net_idx == "4":
+                        return f"blk.{layer_idx}.{type_str}_ff_out.{suffix}"
+
+    # BandSplit
+    if key.startswith("band_split.to_features"):
+        band_idx = parts[2]
+        layer_idx = parts[3]  # 0=Norm, 1=Linear
+
+        if layer_idx == "0":
+            return f"band_split.{band_idx}.norm.{suffix}"
+        if layer_idx == "1":
+            return f"band_split.{band_idx}.linear.{suffix}"
+
+    # Mask Estimator
+    if key.startswith("mask_estimators"):
+        est_idx = parts[1]
+        freq_idx = parts[3]
+        layer_idx = parts[5]  # 0, 2, 4
+        return f"mask_est.{est_idx}.freq.{freq_idx}.mlp.{layer_idx}.{suffix}"
+
+    return key.replace(".", "_")
+
+
+# ============================================================================
+# Main Conversion
+# ============================================================================
+
+
+def convert(
+    ckpt_path: str,
+    output_path: str,
+    config_path: str,
+    dtype: str = "fp32",
+):
+    """
+    Convert PyTorch checkpoint to GGUF format.
+    """
+    print(f"Loading checkpoint: {ckpt_path}")
+    checkpoint = torch.load(ckpt_path, map_location="cpu")
+
+    if "state_dict" in checkpoint:
+        state_dict = checkpoint["state_dict"]
+    elif "model" in checkpoint:
+        state_dict = checkpoint["model"]
+    else:
+        state_dict = checkpoint
+
+    print(f"Loading config: {config_path}")
+    with open(config_path) as f:
+        config_dict = yaml.load(f, Loader=yaml.FullLoader)
+
+    # Generate buffers
+    print("Generating buffers (standalone)...")
+    buffers = generate_buffers(config_dict["model"])
+    freq_indices = buffers["freq_indices"]
+    num_bands_per_freq = buffers["num_bands_per_freq"]
+    num_freqs_per_band = buffers["num_freqs_per_band"]
+
+    # Create GGUF writer
+    gguf_writer = gguf.GGUFWriter(output_path, "mel_band_roformer")
+
+    # =========================================================================
+    # 1. Write Standard GGUF Metadata
+    # =========================================================================
+    print("Writing metadata...")
+
+    # General metadata
+    gguf_writer.add_name("Mel-Band-Roformer Vocal Separator")
+    gguf_writer.add_description("Audio source separation model for vocal extraction")
+
+    # Determine types
+    target_qtype = get_target_quantization_type(dtype)
+    file_type_id = get_file_type_id(target_qtype)
+
+    gguf_writer.add_file_type(file_type_id)
+
+    # Quantization version (required when quantized)
+    if target_qtype != GGMLQuantizationType.F32:
+        gguf_writer.add_quantization_version(2)
+
+    # Calculate parameter count
+    total_params = 0
+    for key, tensor in state_dict.items():
+        if "freq_indices" in key or "num_bands" in key:
+            continue
+        total_params += tensor.numel()
+
+    print(f"Total parameters: {total_params}")
+    gguf_writer.add_uint64("general.parameter_count", total_params)
+
+    # =========================================================================
+    # 2. Write Hyperparameters
+    # =========================================================================
+    print("Writing hyperparameters...")
+    hparams = config_dict["model"]
+
+    # Architecture specific parameters
+    gguf_writer.add_uint32("mel_band_roformer.dim", hparams["dim"])
+    gguf_writer.add_uint32("mel_band_roformer.depth", hparams["depth"])
+    gguf_writer.add_uint32("mel_band_roformer.num_bands", hparams["num_bands"])
+
+    # STFT parameters
+    gguf_writer.add_uint32(
+        "mel_band_roformer.stft_n_fft", hparams.get("stft_n_fft", 2048)
+    )
+    # Remove default for hop_length, must be present or fail/warn
+    gguf_writer.add_uint32(
+        "mel_band_roformer.stft_hop_length", hparams.get("stft_hop_length", 441)
+    )
+    gguf_writer.add_uint32(
+        "mel_band_roformer.stft_win_length", hparams.get("stft_win_length", 2048)
+    )
+    gguf_writer.add_bool(
+        "mel_band_roformer.stft_normalized", hparams.get("stft_normalized", False)
+    )
+    gguf_writer.add_bool(
+        "mel_band_roformer.zero_dc", hparams.get("zero_dc", True)
+    )  # Defaults to True in reference implementation
+
+    # Architecture details
+    gguf_writer.add_uint32("mel_band_roformer.num_stems", hparams.get("num_stems", 1))
+    gguf_writer.add_bool("mel_band_roformer.stereo", hparams.get("stereo", False))
+    gguf_writer.add_uint32(
+        "mel_band_roformer.sample_rate", hparams.get("sample_rate", 44100)
+    )
+
+    gguf_writer.add_uint32(
+        "mel_band_roformer.time_transformer_depth",
+        hparams.get("time_transformer_depth", 0),
+    )
+    gguf_writer.add_uint32(
+        "mel_band_roformer.freq_transformer_depth",
+        hparams.get("freq_transformer_depth", 0),
+    )
+    gguf_writer.add_uint32(
+        "mel_band_roformer.linear_transformer_depth",
+        hparams.get("linear_transformer_depth", 0),
+    )
+
+    gguf_writer.add_uint32(
+        "mel_band_roformer.mask_estimator_depth", hparams.get("mask_estimator_depth", 1)
+    )
+    gguf_writer.add_uint32("mel_band_roformer.dim_head", hparams.get("dim_head", 64))
+    gguf_writer.add_uint32("mel_band_roformer.heads", hparams.get("heads", 8))
+    gguf_writer.add_uint32(
+        "mel_band_roformer.mlp_expansion_factor", hparams.get("mlp_expansion_factor", 4)
+    )
+    gguf_writer.add_bool(
+        "mel_band_roformer.skip_connection", hparams.get("skip_connection", False)
+    )
+
+    # =========================================================================
+    # 3. Write Inference Defaults (Optional, can be overridden at runtime)
+    # =========================================================================
+    print("Writing inference defaults...")
+
+    inference_config = config_dict.get("inference", {})
+    audio_config = config_dict.get("audio", {})
+
+    # chunk_size: prefer inference.chunk_size, fallback to audio.chunk_size
+    default_chunk_size = inference_config.get(
+        "chunk_size", audio_config.get("chunk_size", 352800)
+    )
+    # num_overlap: from inference section
+    default_num_overlap = inference_config.get("num_overlap", 0)
+
+    gguf_writer.add_uint32("mel_band_roformer.default_chunk_size", default_chunk_size)
+    gguf_writer.add_uint32("mel_band_roformer.default_num_overlap", default_num_overlap)
+
+    # =========================================================================
+    # 4. Write Buffers (Always FP32/I32)
+    # =========================================================================
+    print("Writing buffers...")
+
+    # freq_indices (int32)
+    gguf_writer.add_tensor("buffer_freq_indices", freq_indices.numpy().astype(np.int32))
+    # num_bands_per_freq (int32)
+    gguf_writer.add_tensor(
+        "buffer_num_bands_per_freq", num_bands_per_freq.numpy().astype(np.int32)
+    )
+    # num_freqs_per_band (int32)
+    gguf_writer.add_tensor(
+        "buffer_num_freqs_per_band", num_freqs_per_band.numpy().astype(np.int32)
+    )
+
+    # =========================================================================
+    # 5. Write Weights (Mixed Quantization)
+    # =========================================================================
+    print(f"Writing weights ({dtype} -> {target_qtype.name})...")
+    print("Strategy: Quantize weights, Keep Norm/Bias as F32")
+
+    n_tensors = 0
+    n_quantized = 0
+
+    for key, tensor in state_dict.items():
+        new_key = map_key_name(key)
+
+        # Skip buffers
+        if (
+            "freq_indices" in key
+            or "num_bands_per_freq" in key
+            or "num_freqs_per_band" in key
+        ):
+            continue
+
+        data = tensor.numpy().astype(np.float32)
+
+        # Decide whether to quantize
+        is_quantized = False
+
+        if target_qtype != GGMLQuantizationType.F32 and should_quantize(new_key):
+            try:
+                # Use gguf-py built-in quantization
+                quantized_data = quantize(data, target_qtype)
+                # Pass raw_dtype so GGUFWriter knows how to treat the byte array (for Q types)
+                # or float array (for F16)
+                gguf_writer.add_tensor(new_key, quantized_data, raw_dtype=target_qtype)
+                is_quantized = True
+                n_quantized += 1
+            except Exception as e:
+                print(
+                    f"Warning: Failed to quantize {new_key} to {target_qtype.name}, falling back to F32. Error: {e}"
+                )
+                gguf_writer.add_tensor(new_key, data)
+        else:
+            # Keep as F32
+            gguf_writer.add_tensor(new_key, data)
+
+        status = target_qtype.name if is_quantized else "F32"
+        print(f"  {new_key:<50} | {str(data.shape):<20} | {status}")
+        n_tensors += 1
+
+    # =========================================================================
+    # 6. Write File
+    # =========================================================================
+    print(f"\nWriting GGUF to {output_path}")
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+    gguf_writer.close()
+
+    file_size = os.path.getsize(output_path)
+    print(f"\nDone! Converted {n_tensors} tensors ({n_quantized} quantized)")
+    print(f"Output file size: {file_size / 1024 / 1024:.2f} MB")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert Mel-Band-Roformer checkpoint to GGUF format with Mixed Quantization",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python convert_to_gguf.py --ckpt model.ckpt --config config.yaml --out model_f16.gguf --dtype fp16
+  python convert_to_gguf.py --ckpt model.ckpt --config config.yaml --out model_q8.gguf --dtype q8_0
+""",
+    )
+    parser.add_argument(
+        "--ckpt", type=str, required=True, help="Path to PyTorch checkpoint"
+    )
+    parser.add_argument("--config", type=str, required=True, help="Path to YAML config")
+    parser.add_argument("--out", type=str, required=True, help="Output GGUF file path")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="fp32",
+        choices=[
+            "fp32",
+            "f32",
+            "fp16",
+            "f16",
+            "q8_0",
+            "q4_0",
+            "q4_1",
+            "q5_0",
+            "q5_1",
+        ],
+        help="Target quantization type. Norms/Biases will be kept as F32. (K-Quants not supported due to dim=384)",
+    )
+    args = parser.parse_args()
+
+    convert(args.ckpt, args.out, args.config, args.dtype)

+ 132 - 0
scripts/generate_test_audio.py

@@ -0,0 +1,132 @@
+"""
+Generate synthetic test audio for CI testing.
+
+Creates deterministic test signals without requiring any external audio files,
+ensuring reproducibility and avoiding copyright concerns.
+
+Usage:
+    python generate_test_audio.py --output test_audio.wav
+"""
+
+import argparse
+import numpy as np
+
+try:
+    import soundfile as sf
+except ImportError:
+    sf = None
+
+
+def generate_test_audio(
+    output_path: str,
+    duration: float = 5.0,
+    sample_rate: int = 44100,
+) -> None:
+    """
+    Generate deterministic test audio (sine wave synthesis).
+
+    Creates a mixture of "vocal-like" and "accompaniment-like" sine waves
+    that covers a reasonable frequency range for testing audio separation.
+
+    Args:
+        output_path: Path to save the output WAV file
+        duration: Duration in seconds
+        sample_rate: Sample rate in Hz
+    """
+    if sf is None:
+        raise ImportError(
+            "soundfile is required for audio generation. "
+            "Install with: pip install soundfile"
+        )
+
+    t = np.linspace(0, duration, int(sample_rate * duration), dtype=np.float32)
+
+    # Simulate vocals: multiple sine waves (fundamental + harmonics)
+    # Using A3 (220 Hz) as base frequency
+    vocals = (
+        0.50 * np.sin(2 * np.pi * 220 * t)  # A3 fundamental
+        + 0.30 * np.sin(2 * np.pi * 440 * t)  # A4 harmonic
+        + 0.15 * np.sin(2 * np.pi * 880 * t)  # A5 harmonic
+        + 0.05 * np.sin(2 * np.pi * 1760 * t)  # A6 harmonic
+    )
+
+    # Add slight vibrato to vocals (more realistic)
+    vibrato = 0.02 * np.sin(2 * np.pi * 5 * t)  # 5 Hz vibrato
+    vocals = vocals * (1 + vibrato)
+
+    # Simulate accompaniment: different frequency content
+    accompaniment = (
+        0.40 * np.sin(2 * np.pi * 110 * t)  # A2 bass
+        + 0.30 * np.sin(2 * np.pi * 330 * t)  # E4
+        + 0.20 * np.sin(2 * np.pi * 660 * t)  # E5
+        + 0.10 * np.sin(2 * np.pi * 82.41 * t)  # E2 sub-bass
+    )
+
+    # Add slight amplitude envelope to make it more interesting
+    envelope = np.ones_like(t)
+    fade_samples = int(0.1 * sample_rate)  # 100ms fade
+    envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
+    envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
+
+    # Mix vocals and accompaniment
+    mix = (vocals + accompaniment) * envelope
+
+    # Normalize to prevent clipping
+    max_val = np.max(np.abs(mix))
+    if max_val > 0:
+        mix = mix / max_val * 0.9  # Leave some headroom
+
+    # Create stereo (identical channels for simplicity)
+    stereo = np.stack([mix, mix], axis=-1)
+
+    # Save as WAV
+    sf.write(output_path, stereo, sample_rate, subtype="PCM_16")
+    print(f"Generated: {output_path}")
+    print(f"  Duration: {duration}s")
+    print(f"  Sample rate: {sample_rate} Hz")
+    print("  Channels: 2 (stereo)")
+    print("  Format: PCM_16")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate synthetic test audio for CI testing",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+This script generates deterministic test audio using sine wave synthesis.
+The output is suitable for testing audio processing pipelines without
+requiring real audio files.
+
+Example:
+    python generate_test_audio.py --output test.wav
+    python generate_test_audio.py --output test.wav --duration 10 --sample-rate 48000
+""",
+    )
+
+    parser.add_argument("--output", "-o", required=True, help="Output WAV file path")
+    parser.add_argument(
+        "--duration",
+        "-d",
+        type=float,
+        default=5.0,
+        help="Duration in seconds (default: 5.0)",
+    )
+    parser.add_argument(
+        "--sample-rate",
+        "-sr",
+        type=int,
+        default=44100,
+        help="Sample rate in Hz (default: 44100)",
+    )
+
+    args = parser.parse_args()
+
+    generate_test_audio(
+        output_path=args.output,
+        duration=args.duration,
+        sample_rate=args.sample_rate,
+    )
+
+
+if __name__ == "__main__":
+    main()

+ 508 - 0
scripts/generate_test_data.py

@@ -0,0 +1,508 @@
+"""
+Generate minimal test data for MelBandRoformer.cpp verification.
+
+This script generates ONLY the essential tensors needed for C++ tests:
+- input_audio.npy  (for test_inference)
+- output_audio.npy (for test_inference)
+- band_split_in.npy (for test_component_bandsplit)
+- after_band_split.npy (for test_component_bandsplit, test_component_layers)
+- before_mask_est.npy (for test_component_layers, test_component_mask)
+- mask_est0.npy (for test_component_mask)
+- chunk_in.npy (for test_chunking_logic)
+- chunk_out.npy (for test_chunking_logic)
+
+Requirements:
+    This script requires the Music-Source-Separation-Training repository:
+    https://github.com/ZFTurbo/Music-Source-Separation-Training
+
+    Clone it first:
+        git clone https://github.com/ZFTurbo/Music-Source-Separation-Training.git
+
+Usage:
+    python generate_test_data.py --model-repo /path/to/Music-Source-Separation-Training \\
+        --audio test.wav --checkpoint model.ckpt --output test_data
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+import soundfile as sf
+import yaml
+from ml_collections import ConfigDict
+from einops import rearrange, pack, unpack
+
+# Model imports are deferred until the model-repo path is known;
+# load_model_module() fills in the placeholders below.
+MelBandRoformer = None
+pack_one = None
+unpack_one = None
+# Inference utility
+inference_func = None
+
+MODEL_REPO_URL = "https://github.com/ZFTurbo/Music-Source-Separation-Training"
+
+
+class MockModel(torch.nn.Module):
+    """Identity model for testing chunking logic."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        # x shape: [Batch, Channels, Time] or [Batch, Time]
+        # Return same as input (Identity)
+        return x
+
+
+def load_model_module(model_repo_path: Path):
+    """Dynamically load the MelBandRoformer model from the specified repository."""
+    global MelBandRoformer, pack_one, unpack_one, inference_func
+
+    if not model_repo_path.exists():
+        print("\n" + "=" * 70)
+        print("ERROR: Model repository not found!")
+        print("=" * 70)
+        print(f"\nPath: {model_repo_path}")
+        print("\nThis script requires the Music-Source-Separation-Training repository.")
+        print("\nPlease clone it first:")
+        print(f"  git clone {MODEL_REPO_URL}")
+        print(
+            "\nThen run this script with --model-repo pointing to the cloned directory."
+        )
+        print("=" * 70)
+        sys.exit(1)
+
+    models_path = model_repo_path / "models"
+    if not models_path.exists():
+        print("\n" + "=" * 70)
+        print("ERROR: Invalid repository structure!")
+        print("=" * 70)
+        print(f"\nThe 'models' directory was not found in: {model_repo_path}")
+        print("=" * 70)
+        sys.exit(1)
+
+    # Add to path and import
+    sys.path.insert(0, str(model_repo_path))
+
+    # Mock loralib to allow importing model_utils without installing it
+    from unittest.mock import MagicMock
+
+    if "loralib" not in sys.modules:
+        sys.modules["loralib"] = MagicMock()
+
+    # Import from new structure (Music-Source-Separation-Training)
+    try:
+        from models.bs_roformer.mel_band_roformer import (
+            MelBandRoformer as _MelBandRoformer,
+        )
+        from models.bs_roformer.mel_band_roformer import (
+            pack_one as _pack_one,
+            unpack_one as _unpack_one,
+        )
+
+        pack_one = _pack_one
+        unpack_one = _unpack_one
+        MelBandRoformer = _MelBandRoformer
+
+        # Import demix from utils.model_utils
+        from utils.model_utils import demix
+
+        inference_func = demix
+
+        print(f"  Loaded model from: {model_repo_path}")
+        return
+    except ImportError as e:
+        print("\n" + "=" * 70)
+        print("ERROR: Failed to import model!")
+        print("=" * 70)
+        print(f"\nImport error: {e}")
+        print(
+            "\nPlease ensure the repository is complete and dependencies are installed."
+        )
+        sys.exit(1)
+
+
+def save_tensor(
+    output_dir: Path, name: str, tensor, subdir: str = "activations"
+) -> dict:
+    """Save tensor to .npy file."""
+    path = output_dir / subdir / f"{name}.npy"
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    if isinstance(tensor, torch.Tensor):
+        tensor = tensor.detach().cpu()
+        if tensor.dtype in [torch.int64, torch.int32, torch.bool]:
+            tensor = tensor.float()
+        tensor = tensor.numpy()
+
+    if isinstance(tensor, np.ndarray) and tensor.dtype != np.float32:
+        tensor = tensor.astype(np.float32)
+
+    np.save(path, tensor)
+    print(f"  Saved {name}: shape={list(tensor.shape)}")
+    return {"name": name, "shape": list(tensor.shape), "path": str(path)}
+
+
+def generate_chunking_data(output_dir: Path, config: ConfigDict):
+    """Generate input/output data for verifying chunking logic."""
+    print("\n[Chunking] Generating overlap-add debug data...")
+
+    if inference_func is None:
+        print(
+            "  Warning: Inference function not found, skipping chunking data generation."
+        )
+        return
+
+    # Create Mock Model (Identity)
+    model = MockModel()
+    device = torch.device("cpu")
+
+    # Create input: Ramp signal
+    # Size > 2 chunks to test overlap logic
+    # Use fixed values to match C++ test_chunking_logic.cpp (lines 76-77)
+    chunk_size = 352800
+    num_overlap = 2
+
+    print(f"  Chunk size: {chunk_size}, Overlap: {num_overlap}")
+
+    total_len = chunk_size * 2 + 10000
+    inputs = np.linspace(0, 1, total_len).astype(np.float32)
+    # Make stereo [2, T]
+    inputs = np.stack([inputs, inputs], axis=0)
+
+    # Save input (C-order, transposed to [T, 2] for C++ ease if needed, but C++ load_npy handles it)
+    save_tensor(output_dir, "chunk_in", inputs.T, subdir=".")
+
+    # Run Inference
+    mixture = torch.tensor(inputs, dtype=torch.float32)
+
+    # demix(config, model, mix, device, model_type)
+    # generic mode (not htdemucs) uses 'generic'
+    # It returns dict {instr: waveform} or array
+    res = inference_func(config, model, mixture, device, model_type="generic")
+
+    if isinstance(res, dict):
+        # Pick the first instrument
+        first_key = list(res.keys())[0]
+        output = res[first_key]
+    else:
+        output = res
+
+    # Save output
+    if isinstance(output, torch.Tensor):
+        output = output.cpu().numpy()
+
+    save_tensor(output_dir, "chunk_out", output.T, subdir=".")
+
+
+def generate_test_data(
+    model_repo: str,
+    audio_file: str,
+    checkpoint: str,
+    config_file: str,
+    output_dir: str,
+    audio_start: float = 2.0,
+    audio_end: float = 5.0,
+) -> int:
+    """Generate test data for C++ verification."""
+
+    # Load model module from specified repository
+    model_repo_path = Path(model_repo)
+    load_model_module(model_repo_path)
+
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    print("=" * 70)
+    print("MelBandRoformer Test Data Generator")
+    print("=" * 70)
+
+    # 1. Load config and model
+    print(f"\n[1/4] Loading model from {checkpoint}")
+
+    with open(config_file) as f:
+        config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader))
+
+    model = MelBandRoformer(**dict(config.model))
+    state_dict = torch.load(checkpoint, map_location="cpu")
+    model.load_state_dict(state_dict)
+    model.eval()
+
+    print(f"  Config: depth={config.model.depth}, dim={config.model.dim}")
+
+    # 2. Load audio
+    print(f"\n[2/4] Loading audio ({audio_start}s - {audio_end}s) from {audio_file}")
+
+    audio, sr = sf.read(audio_file)
+    start_sample = int(audio_start * sr)
+    end_sample = int(audio_end * sr)
+    audio_segment = audio[start_sample:end_sample]
+
+    if len(audio_segment.shape) == 1:
+        audio_segment = np.stack([audio_segment, audio_segment], axis=-1)
+
+    # [batch, channels, samples]
+    audio_tensor = torch.tensor(audio_segment.T, dtype=torch.float32).unsqueeze(0)
+    print(f"  Audio shape: {audio_tensor.shape}")
+
+    # 3. Run instrumented forward pass
+    print("\n[3/4] Running instrumented forward pass...")
+
+    captured = {}
+
+    with torch.no_grad():
+        device = audio_tensor.device
+        raw_audio = audio_tensor
+
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
+
+        batch, channels, raw_audio_length = raw_audio.shape
+        istft_length = raw_audio_length if model.match_input_audio_length else None
+
+        # STFT
+        raw_audio_packed, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
+        stft_window = model.stft_window_fn(device=device)
+        stft_repr = torch.stft(
+            raw_audio_packed,
+            **model.stft_kwargs,
+            window=stft_window,
+            return_complex=True,
+        )
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
+        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
+
+        # Frequency indexing
+        batch_arange = torch.arange(batch, device=device)[..., None]
+        x = stft_repr[batch_arange, model.freq_indices]
+        x = rearrange(x, "b f t c -> b t (f c)")
+
+        # ===== CAPTURE: BandSplit Input =====
+        captured["band_split_in"] = x.clone()
+
+        # BandSplit
+        x = model.band_split(x)
+
+        # ===== CAPTURE: After BandSplit (= Transformer Input) =====
+        captured["after_band_split"] = x.clone()
+
+        # Transformer Layers
+        for layer_idx, (time_transformer, freq_transformer) in enumerate(model.layers):
+            # Time Transformer
+            x = rearrange(x, "b t f d -> b f t d")
+            x, ps = pack([x], "* t d")
+            x = time_transformer(x)
+            (x,) = unpack(x, ps, "* t d")
+            x = rearrange(x, "b f t d -> b t f d")
+
+            # Freq Transformer
+            x, ps = pack([x], "* f d")
+            x = freq_transformer(x)
+            (x,) = unpack(x, ps, "* f d")
+
+        # ===== CAPTURE: Before Mask Estimator (= Transformer Output) =====
+        captured["before_mask_est"] = x.clone()
+
+        # Mask Estimator (just first one for testing)
+        mask0 = model.mask_estimators[0](x)
+
+        # ===== CAPTURE: Mask Estimator Output =====
+        captured["mask_est0"] = mask0.clone()
+
+        # Continue with full forward pass for output
+        num_stems = len(model.mask_estimators)
+        masks = torch.stack([fn(x) for fn in model.mask_estimators], dim=1)
+        masks = rearrange(masks, "b n t (f c) -> b n f t c", c=2)
+
+        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
+        stft_repr = torch.view_as_complex(stft_repr)
+        masks = torch.view_as_complex(masks)
+        masks = masks.type(stft_repr.dtype)
+
+        from einops import repeat
+
+        scatter_indices = repeat(
+            model.freq_indices,
+            "f -> b n f t",
+            b=batch,
+            n=num_stems,
+            t=stft_repr.shape[-1],
+        )
+        stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=num_stems)
+        masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(
+            2, scatter_indices, masks
+        )
+
+        denom = repeat(model.num_bands_per_freq, "f -> (f r) 1", r=channels)
+        masks_averaged = masks_summed / denom.clamp(min=1e-8)
+
+        stft_repr = stft_repr * masks_averaged
+
+        # ISTFT
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=model.audio_channels
+        )
+
+        if getattr(model, "zero_dc", False):
+            # Zero out DC component
+            stft_repr = stft_repr.clone()
+            stft_repr[:, 0, :] = 0.0
+
+        recon_audio = torch.istft(
+            stft_repr,
+            **model.stft_kwargs,
+            window=stft_window,
+            return_complex=False,
+            length=istft_length,
+        )
+        recon_audio = rearrange(
+            recon_audio,
+            "(b n s) t -> b n s t",
+            b=batch,
+            s=model.audio_channels,
+            n=num_stems,
+        )
+
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
+            captured["output_audio"] = recon_audio.clone()
+        else:
+            # Capture Stem 0 for verification
+            captured["output_audio"] = recon_audio[:, 0, :, :].clone()
+
+            # Capture Stem 0 for verification
+            captured["output_audio"] = recon_audio[:, 0, :, :].clone()
+
+    # 4. Generate Chunking Debug Data
+    generate_chunking_data(output_path, config)
+
+    # 5. Save tensors
+    print(f"\n[4/5] Saving test data to {output_dir}")
+
+    # Input audio
+    save_tensor(output_path, "input_audio", audio_tensor)
+
+    # Captured tensors
+    for name, tensor in captured.items():
+        save_tensor(output_path, name, tensor)
+
+    # Verify outputs match normal forward pass
+    print("\n[Verification] Checking output matches model.forward()...")
+    with torch.no_grad():
+        baseline = model(audio_tensor)
+        if hasattr(model, "num_stems") and model.num_stems > 1:
+            baseline = baseline[:, 0, :, :]
+
+    diff = (baseline - captured["output_audio"]).abs()
+    max_diff = diff.max().item()
+
+    if max_diff > 1e-6:
+        print(f"  ✗ FAILED: max_diff = {max_diff:.2e}")
+        return 1
+    else:
+        print(f"  ✓ PASSED: max_diff = {max_diff:.2e}")
+
+    print("\n" + "=" * 70)
+    print("Test data generation complete!")
+    print(f"  Output: {output_dir}/activations/")
+    print(f"  Files: {len(captured) + 1} tensors")
+    print("=" * 70)
+
+    return 0
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate test data for MelBandRoformer.cpp",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=f"""
+Requirements:
+  This script requires the original Mel-Band-Roformer-Vocal-Model repository.
+  
+  Clone it first:
+    git clone {MODEL_REPO_URL}
+  
+  Then specify the path with --model-repo.
+
+Example:
+  python generate_test_data.py \\
+    --model-repo /path/to/Mel-Band-Roformer-Vocal-Model \\
+    --audio test.wav \\
+    --checkpoint model.ckpt \\
+    --output test_data
+""",
+    )
+
+    parser.add_argument(
+        "--model-repo",
+        required=True,
+        help=f"Path to Mel-Band-Roformer-Vocal-Model repository (clone from {MODEL_REPO_URL})",
+    )
+    parser.add_argument("--audio", required=True, help="Input audio file (WAV)")
+    parser.add_argument(
+        "--checkpoint", required=True, help="Model checkpoint file (.ckpt)"
+    )
+    parser.add_argument(
+        "--config",
+        help="Model config YAML file (default: <model-repo>/configs/config_vocals_mel_band_roformer.yaml)",
+    )
+    parser.add_argument(
+        "--output", default="test_data", help="Output directory for test data"
+    )
+    parser.add_argument(
+        "--start",
+        type=float,
+        default=2.0,
+        help="Audio start time in seconds (default: 2.0)",
+    )
+    parser.add_argument(
+        "--end",
+        type=float,
+        default=5.0,
+        help="Audio end time in seconds (default: 5.0)",
+    )
+
+    args = parser.parse_args()
+
+    # Resolve paths
+    model_repo_path = Path(args.model_repo).resolve()
+    audio_path = Path(args.audio).resolve()
+    checkpoint_path = Path(args.checkpoint).resolve()
+    output_path = Path(args.output).resolve()
+
+    # Config defaults to model-repo/configs/...
+    if args.config:
+        config_path = Path(args.config).resolve()
+    else:
+        config_path = (
+            model_repo_path / "configs" / "config_vocals_mel_band_roformer.yaml"
+        )
+
+    # Validate paths
+    if not audio_path.exists():
+        print(f"Error: Audio file not found: {audio_path}")
+        return 1
+    if not checkpoint_path.exists():
+        print(f"Error: Checkpoint not found: {checkpoint_path}")
+        return 1
+    if not config_path.exists():
+        print(f"Error: Config not found: {config_path}")
+        return 1
+
+    return generate_test_data(
+        str(model_repo_path),
+        str(audio_path),
+        str(checkpoint_path),
+        str(config_path),
+        str(output_path),
+        args.start,
+        args.end,
+    )
+
+
+# Script entry point: run main() and use its return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())

+ 51 - 0
src/audio.cpp

@@ -0,0 +1,51 @@
+#define DR_WAV_IMPLEMENTATION
+#include "dr_libs/dr_wav.h"
+#include "mel_band_roformer/audio.h"
+#include <iostream>
+
+// Decodes the WAV file at `path` into an AudioBuffer of 32-bit float samples
+// using dr_wav. Fills in channels, sampleRate, samples (frames * channels) and
+// data. Throws std::runtime_error when the file cannot be opened or decoded.
+// A sample rate other than 44100 Hz only produces a warning on stderr; no
+// resampling is performed.
+AudioBuffer AudioFile::Load(const std::string& path) {
+    AudioBuffer result;
+    drwav_uint64 frameCount;
+
+    float* decoded = drwav_open_file_and_read_pcm_frames_f32(
+        path.c_str(), &result.channels, &result.sampleRate, &frameCount, NULL);
+    if (decoded == NULL) {
+        throw std::runtime_error("Failed to open audio file: " + path);
+    }
+
+    // Copy the decoded samples into the buffer, then release dr_wav's allocation.
+    result.samples = frameCount * result.channels;
+    result.data.assign(decoded, decoded + result.samples);
+    drwav_free(decoded, NULL);
+
+    // The model expects 44.1 kHz input; anything else is flagged but still returned.
+    const bool rateMatchesModel = (result.sampleRate == 44100);
+    if (!rateMatchesModel) {
+        std::cerr << "Warning: Input sample rate is " << result.sampleRate
+                  << " Hz. Model expects 44100 Hz." << std::endl;
+    }
+
+    return result;
+}
+
+// Writes `buffer` to `path` as a RIFF WAV file with 32-bit IEEE float samples,
+// using the buffer's channel count and sample rate.
+//
+// Throws std::runtime_error when:
+//   - the buffer reports zero channels (the frame count would be a division by zero),
+//   - the output file cannot be opened for writing,
+//   - fewer frames than expected were written.
+void AudioFile::Save(const std::string& path, const AudioBuffer& buffer) {
+    // Guard the samples / channels division below.
+    if (buffer.channels == 0) {
+        throw std::runtime_error("Cannot write audio with zero channels: " + path);
+    }
+
+    drwav_data_format format;
+    format.container = drwav_container_riff;
+    format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+    format.channels = buffer.channels;
+    format.sampleRate = buffer.sampleRate;
+    format.bitsPerSample = 32;
+
+    drwav wav;
+    if (!drwav_init_file_write(&wav, path.c_str(), &format, NULL)) {
+        throw std::runtime_error("Failed to open file for writing: " + path);
+    }
+
+    // One PCM frame holds one sample per channel.
+    const drwav_uint64 totalFrames = buffer.samples / buffer.channels;
+    const drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, totalFrames, buffer.data.data());
+    // Always close the writer before reporting a short write.
+    drwav_uninit(&wav);
+
+    if (framesWritten != totalFrames) {
+         throw std::runtime_error("Failed to write all samples to " + path);
+    }
+}

+ 772 - 0
src/inference.cpp

@@ -0,0 +1,772 @@
+#include "mel_band_roformer/inference.h"
+#include "model.h"
+#include "utils.h"
+#include "stft.h"
+#include <iostream>
+#include <complex>
+#include <algorithm>
+#include <cstring>
+#include <ggml.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+#include <chrono>
+#include <future>
+
+using Complex = std::complex<float>;
+
+// Helper forward decl
+std::vector<float> GetWindow(int size, int fade_size);
+
+// Builds the overlap-add weighting window: `size` ones with a linear fade-in
+// over the first `fade_size` entries and a linear fade-out over the last
+// `fade_size` entries.
+//
+// Matches Python torch.linspace(0, 1, fade_size) / torch.linspace(1, 0, fade_size):
+// both endpoints are included, hence the division by (fade_size - 1). With
+// fade_size == 1 both factors are 1.0, so the window stays all ones.
+//
+// fade_size is clamped to [0, size] so that a fade longer than the window can
+// never index outside the vector; in that case the two ramps overlap and their
+// factors multiply.
+std::vector<float> GetWindow(int size, int fade_size) {
+    std::vector<float> window(size, 1.0f);
+
+    // Clamp so that both window[i] and window[size - fade + i] stay in range.
+    const int fade = std::max(0, std::min(fade_size, size));
+
+    for (int i = 0; i < fade; ++i) {
+        // ramp runs from 0.0 to 1.0 inclusive across the fade region
+        const float ramp = (fade > 1) ? (float)i / (fade - 1) : 0.0f;
+        const float fadein = (fade > 1) ? ramp : 1.0f;
+        const float fadeout = (fade > 1) ? 1.0f - ramp : 1.0f;
+        window[i] *= fadein;                // Start of window: fade in
+        window[size - fade + i] *= fadeout; // End of window: fade out
+    }
+    return window;
+}
+
+
+// Creates the underlying MelBandRoformer and initializes it from the model
+// file at `model_path`.
+Inference::Inference(const std::string& model_path) {
+    model_ = std::make_unique<MelBandRoformer>();
+    model_->Initialize(model_path);
+}
+
+// Returns the default chunk size reported by the loaded model.
+int Inference::GetDefaultChunkSize() const {
+    return model_->GetDefaultChunkSize();
+}
+
+// Returns the default overlap factor reported by the loaded model.
+int Inference::GetDefaultNumOverlap() const {
+    return model_->GetDefaultNumOverlap();
+}
+
+// Releases the cached compute graph: the graph allocator first, then the ggml
+// context that owns the graph and its tensors.
+Inference::~Inference() {
+    if (allocr_) ggml_gallocr_free(allocr_);
+    if (ctx_) ggml_free(ctx_);
+    // gf_ is part of ctx_, tensor pointers are part of ctx_
+}
+
+// Makes sure a compute graph for `n_frames` time frames is built and allocated.
+//
+// The graph (BandSplit -> Transformers -> MaskEstimator) is cached and reused
+// while n_frames stays the same; a different n_frames tears the old graph down
+// and builds a new one.
+//
+// Returns true when a usable graph is available. On any failure the partially
+// built state is released and false is returned, so a later call rebuilds from
+// scratch instead of reusing a graph whose buffers were never allocated.
+bool Inference::EnsureGraph(int n_frames) {
+    if (n_frames == cached_n_frames_ && ctx_ != nullptr) {
+        return true;
+    }
+
+    std::cout << "[Inference] Building graph for n_frames=" << n_frames << std::endl;
+
+    // Cleanup old graph
+    if (allocr_) { ggml_gallocr_free(allocr_); allocr_ = nullptr; }
+    if (ctx_) { ggml_free(ctx_); ctx_ = nullptr; }
+    // These all pointed into the context that was just freed.
+    gf_ = nullptr;
+    input_tensor_ = nullptr;
+    pos_time_ = nullptr;
+    pos_freq_ = nullptr;
+    mask_out_tensor_ = nullptr;
+
+    // Allocate context (metadata only: no_alloc = true, tensor data lives in
+    // the backend buffer allocated further down)
+    size_t mem_size = 1024ull * 1024 * 1024; // 1GB
+    struct ggml_init_params ctx_params = { mem_size, nullptr, true };
+    ctx_ = ggml_init(ctx_params);
+    if (!ctx_) return false;
+
+    gf_ = ggml_new_graph_custom(ctx_, 65536, false);
+
+    int batch = 1;
+    int total_dim_input = model_->GetTotalDimInput();
+
+    input_tensor_ = ggml_new_tensor_3d(ctx_, GGML_TYPE_F32, total_dim_input, n_frames, batch);
+    ggml_set_input(input_tensor_);
+
+    // BandSplit -> Transformers -> MaskEstimator
+    ggml_tensor* band_out = model_->BuildBandSplitGraph(ctx_, input_tensor_, gf_, n_frames, batch);
+
+    int n_bands = model_->GetNumBands();
+    pos_time_ = ggml_new_tensor_1d(ctx_, GGML_TYPE_I32, n_frames * n_bands);
+    pos_freq_ = ggml_new_tensor_1d(ctx_, GGML_TYPE_I32, n_bands * n_frames);
+    ggml_set_input(pos_time_);
+    ggml_set_input(pos_freq_);
+
+    ggml_tensor* trans_out = model_->BuildTransformersGraph(ctx_, band_out, gf_, pos_time_, pos_freq_, n_frames, batch);
+    mask_out_tensor_ = model_->BuildMaskEstimatorGraph(ctx_, trans_out, gf_, n_frames, batch);
+
+    // Allocate compute buffer (VRAM)
+    allocr_ = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model_->GetBackend()));
+    if (!allocr_ || !ggml_gallocr_alloc_graph(allocr_, gf_)) {
+        std::cerr << "[Inference] Failed to allocate graph VRAM" << std::endl;
+
+        // Drop the half-built graph so the cache check at the top cannot
+        // mistake it for a valid one on the next call.
+        if (allocr_) { ggml_gallocr_free(allocr_); allocr_ = nullptr; }
+        ggml_free(ctx_);
+        ctx_ = nullptr;
+        gf_ = nullptr;
+        input_tensor_ = nullptr;
+        pos_time_ = nullptr;
+        pos_freq_ = nullptr;
+        mask_out_tensor_ = nullptr;
+        return false;
+    }
+
+    // Only record the frame count once the graph is fully usable.
+    cached_n_frames_ = n_frames;
+    return true;
+}
+
+// Runs an STFT on each of the two channels of an interleaved stereo signal.
+//
+//   input_audio  - interleaved samples (L, R, L, R, ...)
+//   stft_outputs - resized to 2 entries; entry `ch` receives the raw output of
+//                  stft::compute_stft for that channel
+//   n_frames     - set by stft::compute_stft to the number of frames produced
+//
+// FFT size, hop length and window length come from the model; the analysis
+// window is produced by stft::hann_window.
+void Inference::ComputeSTFT(const std::vector<float>& input_audio,
+                            std::vector<std::vector<float>>& stft_outputs,
+                            int& n_frames) {
+    const int kChannels = 2;
+
+    const int fft_size = model_->GetNFFT();
+    const int hop = model_->GetHopLength();
+    const int win_len = model_->GetWinLength();
+    const int bins = fft_size / 2 + 1;
+
+    std::vector<float> analysis_window(win_len);
+    stft::hann_window(analysis_window.data(), win_len);
+
+    stft_outputs.resize(kChannels);
+    const int samples_per_channel = input_audio.size() / kChannels;
+
+    for (int ch = 0; ch < kChannels; ++ch) {
+        // De-interleave: pick every kChannels-th sample starting at offset ch.
+        std::vector<float> mono(samples_per_channel);
+        for (int n = 0; n < samples_per_channel; ++n) {
+            mono[n] = input_audio[n * kChannels + ch];
+        }
+
+        // Output buffer sized with a few spare frames; 2 floats (re, im) per bin.
+        std::vector<float>& spectrum = stft_outputs[ch];
+        spectrum.resize(bins * (samples_per_channel / hop + 5) * 2);
+
+        stft::compute_stft(mono.data(), samples_per_channel, fft_size, hop, win_len,
+                           analysis_window.data(), true, spectrum.data(), &n_frames);
+    }
+}
+
+// Gathers the STFT bins selected by the model's frequency indices into the
+// flat, frame-major buffer that is uploaded as the model input.
+//
+// Each frequency index encodes (bin, channel) as bin * 2 + channel. The source
+// spectra are read as [bin][frame][re, im]; the destination row for frame t
+// starts at t * total_dim_input and holds one (re, im) pair per index.
+void Inference::PrepareModelInput(const std::vector<std::vector<float>>& stft_outputs,
+                                  int n_frames,
+                                  std::vector<float>& model_input_rearranged) {
+    const int kChannels = 2;
+    const std::vector<int>& selected = model_->GetFreqIndices();
+    const int n_selected = selected.size();
+    const int row_width = model_->GetTotalDimInput();
+
+    model_input_rearranged.resize(n_frames * row_width);
+
+    #ifdef USE_OPENMP
+    #pragma omp parallel for
+    #endif
+    for (int t = 0; t < n_frames; ++t) {
+        const int row_start = t * row_width;
+        for (int f = 0; f < n_selected; ++f) {
+            // Split the packed index into its STFT bin and channel.
+            const int packed = selected[f];
+            const int bin = packed / kChannels;
+            const int ch = packed % kChannels;
+
+            const std::vector<float>& spectrum = stft_outputs[ch];
+            const int src = (bin * n_frames + t) * 2;
+            const int dst = row_start + f * 2;
+
+            model_input_rearranged[dst + 0] = spectrum[src + 0];
+            model_input_rearranged[dst + 1] = spectrum[src + 1];
+        }
+    }
+}
+
+// Turns the mask estimator output into time-domain audio, one entry per stem.
+//
+//   mask_output  - flat model output; for frame t and stem s the (re, im) mask
+//                  pairs start at t * (mask_features * num_stems) + s * mask_features
+//   stft_outputs - per-channel input spectra, read as [bin][frame][re, im]
+//   n_frames     - number of STFT frames
+//   output_audio - resized to num_stems; each entry receives interleaved stereo
+//
+// Per stem: gather the complex masks, scatter-add them onto the
+// (bin * 2 + channel) grid via the model's frequency indices, divide by the
+// number of bands covering each bin, multiply with the input spectrum, run the
+// inverse STFT per channel and interleave the two channels.
+void Inference::PostProcessAndISTFT(const std::vector<float>& mask_output,
+                                    const std::vector<std::vector<float>>& stft_outputs,
+                                    int n_frames,
+                                    std::vector<std::vector<float>>& output_audio) {
+    int n_fft = model_->GetNFFT();
+    int hop_length = model_->GetHopLength();
+    int win_length = model_->GetWinLength();
+    int n_freq = n_fft / 2 + 1;
+    int channels = 2;
+
+    const std::vector<int>& freq_indices = model_->GetFreqIndices();
+    int num_freq_indices = freq_indices.size();
+    // Two floats (real, imaginary) per selected frequency index.
+    int mask_features = num_freq_indices * 2;
+    int num_stems = model_->GetNumStems();
+    // Tensor layout: [mask_features, num_stems, n_frames, batch]
+    // GGML stride for time t is: mask_features * num_stems
+    int stride_time = mask_features * num_stems;
+    
+    output_audio.resize(num_stems);
+
+    std::vector<float> window(win_length);
+    stft::hann_window(window.data(), win_length);
+    
+    // Process each stem
+    for (int stem = 0; stem < num_stems; ++stem) {
+        // Step 1: unpack this stem's masks into complex values, stored [index][frame].
+        std::vector<Complex> masks(num_freq_indices * n_frames);
+        
+        #ifdef USE_OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int t = 0; t < n_frames; ++t) {
+            // Base index for this frame and current stem
+            int frame_offset = t * stride_time;
+            int stem_offset = stem * mask_features;
+            int base_offset = frame_offset + stem_offset;
+            
+            for (int f = 0; f < num_freq_indices; ++f) {
+                int idx = base_offset + f * 2;
+                masks[f * n_frames + t] = Complex(mask_output[idx + 0], mask_output[idx + 1]);
+            }
+        }
+
+        // Step 2: scatter-add onto the (bin * channels + channel) grid. Several
+        // indices may target the same row, so this loop is left serial.
+        int total_freq_stereo = n_freq * channels;
+        std::vector<Complex> masks_summed(total_freq_stereo * n_frames, {0.0f, 0.0f});
+
+        for (int f = 0; f < num_freq_indices; ++f) {
+            int dst_idx_base = freq_indices[f]; 
+            for (int t = 0; t < n_frames; ++t) {
+                masks_summed[dst_idx_base * n_frames + t] += masks[f * n_frames + t];
+            }
+        }
+
+        // Step 3: average by the number of bands that contributed to each bin
+        // (floored at 1e-8 to avoid dividing by zero).
+        const std::vector<int>& num_bands_per_freq = model_->GetNumBandsPerFreq();
+        #ifdef USE_OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int f = 0; f < n_freq; ++f) {
+            float denom = (float)num_bands_per_freq[f];
+            if (denom < 1e-8f) denom = 1e-8f;
+            for (int ch = 0; ch < channels; ++ch) {
+                int freq_stereo_idx = f * channels + ch;
+                for (int t = 0; t < n_frames; ++t) {
+                    masks_summed[freq_stereo_idx * n_frames + t] /= denom;
+                }
+            }
+        }
+
+        // Step 4: apply the averaged complex mask to the input spectrum.
+        std::vector<Complex> stft_output_masked(total_freq_stereo * n_frames);
+        #ifdef USE_OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int ch = 0; ch < channels; ++ch) {
+            for (int f = 0; f < n_freq; ++f) {
+                int freq_stereo_idx = f * channels + ch;
+                for (int t = 0; t < n_frames; ++t) {
+                    int mask_idx = freq_stereo_idx * n_frames + t;
+                    int stft_idx = (f * n_frames + t) * 2;
+                    Complex stft_val(stft_outputs[ch][stft_idx + 0], stft_outputs[ch][stft_idx + 1]);
+                    stft_output_masked[mask_idx] = stft_val * masks_summed[mask_idx];
+                }
+            }
+        }
+
+        // Step 5: inverse STFT, one channel at a time.
+        std::vector<std::vector<float>> output_channels(channels);
+        int n_samples_out = 0;
+
+        for (int ch = 0; ch < channels; ++ch) {
+            // Repack this channel's masked spectrum as [bin][frame][re, im] floats.
+            std::vector<float> istft_in(n_freq * n_frames * 2);
+            for (int f = 0; f < n_freq; ++f) {
+                int freq_stereo_idx = f * channels + ch;
+                for (int t = 0; t < n_frames; ++t) {
+                    int mask_idx = freq_stereo_idx * n_frames + t;
+                    int dst_idx = (f * n_frames + t) * 2;
+                    istft_in[dst_idx + 0] = stft_output_masked[mask_idx].real();
+                    istft_in[dst_idx + 1] = stft_output_masked[mask_idx].imag();
+                }
+            }
+            
+            // Zero DC if enabled
+            if (model_->GetZeroDC()) {
+                for (int t = 0; t < n_frames; ++t) {
+                    // f=0 is DC component
+                    int dst_idx = (0 * n_frames + t) * 2; 
+                    istft_in[dst_idx + 0] = 0.0f;
+                    istft_in[dst_idx + 1] = 0.0f;
+                }
+            }
+            
+            // The buffer gets n_fft samples of headroom beyond the requested
+            // length and is cut back to n_samples_out afterwards.
+            int approx_len = (n_frames - 1) * hop_length + n_fft;
+            output_channels[ch].resize(approx_len + n_fft); 
+            stft::compute_istft(istft_in.data(), n_freq, n_frames, n_fft, hop_length, win_length, 
+                                window.data(), true, approx_len, output_channels[ch].data());
+            if (ch == 0) n_samples_out = approx_len;
+            output_channels[ch].resize(n_samples_out);
+        }
+
+        // Step 6: interleave the channels into this stem's output.
+        output_audio[stem].resize(channels * n_samples_out);
+        for (int i = 0; i < n_samples_out; ++i) {
+            for (int ch = 0; ch < channels; ++ch) {
+                output_audio[stem][ch + i * channels] = output_channels[ch][i];
+            }
+        }
+    }
+}
+
+#include <future>
+
+// Public entry point: separates interleaved stereo `input_audio` into stems by
+// delegating to the pipelined overlap-add implementation. Empty input yields an
+// empty result without touching the model.
+std::vector<std::vector<float>> Inference::Process(const std::vector<float>& input_audio, int chunk_size, int num_overlap, std::function<void(float)> progress_callback) {
+    if (!input_audio.empty()) {
+        return ProcessOverlapAddPipelined(input_audio, chunk_size, num_overlap, progress_callback);
+    }
+    return {};
+}
+
+// =================================================================================================
+// Pipeline Stages
+// =================================================================================================
+
+// Pipeline stage 1: builds the ChunkState for one chunk.
+// Stores a copy of the chunk audio and its id, then (for non-empty audio)
+// computes the STFT and the flattened model input. An empty chunk returns a
+// state with only the id and the (empty) audio set.
+std::shared_ptr<Inference::ChunkState> Inference::PreProcessChunk(const std::vector<float>& chunk_audio, int id) {
+    std::shared_ptr<ChunkState> chunk = std::make_shared<ChunkState>();
+    chunk->id = id;
+    chunk->input_audio = chunk_audio; // the state keeps its own copy
+
+    if (!chunk_audio.empty()) {
+        // STFT of both channels, then gather the bins the model consumes.
+        ComputeSTFT(chunk->input_audio, chunk->stft_outputs, chunk->n_frames);
+        PrepareModelInput(chunk->stft_outputs, chunk->n_frames, chunk->stft_flattened);
+    }
+
+    return chunk;
+}
+
+// Pipeline stage 2: runs the model graph on one preprocessed chunk.
+//
+// Uploads the flattened STFT input and the position indices, computes the
+// graph on the model's backend and downloads the mask estimator output into
+// state->mask_output.
+//
+// Does nothing for a null state or a state without model input. If the graph
+// cannot be built or the backend reports a compute failure, an error is logged
+// and state->mask_output is left empty, so later stages skip the chunk instead
+// of consuming stale or garbage data.
+void Inference::RunInference(std::shared_ptr<ChunkState> state) {
+    if (!state || state->stft_flattened.empty()) return;
+
+    // 3. Ensure Graph
+    if (!EnsureGraph(state->n_frames)) {
+        std::cerr << "[Inference] Failed to build graph for chunk " << state->id << std::endl;
+        return;
+    }
+
+    const int n_bands = model_->GetNumBands();
+    const int n_frames = state->n_frames;
+    const int n_positions = n_frames * n_bands;
+
+    // Prepare position data: index modulo n_frames for the time transformer,
+    // index modulo n_bands for the frequency transformer.
+    // TODO: Cache these to avoid allocation every frame if size is constant
+    std::vector<int32_t> pos_time_data(n_positions);
+    std::vector<int32_t> pos_freq_data(n_positions);
+    for (int i = 0; i < n_positions; ++i) {
+        pos_time_data[i] = i % n_frames;
+        pos_freq_data[i] = i % n_bands;
+    }
+
+    // 4. Host -> Device
+    ggml_backend_tensor_set(input_tensor_, state->stft_flattened.data(), 0, ggml_nbytes(input_tensor_));
+    ggml_backend_tensor_set(pos_time_, pos_time_data.data(), 0, ggml_nbytes(pos_time_));
+    ggml_backend_tensor_set(pos_freq_, pos_freq_data.data(), 0, ggml_nbytes(pos_freq_));
+
+    // 5. Compute
+    const ggml_status status = ggml_backend_graph_compute(model_->GetBackend(), gf_);
+    if (status != GGML_STATUS_SUCCESS) {
+        std::cerr << "[Inference] Graph compute failed for chunk " << state->id
+                  << " (status " << static_cast<int>(status) << ")" << std::endl;
+        // Leave no output behind so PostProcessChunk skips this chunk.
+        state->mask_output.clear();
+        return;
+    }
+
+    // 6. Device -> Host
+    state->mask_output.resize(ggml_nelements(mask_out_tensor_));
+    ggml_backend_tensor_get(mask_out_tensor_, state->mask_output.data(), 0, ggml_nbytes(mask_out_tensor_));
+}
+
+// Pipeline stage 3: converts the chunk's mask output into per-stem audio and
+// forces every stem to exactly the length of the chunk's input audio.
+// Does nothing for a null state or a state without mask output.
+void Inference::PostProcessChunk(std::shared_ptr<ChunkState> state) {
+    const bool has_masks = state && !state->mask_output.empty();
+    if (!has_masks) return;
+
+    // Masking and inverse STFT fill state->final_audio.
+    PostProcessAndISTFT(state->mask_output, state->stft_outputs, state->n_frames, state->final_audio);
+
+    // A single resize covers both cases: longer stems are truncated, shorter
+    // ones are padded with zeros.
+    const size_t target_len = state->input_audio.size();
+    for (std::vector<float>& stem_audio : state->final_audio) {
+        stem_audio.resize(target_len, 0.0f);
+    }
+}
+
+// Processes a single chunk by running the three pipeline stages back to back
+// on the calling thread and returns the per-stem audio of that chunk.
+std::vector<std::vector<float>> Inference::ProcessChunk(const std::vector<float>& chunk_audio) {
+    // Serial fallback
+    auto state = PreProcessChunk(chunk_audio, 0);
+    RunInference(state);
+    PostProcessChunk(state);
+    // Empty when the chunk was empty or inference produced no mask output.
+    return state->final_audio;
+}
+
+// =================================================================================================
+// Pipelined Overlap-Add Logic
+// =================================================================================================
+
+// Separates a full track with chunked overlap-add, overlapping the three
+// per-chunk stages: while the model runs on chunk k, chunk k+1 is preprocessed
+// and chunk k-1 is postprocessed on worker threads.
+//
+//   input_audio       - interleaved stereo samples
+//   chunk_size        - chunk length in samples per channel (must be > 0)
+//   num_overlap       - overlap factor; the hop between chunks is
+//                       chunk_size / num_overlap (must be in [1, chunk_size])
+//   progress_callback - optional; called after every chunk with a value in (0, 1]
+//
+// Returns one interleaved stereo buffer per stem with the same length as the
+// input, or an empty vector when the input is empty or no chunk produced output.
+//
+// Throws std::runtime_error when the sample count is odd or when chunk_size /
+// num_overlap would give a zero or undefined hop (previously a division by
+// zero or an endless loop).
+std::vector<std::vector<float>> Inference::ProcessOverlapAddPipelined(const std::vector<float>& input_audio, 
+                                                         int chunk_size, 
+                                                         int num_overlap,
+                                                         std::function<void(float)> progress_callback) {
+    if (input_audio.empty()) return {};
+    if (input_audio.size() % 2 != 0) {
+        throw std::runtime_error("Error: Input audio must be interleaved stereo (even number of samples).");
+    }
+    if (chunk_size <= 0 || num_overlap <= 0) {
+        throw std::runtime_error("Error: chunk_size and num_overlap must be positive.");
+    }
+    if (num_overlap > chunk_size) {
+        // step would be 0 and the chunk loop below would never advance
+        throw std::runtime_error("Error: num_overlap must not exceed chunk_size.");
+    }
+
+    // Parameters matches Python demix_track
+    int channels = 2;
+    int C = chunk_size;
+
+    int step = chunk_size / num_overlap;
+    int fade_size = chunk_size / 10;
+    int border = chunk_size - step;
+
+    int n_input_samples = input_audio.size() / channels;
+
+    // 1. Pad Input: reflect `border` samples on both sides, but only when the
+    // track is long enough to be reflected.
+    bool do_pad = (n_input_samples > 2 * border) && (border > 0);
+    int pad_l = do_pad ? border : 0;
+    int pad_r = do_pad ? border : 0;
+    int n_padded_samples = n_input_samples + pad_l + pad_r;
+
+    std::vector<float> padded_input;
+
+    if (do_pad) {
+        padded_input.resize(n_padded_samples * channels);
+        // Copy center
+        for (int i = 0; i < n_input_samples; ++i) {
+            padded_input[(pad_l + i) * channels + 0] = input_audio[i * channels + 0];
+            padded_input[(pad_l + i) * channels + 1] = input_audio[i * channels + 1];
+        }
+        // Reflect Left
+        for (int i = 0; i < pad_l; ++i) {
+            int src_idx = 1 + i;
+            if (src_idx >= n_input_samples) src_idx = n_input_samples - 1;
+            int dst_idx = pad_l - 1 - i;
+            padded_input[dst_idx * channels + 0] = input_audio[src_idx * channels + 0];
+            padded_input[dst_idx * channels + 1] = input_audio[src_idx * channels + 1];
+        }
+        // Reflect Right
+        for (int i = 0; i < pad_r; ++i) {
+            int src_idx = n_input_samples - 2 - i;
+            if (src_idx < 0) src_idx = 0;
+            int dst_idx = pad_l + n_input_samples + i;
+            padded_input[dst_idx * channels + 0] = input_audio[src_idx * channels + 0];
+            padded_input[dst_idx * channels + 1] = input_audio[src_idx * channels + 1];
+        }
+    } else {
+        padded_input = input_audio;
+    }
+
+    std::vector<std::vector<float>> result; // [stems][samples]
+    // Sum of window weights per output sample, used for normalization at the end.
+    std::vector<float> counter(n_padded_samples * channels, 0.0f);
+    std::vector<float> window_base = GetWindow(chunk_size, fade_size);
+
+    // lambda to extract the chunk starting at sample offset 'i'
+    auto extract_chunk = [&](int i) -> std::vector<float> {
+        if (i >= n_padded_samples) return {};
+
+        int remaining = n_padded_samples - i;
+        int part_len = std::min(C, remaining);
+
+        // Always C frames long; anything not overwritten below stays zero.
+        std::vector<float> chunk_in(C * channels, 0.0f);
+
+        // Copy part
+        for (int k = 0; k < part_len; ++k) {
+            chunk_in[k * channels + 0] = padded_input[(i + k) * channels + 0];
+            chunk_in[k * channels + 1] = padded_input[(i + k) * channels + 1];
+        }
+
+        // Pad short chunk if needed: reflect when more than half the chunk is
+        // real audio, otherwise keep the zero padding.
+        if (part_len < C) {
+             int pad_amount = C - part_len;
+             if (part_len > C / 2 + 1) {
+                 // Reflect pad right
+                 for (int k = 0; k < pad_amount; ++k) {
+                     int src_idx = part_len - 2 - k;
+                     if (src_idx < 0) src_idx = 0;
+                     chunk_in[(part_len + k) * channels + 0] = chunk_in[src_idx * channels + 0];
+                     chunk_in[(part_len + k) * channels + 1] = chunk_in[src_idx * channels + 1];
+                 }
+             }
+        }
+        return chunk_in;
+    };
+
+    // lambda to accumulate result 'state' at offset 'i'
+    auto accumulate_result = [&](std::shared_ptr<ChunkState> state, int i) {
+        if (!state) return;
+        const std::vector<std::vector<float>>& chunk_out_stems = state->final_audio; // Now [stems][samples]
+        if (chunk_out_stems.empty()) return;
+
+        // Lazy Initialize result
+        if (result.empty()) {
+            int num_stems = chunk_out_stems.size();
+            result.resize(num_stems, std::vector<float>(n_padded_samples * channels, 0.0f));
+        }
+
+        int remaining = n_padded_samples - i;
+        int part_len = std::min(C, remaining);
+
+        // The first chunk has nothing before it to cross-fade with and the last
+        // chunk nothing after it, so the corresponding fade is replaced by ones.
+        std::vector<float> window = window_base; // Copy
+        if (i == 0) {
+            for (int k = 0; k < fade_size; ++k) window[k] = 1.0f;
+        } else if (i + step >= n_padded_samples) {
+            for (int k = 0; k < fade_size; ++k) window[C - 1 - k] = 1.0f;
+        }
+
+        int num_stems = result.size();
+        for (int k = 0; k < part_len; ++k) {
+            float w = window[k];
+            int res_idx = (i + k) * channels;
+            int chk_idx = k * channels;
+
+            for (int s = 0; s < num_stems; ++s) {
+                 if (static_cast<size_t>(s) >= chunk_out_stems.size()) continue;
+                 const auto& stem_chunk = chunk_out_stems[s];
+                 result[s][res_idx + 0] += stem_chunk[chk_idx + 0] * w;
+                 result[s][res_idx + 1] += stem_chunk[chk_idx + 1] * w;
+            }
+
+            // Counter is same for all stems, just update once
+            counter[res_idx + 0] += w;
+            counter[res_idx + 1] += w;
+        }
+    };
+
+    // ==========================================================
+    // Pipeline Loop
+    // ==========================================================
+
+    // Future for the NEXT chunk's preprocessing
+    std::future<std::shared_ptr<ChunkState>> next_prep_future;
+
+    // Future for the PREVIOUS chunk's postprocessing
+    std::future<void> prev_post_future;
+
+    std::shared_ptr<ChunkState> prev_state = nullptr;
+
+    int current_offset = 0;
+
+    // Bootstrap: Start PreProcessing first chunk
+    {
+        std::vector<float> chunk0 = extract_chunk(0);
+        // Async launch
+        next_prep_future = std::async(std::launch::async,
+            [this](std::vector<float> c, int id) { return this->PreProcessChunk(c, id); },
+            std::move(chunk0), 0);
+    }
+
+    while (current_offset < n_padded_samples) {
+        // 1. Wait for PRE-processing of CURRENT chunk
+        // This blocks until STFT is done.
+        // In steady state, this should be ready or nearly ready while GPU was busy.
+        auto current_state = next_prep_future.get();
+
+        // 2. Start PRE-processing of NEXT chunk (if exists)
+        int next_offset = current_offset + step;
+        if (next_offset < n_padded_samples) {
+             std::vector<float> chunk_next = extract_chunk(next_offset);
+             next_prep_future = std::async(std::launch::async,
+                [this](std::vector<float> c, int id) { return this->PreProcessChunk(c, id); },
+                std::move(chunk_next), next_offset);
+        }
+
+        // 3. Run Inference on CURRENT chunk (GPU Sync)
+        // This blocks heavily.
+        RunInference(current_state);
+
+        // 4. Wait for POST-processing of PREVIOUS chunk
+        if (prev_post_future.valid()) {
+            prev_post_future.get();
+        }
+
+        // 5. Accumulate PREVIOUS chunk result (Serial, fast)
+        // Note: PostProcessChunk fills 'final_audio', but doesn't accumulate to 'result'.
+        // We do accumulation here on main thread to avoid races on 'result' buffer.
+        if (prev_state) {
+            int prev_offset = current_offset - step;
+            accumulate_result(prev_state, prev_offset);
+            prev_state = nullptr; // Free memory
+        }
+
+        // 6. Start POST-processing of CURRENT chunk
+        prev_state = current_state;
+        // Use shared_ptr copy
+        prev_post_future = std::async(std::launch::async,
+            [this](std::shared_ptr<ChunkState> s) { this->PostProcessChunk(s); },
+            prev_state);
+
+        // Advance
+        current_offset += step;
+
+        if (progress_callback) {
+            float progress = (float)std::min(current_offset, n_padded_samples) / n_padded_samples;
+            progress_callback(progress);
+        }
+    }
+
+    // Drain Pipeline
+    // Wait for last post-process
+    if (prev_post_future.valid()) {
+        prev_post_future.get();
+    }
+    if (prev_state) {
+        int prev_offset = current_offset - step;
+        accumulate_result(prev_state, prev_offset);
+    }
+
+    // Normalize and Crop
+    // result is [stems][samples]
+    if (result.empty()) return {};
+
+    int num_stems = result.size();
+    std::vector<std::vector<float>> final_output_stems(num_stems);
+
+    for (int s = 0; s < num_stems; ++s) {
+        final_output_stems[s].resize(n_input_samples * channels);
+        for (int k = 0; k < n_input_samples; ++k) {
+            // Skip the left padding so the output lines up with the input.
+            int padded_idx = (pad_l + k) * channels;
+            int final_idx = k * channels;
+
+            float w0 = counter[padded_idx + 0];
+            float w1 = counter[padded_idx + 1];
+
+            // Samples that received (almost) no weight are passed through
+            // undivided instead of being blown up by a tiny divisor.
+            if (w0 < 1e-4f) w0 = 1.0f;
+            if (w1 < 1e-4f) w1 = 1.0f;
+
+            final_output_stems[s][final_idx + 0] = result[s][padded_idx + 0] / w0;
+            final_output_stems[s][final_idx + 1] = result[s][padded_idx + 1] / w1;
+        }
+    }
+
+    return final_output_stems;
+}
+
+// Runs `model_func` over `input_audio` in overlapping chunks and blends the
+// per-chunk outputs with a fade window (overlap-add), following the Python
+// demix_track flow.
+//
+//   input_audio        interleaved stereo samples (L, R, L, R, ...).
+//   chunk_size         frames handed to the model per call; must be > 0.
+//   num_overlap        overlap factor; the hop between chunk starts is
+//                      chunk_size / num_overlap frames and must be >= 1.
+//   model_func         called once per chunk with chunk_size interleaved
+//                      frames; returns one interleaved buffer per stem.
+//   progress_callback  optional; called after every chunk with a value in (0, 1].
+//
+// Returns one interleaved stereo buffer per stem, each holding as many samples
+// as `input_audio`. Returns an empty vector for empty input or when no stem
+// buffer was ever allocated (the model returned nothing).
+// Throws std::runtime_error for odd-length input, for chunking parameters that
+// would divide by zero or never advance, and for a model output that is
+// shorter than the chunk portion being accumulated.
+std::vector<std::vector<float>> Inference::ProcessOverlapAdd(const std::vector<float>& input_audio,
+                                                int chunk_size,
+                                                int num_overlap,
+                                                ModelCallback model_func,
+                                                std::function<void(float)> progress_callback) {
+    if (input_audio.empty()) return {};
+    if (input_audio.size() % 2 != 0) {
+        throw std::runtime_error("Error: Input audio must be interleaved stereo (even number of samples).");
+    }
+    // Guard against division by zero below.
+    if (chunk_size <= 0 || num_overlap <= 0) {
+        throw std::runtime_error("Error: chunk_size and num_overlap must be positive.");
+    }
+
+    // Parameters match Python demix_track
+    const int channels = 2;
+    const int C = chunk_size;
+
+    const int step = chunk_size / num_overlap;
+    // A zero hop would keep the chunk loop at the same offset forever.
+    if (step <= 0) {
+        throw std::runtime_error("Error: num_overlap must not exceed chunk_size.");
+    }
+    const int fade_size = chunk_size / 10;
+    const int border = chunk_size - step;
+
+    const int n_input_samples = static_cast<int>(input_audio.size() / channels);
+
+    // 1. Pad Input (reflect padding of `border` frames on both sides, only
+    //    when the signal is long enough to be reflected).
+    const bool do_pad = (n_input_samples > 2 * border) && (border > 0);
+    const int pad_l = do_pad ? border : 0;
+    const int pad_r = do_pad ? border : 0;
+    const int n_padded_samples = n_input_samples + pad_l + pad_r;
+
+    std::vector<float> padded_input;
+
+    if (do_pad) {
+        padded_input.resize(n_padded_samples * channels);
+
+        // Copy center
+        for (int i = 0; i < n_input_samples; ++i) {
+            padded_input[(pad_l + i) * channels + 0] = input_audio[i * channels + 0];
+            padded_input[(pad_l + i) * channels + 1] = input_audio[i * channels + 1];
+        }
+        // Reflect Left (edge sample itself is not repeated)
+        for (int i = 0; i < pad_l; ++i) {
+            int src_idx = 1 + i;
+            if (src_idx >= n_input_samples) src_idx = n_input_samples - 1;
+            const int dst_idx = pad_l - 1 - i;
+            padded_input[dst_idx * channels + 0] = input_audio[src_idx * channels + 0];
+            padded_input[dst_idx * channels + 1] = input_audio[src_idx * channels + 1];
+        }
+        // Reflect Right
+        for (int i = 0; i < pad_r; ++i) {
+            int src_idx = n_input_samples - 2 - i;
+            if (src_idx < 0) src_idx = 0;
+            const int dst_idx = pad_l + n_input_samples + i;
+            padded_input[dst_idx * channels + 0] = input_audio[src_idx * channels + 0];
+            padded_input[dst_idx * channels + 1] = input_audio[src_idx * channels + 1];
+        }
+    } else {
+        padded_input = input_audio;
+    }
+
+    std::vector<std::vector<float>> result; // [stems][samples], allocated on first model output
+    std::vector<float> counter(n_padded_samples * channels, 0.0f); // summed window weight per sample
+    const std::vector<float> window_base = GetWindow(chunk_size, fade_size);
+
+    int i = 0;
+    const int total_length = n_padded_samples;
+
+    while (i < total_length) {
+        const int remaining = total_length - i;
+        const int part_len = std::min(C, remaining); // Logic matches Python slice [i:i+C]
+
+        std::vector<float> chunk_in(C * channels, 0.0f);
+
+        // Copy part
+        for (int k = 0; k < part_len; ++k) {
+            chunk_in[k * channels + 0] = padded_input[(i + k) * channels + 0];
+            chunk_in[k * channels + 1] = padded_input[(i + k) * channels + 1];
+        }
+
+        // Pad short chunk if needed: reflect when more than half a chunk is
+        // available, otherwise the zero fill from construction is kept.
+        if (part_len < C && part_len > C / 2 + 1) {
+            const int pad_amount = C - part_len;
+            for (int k = 0; k < pad_amount; ++k) {
+                int src_idx = part_len - 2 - k;
+                if (src_idx < 0) src_idx = 0;
+                chunk_in[(part_len + k) * channels + 0] = chunk_in[src_idx * channels + 0];
+                chunk_in[(part_len + k) * channels + 1] = chunk_in[src_idx * channels + 1];
+            }
+        }
+
+        std::vector<std::vector<float>> chunk_out_stems = model_func(chunk_in);
+        // An empty model result contributes no stem data for this chunk; its
+        // window weight is still added to `counter` below (unchanged behavior).
+
+        // Lazy Initialize result
+        if (result.empty()) {
+            result.resize(chunk_out_stems.size(),
+                          std::vector<float>(n_padded_samples * channels, 0.0f));
+        }
+
+        // Window Adjustment: no fade-in on the first chunk, no fade-out on the last.
+        std::vector<float> window = window_base; // Copy
+        if (i == 0) {
+            for (int k = 0; k < fade_size; ++k) window[k] = 1.0f;
+        } else if (i + step >= total_length) {
+            for (int k = 0; k < fade_size; ++k) window[C - 1 - k] = 1.0f;
+        }
+
+        // Only stems present in both `result` and this chunk's output are accumulated.
+        const size_t n_active = std::min(result.size(), chunk_out_stems.size());
+        const size_t needed = static_cast<size_t>(part_len) * channels;
+        for (size_t s = 0; s < n_active; ++s) {
+            // Reading past the end of a short model output would be undefined behavior.
+            if (chunk_out_stems[s].size() < needed) {
+                throw std::runtime_error("Error: Model output is shorter than the processed chunk.");
+            }
+        }
+
+        // Accumulate
+        for (int k = 0; k < part_len; ++k) {
+            const float w = window[k];
+            const int res_idx = (i + k) * channels;
+            const int chk_idx = k * channels;
+
+            for (size_t s = 0; s < n_active; ++s) {
+                const auto& stem_chunk = chunk_out_stems[s];
+                result[s][res_idx + 0] += stem_chunk[chk_idx + 0] * w;
+                result[s][res_idx + 1] += stem_chunk[chk_idx + 1] * w;
+            }
+
+            counter[res_idx + 0] += w;
+            counter[res_idx + 1] += w;
+        }
+
+        i += step;
+        if (progress_callback) {
+            const float progress = (float)std::min(i, total_length) / total_length;
+            progress_callback(progress);
+        }
+    }
+
+    // Normalize and Crop
+    if (result.empty()) return {};
+
+    const int num_stems = static_cast<int>(result.size());
+    std::vector<std::vector<float>> final_output_stems(num_stems);
+
+    for (int s = 0; s < num_stems; ++s) {
+        final_output_stems[s].resize(n_input_samples * channels);
+        for (int k = 0; k < n_input_samples; ++k) {
+            const int padded_idx = (pad_l + k) * channels;
+            const int final_idx = k * channels;
+
+            float w0 = counter[padded_idx + 0];
+            float w1 = counter[padded_idx + 1];
+
+            // Avoid dividing by a (near-)zero accumulated weight.
+            if (w0 < 1e-4f) w0 = 1.0f;
+            if (w1 < 1e-4f) w1 = 1.0f;
+
+            final_output_stems[s][final_idx + 0] = result[s][padded_idx + 0] / w0;
+            final_output_stems[s][final_idx + 1] = result[s][padded_idx + 1] / w1;
+        }
+    }
+
+    return final_output_stems;
+}

+ 731 - 0
src/model.cpp

@@ -0,0 +1,731 @@
+#include "model.h"
+#include <ggml.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+#include <gguf.h>
+#include <iostream>
+#include <stdexcept>
+#include <cstring>
+#include <cmath>
+
+// Performs no work of its own; the backend and weights are set up later by Initialize().
+MelBandRoformer::MelBandRoformer() = default;
+
+// Releases whichever ggml resources were created: the weight buffer first,
+// then the backend, and finally the weight tensor context.
+MelBandRoformer::~MelBandRoformer() {
+    if (buffer_weights_ != nullptr) {
+        ggml_backend_buffer_free(buffer_weights_);
+    }
+    if (backend_ != nullptr) {
+        ggml_backend_free(backend_);
+    }
+    if (ctx_weights_ != nullptr) {
+        ggml_free(ctx_weights_);
+    }
+}
+
+// Picks the backend returned by ggml_backend_init_best(), reports its name on
+// stdout, and then loads the model weights from `model_path`.
+// Throws std::runtime_error when no backend could be initialized.
+void MelBandRoformer::Initialize(const std::string& model_path) {
+    backend_ = ggml_backend_init_best();
+    if (backend_ == nullptr) {
+        throw std::runtime_error("Failed to initialize backend");
+    }
+
+    const char* backend_name = ggml_backend_name(backend_);
+    std::cout << "Using backend: " << backend_name << std::endl;
+
+    LoadWeights(model_path);
+}
+
+// Loads a GGUF model file from `path`:
+//   1. reads hyperparameters from "mel_band_roformer.*" metadata keys (a key
+//      that is absent leaves the corresponding member unchanged),
+//   2. allocates one backend buffer for every tensor in the weight context,
+//   3. streams each tensor's bytes from the file into the backend, caching
+//      the three index tables needed on the host side.
+// Throws std::runtime_error when the file cannot be parsed or opened, when the
+// weight buffer cannot be allocated, or when tensor data cannot be read in full.
+void MelBandRoformer::LoadWeights(const std::string& path) {
+    std::cout << "Loading model from " << path << std::endl;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_weights_,
+    };
+
+    struct gguf_context* ctx_gguf = gguf_init_from_file(path.c_str(), params);
+    if (!ctx_gguf) {
+        throw std::runtime_error("Failed to load GGUF file: " + path);
+    }
+
+    // Frees the GGUF metadata context on every exit path, including exceptions.
+    struct GgufGuard {
+        gguf_context* ctx;
+        ~GgufGuard() { if (ctx) gguf_free(ctx); }
+    } gguf_guard{ctx_gguf};
+
+    // 1. Read Hyperparameters
+    int kv_idx;
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_n_fft");
+    if (kv_idx >= 0) n_fft_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_hop_length");
+    if (kv_idx >= 0) hop_length_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_win_length");
+    if (kv_idx >= 0) win_length_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.dim");
+    if (kv_idx >= 0) dim_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.num_bands");
+    if (kv_idx >= 0) num_bands_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.depth");
+    if (kv_idx >= 0) depth_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    // New Parameters
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.num_stems");
+    if (kv_idx >= 0) num_stems_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.skip_connection");
+    if (kv_idx >= 0) skip_connection_ = gguf_get_val_bool(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.stft_normalized");
+    if (kv_idx >= 0) stft_normalized_ = gguf_get_val_bool(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.zero_dc");
+    if (kv_idx >= 0) zero_dc_ = gguf_get_val_bool(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.mask_estimator_depth");
+    if (kv_idx >= 0) mask_estimator_depth_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    // Inference defaults (optional, fallback to hardcoded values)
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.default_chunk_size");
+    if (kv_idx >= 0) default_chunk_size_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.default_num_overlap");
+    if (kv_idx >= 0) default_num_overlap_ = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+
+    kv_idx = gguf_find_key(ctx_gguf, "mel_band_roformer.linear_transformer_depth");
+    if (kv_idx >= 0) {
+        int lin_depth = (int)gguf_get_val_u32(ctx_gguf, kv_idx);
+        if (lin_depth > 0) {
+            std::cerr << "\n[WARNING] Model uses Linear Attention (depth=" << lin_depth
+                      << "). This is NOT supported yet. Results will be incorrect.\n" << std::endl;
+        }
+    }
+
+    std::cout << "Model Config: n_fft=" << n_fft_ << ", hop_length=" << hop_length_
+              << ", num_bands=" << num_bands_ << ", dim=" << dim_ << ", depth=" << depth_
+              << ", num_stems=" << num_stems_ << ", skip_conn=" << skip_connection_ << std::endl;
+    std::cout << "Inference Defaults: chunk_size=" << default_chunk_size_
+              << ", num_overlap=" << default_num_overlap_ << std::endl;
+
+
+    // 2. Allocate backend buffer for ALL tensors
+    buffer_weights_ = ggml_backend_alloc_ctx_tensors_from_buft(
+        ctx_weights_,
+        ggml_backend_get_default_buffer_type(backend_)
+    );
+    if (!buffer_weights_) {
+        throw std::runtime_error("Failed to allocate weight buffer");
+    }
+
+    // 3. Read data from file and upload to backend
+    {
+        FILE* file = fopen(path.c_str(), "rb");
+        if (!file) throw std::runtime_error("Cannot open file");
+
+        // Closes the file on every exit path from this scope, including exceptions.
+        struct FileGuard {
+            FILE* f;
+            ~FileGuard() { if (f) fclose(f); }
+        } file_guard{file};
+
+        const size_t data_offset = gguf_get_data_offset(ctx_gguf);
+
+        std::vector<uint8_t> read_buf; // grows to the largest tensor and is reused
+
+        for (struct ggml_tensor* t = ggml_get_first_tensor(ctx_weights_);
+             t != nullptr;
+             t = ggml_get_next_tensor(ctx_weights_, t)) {
+            const int tid = gguf_find_tensor(ctx_gguf, t->name);
+            if (tid < 0) continue; // tensor has no data entry in the file
+
+            const std::string name(t->name);
+            const size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf, tid);
+            const size_t size = ggml_nbytes(t);
+
+            if (read_buf.size() < size) read_buf.resize(size);
+
+            // fseek takes a long; refuse offsets that would be silently truncated.
+            const long seek_pos = static_cast<long>(offset);
+            if (seek_pos < 0 || static_cast<size_t>(seek_pos) != offset) {
+                throw std::runtime_error("Tensor offset out of range for fseek: " + name);
+            }
+            // A failed seek or short read would otherwise upload stale bytes.
+            if (fseek(file, seek_pos, SEEK_SET) != 0 ||
+                fread(read_buf.data(), 1, size, file) != size) {
+                throw std::runtime_error("Failed to read tensor data: " + name);
+            }
+
+            // Upload to backend
+            ggml_backend_tensor_set(t, read_buf.data(), 0, size);
+
+            // Cache important buffers (host copies are filled only for I32 tensors)
+            if (name == "buffer_freq_indices") {
+                freq_indices_.resize(ggml_nelements(t));
+                if (t->type == GGML_TYPE_I32) {
+                    memcpy(freq_indices_.data(), read_buf.data(), size);
+                }
+                std::cout << "  Loaded freq_indices: " << freq_indices_.size() << " indices" << std::endl;
+            }
+            if (name == "buffer_num_bands_per_freq") {
+                num_bands_per_freq_.resize(ggml_nelements(t));
+                if (t->type == GGML_TYPE_I32) {
+                    memcpy(num_bands_per_freq_.data(), read_buf.data(), size);
+                }
+            }
+            if (name == "buffer_num_freqs_per_band") {
+                num_freqs_per_band_.resize(ggml_nelements(t));
+                if (t->type == GGML_TYPE_I32) {
+                    memcpy(num_freqs_per_band_.data(), read_buf.data(), size);
+                }
+            }
+        }
+    }
+
+    int n_tensors = gguf_get_n_tensors(ctx_gguf);
+    std::cout << "Loaded " << n_tensors << " tensors" << std::endl;
+}
+
+// Looks up a weight tensor by name in the weight context.
+// Returns nullptr when no weight context exists yet or when ggml_get_tensor
+// finds no tensor called `name`.
+ggml_tensor* MelBandRoformer::GetWeight(const std::string& name) const {
+    // Without a weight context there is nothing to search; report "not found"
+    // instead of handing a null context to ggml.
+    if (!ctx_weights_) {
+        return nullptr;
+    }
+    return ggml_get_tensor(ctx_weights_, name.c_str());
+}
+
+// Returns, for each of the num_bands_ bands, the band's input width:
+// num_freqs_per_band_[i] * 4 (stereo=2, complex=2).
+// Throws std::runtime_error when num_freqs_per_band_ holds fewer than
+// num_bands_ entries, which would otherwise be an out-of-bounds read.
+std::vector<int> MelBandRoformer::GetDimInputs() const {
+    if (num_bands_ > 0 && num_freqs_per_band_.size() < static_cast<size_t>(num_bands_)) {
+        throw std::runtime_error("num_freqs_per_band has fewer entries than num_bands");
+    }
+
+    std::vector<int> dim_inputs(num_bands_);
+    for (int i = 0; i < num_bands_; ++i) {
+        dim_inputs[i] = num_freqs_per_band_[i] * 4;  // stereo=2, complex=2
+    }
+    return dim_inputs;
+}
+
+// Returns the sum over all num_bands_ bands of num_freqs_per_band_[i] * 4
+// (stereo=2, complex=2).
+// Throws std::runtime_error when num_freqs_per_band_ holds fewer than
+// num_bands_ entries, which would otherwise be an out-of-bounds read.
+int MelBandRoformer::GetTotalDimInput() const {
+    if (num_bands_ > 0 && num_freqs_per_band_.size() < static_cast<size_t>(num_bands_)) {
+        throw std::runtime_error("num_freqs_per_band has fewer entries than num_bands");
+    }
+
+    int total = 0;
+    for (int i = 0; i < num_bands_; ++i) {
+        total += num_freqs_per_band_[i] * 4;
+    }
+    return total;
+}
+
+// ========== Graph Building Functions ==========
+
+// Adds the band-split stage to graph `gf`. For every band, a slice of `input`
+// is RMS-normalized, scaled by band_split.{i}.norm.weight, projected with
+// band_split.{i}.linear.weight/bias, and copied into that band's slot of a
+// freshly created output tensor.
+//
+// Input:  [total_dim_input, n_frames, batch]
+// Output: [dim, num_bands, n_frames, batch]
+//
+// Returns the output tensor, or nullptr (after logging to stderr) when a
+// required weight is missing.
+ggml_tensor* MelBandRoformer::BuildBandSplitGraph(
+    ggml_context* ctx,
+    ggml_tensor* input,
+    ggml_cgraph* gf,
+    int n_frames,
+    int batch
+) {
+    const std::vector<int> band_widths = GetDimInputs();
+
+    ggml_tensor* bands_out = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, dim_, num_bands_, n_frames, batch);
+
+    // Number of dim-0 elements of `input` already consumed by earlier bands.
+    size_t consumed = 0;
+
+    for (int band = 0; band < num_bands_; ++band) {
+        const int width = band_widths[band];
+        const std::string prefix = "band_split." + std::to_string(band);
+
+        // Slice of the input that belongs to this band.
+        ggml_tensor* band_in = ggml_view_3d(
+            ctx, input,
+            width, n_frames, batch,
+            input->nb[1], input->nb[2],
+            consumed * sizeof(float));
+
+        // RMSNorm scale: band_split.{i}.norm.weight
+        const std::string gamma_name = prefix + ".norm.weight";
+        ggml_tensor* gamma = GetWeight(gamma_name);
+        if (gamma == nullptr) {
+            std::cerr << "Missing weight: " << gamma_name << std::endl;
+            return nullptr;
+        }
+
+        ggml_tensor* band_normed = ggml_mul(ctx, ggml_rms_norm(ctx, band_in, 1e-12f), gamma);
+
+        // Linear layer: band_split.{i}.linear.weight / .bias
+        const std::string w_name = prefix + ".linear.weight";
+        const std::string b_name = prefix + ".linear.bias";
+        ggml_tensor* lin_w = GetWeight(w_name);
+        ggml_tensor* lin_b = GetWeight(b_name);
+        if (lin_w == nullptr || lin_b == nullptr) {
+            std::cerr << "Missing weight: " << w_name << " or " << b_name << std::endl;
+            return nullptr;
+        }
+
+        ggml_tensor* band_proj = ggml_add(ctx, ggml_mul_mat(ctx, lin_w, band_normed), lin_b);
+
+        // Destination: this band's slot along dim 1 of the output tensor.
+        ggml_tensor* dst = ggml_view_3d(
+            ctx, bands_out,
+            dim_, n_frames, batch,
+            bands_out->nb[2], bands_out->nb[3],
+            band * bands_out->nb[1]);
+
+        ggml_build_forward_expand(gf, ggml_cpy(ctx, band_proj, dst));
+
+        consumed += width;
+    }
+
+    return bands_out;
+}
+
+ggml_tensor* MelBandRoformer::BuildTransformersGraph(
+    ggml_context* ctx,
+    ggml_tensor* input,
+    ggml_cgraph* gf,
+    ggml_tensor* pos_time_exp,
+    ggml_tensor* pos_freq_exp,
+    int n_frames,
+    int batch
+) {
+    // Following test_10_full_model.cpp implementation
+    // Input: [dim, num_bands, n_frames, batch]
+    
+    const int D = dim_;
+    const int F = num_bands_;
+    const int T = n_frames;
+    const int B = batch;
+    const int HEADS = heads_;
+    const int DIM_HEAD = dim_head_;
+    const int DIM_INNER = HEADS * DIM_HEAD;
+    
+    ggml_tensor* x = input;
+    std::vector<ggml_tensor*> skip_outputs;
+    
+    for (int layer = 0; layer < depth_; ++layer) {
+        if (skip_connection_) {
+            for (ggml_tensor* s : skip_outputs) {
+                x = ggml_add(ctx, x, s);
+            }
+        }
+        // ========== TIME TRANSFORMER ==========
+        // Permute: [D, F, T, B] -> [D, T, F, B]
+        x = ggml_permute(ctx, x, 0, 2, 1, 3);
+        x = ggml_cont(ctx, x);
+        
+        int fb = F * B;
+        ggml_tensor* x_packed = ggml_reshape_3d(ctx, x, D, T, fb);
+        
+        std::string time_prefix = "blk." + std::to_string(layer) + ".time_attn";
+        std::string time_ff_prefix = "blk." + std::to_string(layer) + ".time_ff";
+        
+        // Attention Block
+        // blk.{l}.time_attn_norm.weight
+        ggml_tensor* t_attn_norm_w = GetWeight(time_prefix + "_norm.weight");
+        if (!t_attn_norm_w) { std::cerr << "Missing: " << time_prefix << "_norm.weight\n"; return nullptr; }
+        
+        ggml_tensor* x_norm = ggml_rms_norm(ctx, x_packed, 1e-12f);
+        x_norm = ggml_mul(ctx, x_norm, t_attn_norm_w);
+        
+        // blk.{l}.time_attn_qkv.weight
+        ggml_tensor* t_qkv_w = GetWeight(time_prefix + "_qkv.weight");
+        if (!t_qkv_w) { std::cerr << "Missing: " << time_prefix << "_qkv.weight\n"; return nullptr; }
+        
+        ggml_tensor* qkv_out = ggml_mul_mat(ctx, t_qkv_w, x_norm);
+        
+        // Split QKV
+        ggml_tensor* Q_view = ggml_view_4d(ctx, qkv_out, DIM_HEAD, T, HEADS, fb, 
+                                          qkv_out->nb[1], DIM_HEAD*sizeof(float), qkv_out->nb[2], 0);
+        ggml_tensor* K_view = ggml_view_4d(ctx, qkv_out, DIM_HEAD, T, HEADS, fb,
+                                          qkv_out->nb[1], DIM_HEAD*sizeof(float), qkv_out->nb[2], DIM_INNER*sizeof(float));
+        ggml_tensor* V_view = ggml_view_4d(ctx, qkv_out, DIM_HEAD, T, HEADS, fb,
+                                          qkv_out->nb[1], DIM_HEAD*sizeof(float), qkv_out->nb[2], 2*DIM_INNER*sizeof(float));
+        
+        ggml_tensor* Q = ggml_cont(ctx, Q_view);
+        ggml_tensor* K = ggml_cont(ctx, K_view);
+        ggml_tensor* V = ggml_cont(ctx, V_view);
+        
+        // RoPE with CUDA-compatible reshape
+        // Original Q/K shape: [DIM_HEAD, T, HEADS, fb]
+        // After permute: [DIM_HEAD, HEADS, T, fb]
+        // For CUDA RoPE: reshape to [DIM_HEAD, HEADS, T*fb, 1] and use expanded pos
+        ggml_tensor* Q_perm = ggml_permute(ctx, Q, 0, 2, 1, 3);
+        ggml_tensor* K_perm = ggml_permute(ctx, K, 0, 2, 1, 3);
+        ggml_tensor* Q_perm_cont = ggml_cont(ctx, Q_perm);
+        ggml_tensor* K_perm_cont = ggml_cont(ctx, K_perm);
+        
+        // Reshape to merge batch(fb) into sequence for CUDA RoPE compatibility
+        int T_fb = T * fb;
+        ggml_tensor* Q_flat = ggml_reshape_4d(ctx, Q_perm_cont, DIM_HEAD, HEADS, T_fb, 1);
+        ggml_tensor* K_flat = ggml_reshape_4d(ctx, K_perm_cont, DIM_HEAD, HEADS, T_fb, 1);
+        
+        // Use passed-in expanded position tensor (caller prepares [T*F*B] with repeating [0..T-1])
+        ggml_tensor* Q_rope_flat = ggml_rope_ext(ctx, Q_flat, pos_time_exp, nullptr, DIM_HEAD, 
+                                                 GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        ggml_tensor* K_rope_flat = ggml_rope_ext(ctx, K_flat, pos_time_exp, nullptr, DIM_HEAD,
+                                                 GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        
+        // Reshape back to [DIM_HEAD, HEADS, T, fb]
+        ggml_tensor* Q_rope_perm = ggml_reshape_4d(ctx, Q_rope_flat, DIM_HEAD, HEADS, T, fb);
+        ggml_tensor* K_rope_perm = ggml_reshape_4d(ctx, K_rope_flat, DIM_HEAD, HEADS, T, fb);
+        
+        ggml_tensor* Q_rope = ggml_permute(ctx, Q_rope_perm, 0, 2, 1, 3);
+        ggml_tensor* K_rope = ggml_permute(ctx, K_rope_perm, 0, 2, 1, 3);
+
+        // Flash Attention
+        // Inputs: [DIM_HEAD, T, HEADS, fb]
+        // Output: [DIM_HEAD, HEADS, T, fb] (permuted)
+        ggml_tensor* Q_fa = ggml_cont(ctx, Q_rope);
+        ggml_tensor* K_fa = ggml_cont(ctx, K_rope);
+        ggml_tensor* V_fa = V; // V is already contiguous [DIM_HEAD, T, HEADS, fb]
+
+        float scale = 1.0f / sqrtf(static_cast<float>(DIM_HEAD));
+        ggml_tensor* attn_out_fa = ggml_flash_attn_ext(ctx, Q_fa, K_fa, V_fa, nullptr, scale, 0.0f, 0.0f);
+        
+        // Permute back to [DIM_HEAD, T, HEADS, fb] to match original flow
+        ggml_tensor* attn_out_perm = ggml_permute(ctx, attn_out_fa, 0, 2, 1, 3);
+        ggml_tensor* attn_out_raw = ggml_cont(ctx, attn_out_perm);
+        
+        // Gates
+        // blk.{l}.time_attn_gate.weight/bias
+        ggml_tensor* t_gate_w = GetWeight(time_prefix + "_gate.weight");
+        ggml_tensor* t_gate_b = GetWeight(time_prefix + "_gate.bias");
+        if (!t_gate_w || !t_gate_b) { std::cerr << "Missing gates weights\n"; return nullptr; }
+        
+        ggml_tensor* gates = ggml_mul_mat(ctx, t_gate_w, x_norm);
+        gates = ggml_add(ctx, gates, t_gate_b);
+        gates = ggml_sigmoid(ctx, gates);
+        
+        ggml_tensor* gates_perm = ggml_permute(ctx, gates, 1, 0, 2, 3);
+        ggml_tensor* gates_bcast = ggml_view_4d(ctx, gates_perm, 1, T, HEADS, fb,
+                                               gates_perm->nb[0], gates_perm->nb[1], gates_perm->nb[2], 0);
+        
+        ggml_tensor* gated_out = ggml_mul(ctx, attn_out_raw, gates_bcast);
+        
+        ggml_tensor* out_perm = ggml_permute(ctx, gated_out, 0, 2, 1, 3);
+        ggml_tensor* out_cont = ggml_cont(ctx, out_perm);
+        ggml_tensor* out_flat = ggml_reshape_3d(ctx, out_cont, DIM_INNER, T, fb);
+        
+        // blk.{l}.time_attn_out.weight
+        ggml_tensor* t_attn_out_w = GetWeight(time_prefix + "_out.weight");
+        if (!t_attn_out_w) { std::cerr << "Missing to_out_weight\n"; return nullptr; }
+        
+        ggml_tensor* attn_block_out = ggml_mul_mat(ctx, t_attn_out_w, out_flat);
+        ggml_tensor* x_resid1 = ggml_add(ctx, x_packed, attn_block_out);
+        
+        // FeedForward Block
+        // blk.{l}.time_ff_norm.weight
+        ggml_tensor* t_ff_norm_w = GetWeight(time_ff_prefix + "_norm.weight");
+        if (!t_ff_norm_w) { std::cerr << "Missing ff norm\n"; return nullptr; }
+        
+        ggml_tensor* x_resid1_norm = ggml_rms_norm(ctx, x_resid1, 1e-12f);
+        x_resid1_norm = ggml_mul(ctx, x_resid1_norm, t_ff_norm_w);
+        
+        // blk.{l}.time_ff_in.weight/bias
+        ggml_tensor* t_ff_in_w = GetWeight(time_ff_prefix + "_in.weight");
+        ggml_tensor* t_ff_in_b = GetWeight(time_ff_prefix + "_in.bias");
+        if (!t_ff_in_w || !t_ff_in_b) { std::cerr << "Missing ff in weights\n"; return nullptr; }
+        
+        ggml_tensor* ff_proj_in = ggml_mul_mat(ctx, t_ff_in_w, x_resid1_norm);
+        ff_proj_in = ggml_add(ctx, ff_proj_in, t_ff_in_b);
+        
+        ggml_tensor* gelu_out = ggml_gelu_erf(ctx, ff_proj_in);
+        
+
+        
+        // blk.{l}.time_ff_out.weight/bias
+        ggml_tensor* t_ff_out_w = GetWeight(time_ff_prefix + "_out.weight");
+        ggml_tensor* t_ff_out_b = GetWeight(time_ff_prefix + "_out.bias");
+        if (!t_ff_out_w || !t_ff_out_b) { std::cerr << "Missing ff out weights\n"; return nullptr; }
+        
+        ggml_tensor* ff_block_out = ggml_mul_mat(ctx, t_ff_out_w, gelu_out);
+        ff_block_out = ggml_add(ctx, ff_block_out, t_ff_out_b);
+        
+        x_packed = ggml_add(ctx, x_resid1, ff_block_out);
+        
+
+        
+        // Time Transformer Final Norm
+        // blk.{l}.time_norm.weight
+        std::string time_norm_name = "blk." + std::to_string(layer) + ".time_norm.weight";
+        ggml_tensor* time_norm_w = GetWeight(time_norm_name);
+        if (!time_norm_w) { std::cerr << "Missing: " << time_norm_name << "\n"; return nullptr; }
+        
+        x_packed = ggml_rms_norm(ctx, x_packed, 1e-12f);
+        x_packed = ggml_mul(ctx, x_packed, time_norm_w);
+        
+        x = ggml_reshape_4d(ctx, x_packed, D, T, F, B);
+        x = ggml_permute(ctx, x, 0, 2, 1, 3);
+        x = ggml_cont(ctx, x);
+        
+        // ========== FREQ TRANSFORMER ==========
+        int tb = T * B;
+        ggml_tensor* x_freq_packed = ggml_reshape_3d(ctx, x, D, F, tb);
+        
+
+        
+        std::string freq_prefix = "blk." + std::to_string(layer) + ".freq_attn";
+        std::string freq_ff_prefix = "blk." + std::to_string(layer) + ".freq_ff";
+        
+        ggml_tensor* f_attn_norm_w = GetWeight(freq_prefix + "_norm.weight");
+        if (!f_attn_norm_w) { std::cerr << "Missing freq norm\n"; return nullptr; }
+        
+        ggml_tensor* x_fnorm = ggml_rms_norm(ctx, x_freq_packed, 1e-12f);
+        x_fnorm = ggml_mul(ctx, x_fnorm, f_attn_norm_w);
+        
+
+        
+        ggml_tensor* f_qkv_w = GetWeight(freq_prefix + "_qkv.weight");
+        if (!f_qkv_w) { std::cerr << "Missing freq qkv\n"; return nullptr; }
+        
+        ggml_tensor* f_qkv_out = ggml_mul_mat(ctx, f_qkv_w, x_fnorm);
+        
+        ggml_tensor* fQ_view = ggml_view_4d(ctx, f_qkv_out, DIM_HEAD, F, HEADS, tb, 
+                                           f_qkv_out->nb[1], DIM_HEAD*sizeof(float), f_qkv_out->nb[2], 0);
+        ggml_tensor* fK_view = ggml_view_4d(ctx, f_qkv_out, DIM_HEAD, F, HEADS, tb,
+                                           f_qkv_out->nb[1], DIM_HEAD*sizeof(float), f_qkv_out->nb[2], DIM_INNER*sizeof(float));
+        ggml_tensor* fV_view = ggml_view_4d(ctx, f_qkv_out, DIM_HEAD, F, HEADS, tb,
+                                           f_qkv_out->nb[1], DIM_HEAD*sizeof(float), f_qkv_out->nb[2], 2*DIM_INNER*sizeof(float));
+        
+        ggml_tensor* fQ = ggml_cont(ctx, fQ_view);
+        ggml_tensor* fK = ggml_cont(ctx, fK_view);
+        ggml_tensor* fV = ggml_cont(ctx, fV_view);
+        
+        // RoPE with CUDA-compatible reshape for Freq Transformer
+        // fQ/fK shape after permute: [DIM_HEAD, HEADS, F, tb]
+        ggml_tensor* fQ_perm = ggml_permute(ctx, fQ, 0, 2, 1, 3);
+        ggml_tensor* fK_perm = ggml_permute(ctx, fK, 0, 2, 1, 3);
+        ggml_tensor* fQ_perm_cont = ggml_cont(ctx, fQ_perm);
+        ggml_tensor* fK_perm_cont = ggml_cont(ctx, fK_perm);
+        
+        // Reshape to merge batch(tb) into sequence for CUDA RoPE compatibility
+        int F_tb = F * tb;
+        ggml_tensor* fQ_flat = ggml_reshape_4d(ctx, fQ_perm_cont, DIM_HEAD, HEADS, F_tb, 1);
+        ggml_tensor* fK_flat = ggml_reshape_4d(ctx, fK_perm_cont, DIM_HEAD, HEADS, F_tb, 1);
+        
+        // Use passed-in expanded position tensor (caller prepares [F*T*B] with repeating [0..F-1])
+        ggml_tensor* fQ_rope_flat = ggml_rope_ext(ctx, fQ_flat, pos_freq_exp, nullptr, DIM_HEAD, 
+                                                  GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        ggml_tensor* fK_rope_flat = ggml_rope_ext(ctx, fK_flat, pos_freq_exp, nullptr, DIM_HEAD,
+                                                  GGML_ROPE_TYPE_NORMAL, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        
+        // Reshape back to [DIM_HEAD, HEADS, F, tb]
+        ggml_tensor* fQ_rope_perm = ggml_reshape_4d(ctx, fQ_rope_flat, DIM_HEAD, HEADS, F, tb);
+        ggml_tensor* fK_rope_perm = ggml_reshape_4d(ctx, fK_rope_flat, DIM_HEAD, HEADS, F, tb);
+        
+        ggml_tensor* fQ_rope = ggml_permute(ctx, fQ_rope_perm, 0, 2, 1, 3);
+        ggml_tensor* fK_rope = ggml_permute(ctx, fK_rope_perm, 0, 2, 1, 3);
+        
+        // Flash Attention (Freq)
+        // Inputs: [DIM_HEAD, F, HEADS, tb]
+        ggml_tensor* fQ_fa = ggml_cont(ctx, fQ_rope);
+        ggml_tensor* fK_fa = ggml_cont(ctx, fK_rope);
+        ggml_tensor* fV_fa = fV; // fV is contiguous [DIM_HEAD, F, HEADS, tb]
+
+        // 'scale' (1/sqrt(DIM_HEAD)) is declared in the Time Transformer section earlier in
+        // this same loop body and is still in scope, so it is reused here rather than redeclared.
+        
+        ggml_tensor* f_attn_out_fa = ggml_flash_attn_ext(ctx, fQ_fa, fK_fa, fV_fa, nullptr, scale, 0.0f, 0.0f);
+
+        // Permute output back to [DIM_HEAD, F, HEADS, tb]
+        ggml_tensor* f_attn_out_perm = ggml_permute(ctx, f_attn_out_fa, 0, 2, 1, 3);
+        ggml_tensor* f_attn_out_raw = ggml_cont(ctx, f_attn_out_perm);
+        
+
+        
+        ggml_tensor* f_gate_w = GetWeight(freq_prefix + "_gate.weight");
+        ggml_tensor* f_gate_b = GetWeight(freq_prefix + "_gate.bias");
+        if (!f_gate_w || !f_gate_b) { std::cerr << "Missing freq gates\n"; return nullptr; }
+        
+        ggml_tensor* f_gates = ggml_mul_mat(ctx, f_gate_w, x_fnorm);
+        f_gates = ggml_add(ctx, f_gates, f_gate_b);
+        f_gates = ggml_sigmoid(ctx, f_gates);
+        
+        ggml_tensor* f_gates_perm = ggml_permute(ctx, f_gates, 1, 0, 2, 3);
+        ggml_tensor* f_gates_bcast = ggml_view_4d(ctx, f_gates_perm, 1, F, HEADS, tb,
+                                                  f_gates_perm->nb[0], f_gates_perm->nb[1], f_gates_perm->nb[2], 0);
+        
+        ggml_tensor* f_gated_out = ggml_mul(ctx, f_attn_out_raw, f_gates_bcast);
+        
+        ggml_tensor* f_out_perm = ggml_permute(ctx, f_gated_out, 0, 2, 1, 3);
+        ggml_tensor* f_out_cont = ggml_cont(ctx, f_out_perm);
+        ggml_tensor* f_out_flat = ggml_reshape_3d(ctx, f_out_cont, DIM_INNER, F, tb);
+        
+
+        
+        ggml_tensor* f_attn_out_w = GetWeight(freq_prefix + "_out.weight");
+        if (!f_attn_out_w) { std::cerr << "Missing freq to_out\n"; return nullptr; }
+        
+        ggml_tensor* f_attn_block_out = ggml_mul_mat(ctx, f_attn_out_w, f_out_flat);
+        ggml_tensor* f_x_resid1 = ggml_add(ctx, x_freq_packed, f_attn_block_out);
+        
+
+        
+        // Freq FeedForward
+        ggml_tensor* f_ff_norm_w = GetWeight(freq_ff_prefix + "_norm.weight");
+        if (!f_ff_norm_w) { std::cerr << "Missing freq ff norm\n"; return nullptr; }
+        
+        ggml_tensor* f_x_resid1_norm = ggml_rms_norm(ctx, f_x_resid1, 1e-12f);
+        f_x_resid1_norm = ggml_mul(ctx, f_x_resid1_norm, f_ff_norm_w);
+        
+        x_fnorm = ggml_mul(ctx, x_fnorm, f_attn_norm_w);
+        
+        ggml_tensor* f_ff_in_w = GetWeight(freq_ff_prefix + "_in.weight");
+        ggml_tensor* f_ff_in_b = GetWeight(freq_ff_prefix + "_in.bias");
+        if (!f_ff_in_w || !f_ff_in_b) { std::cerr << "Missing freq ff in\n"; return nullptr; }
+        
+        ggml_tensor* f_ff_proj_in = ggml_mul_mat(ctx, f_ff_in_w, f_x_resid1_norm);
+        f_ff_proj_in = ggml_add(ctx, f_ff_proj_in, f_ff_in_b);
+        
+        ggml_tensor* f_gelu_out = ggml_gelu_erf(ctx, f_ff_proj_in);
+        
+
+        
+        ggml_tensor* f_ff_out_w = GetWeight(freq_ff_prefix + "_out.weight");
+        ggml_tensor* f_ff_out_b = GetWeight(freq_ff_prefix + "_out.bias");
+        if (!f_ff_out_w || !f_ff_out_b) { std::cerr << "Missing freq ff out\n"; return nullptr; }
+        
+        ggml_tensor* f_ff_block_out = ggml_mul_mat(ctx, f_ff_out_w, f_gelu_out);
+        f_ff_block_out = ggml_add(ctx, f_ff_block_out, f_ff_out_b);
+        
+        x_freq_packed = ggml_add(ctx, f_x_resid1, f_ff_block_out);
+        
+        // Freq Transformer Final Norm
+        // blk.{l}.freq_norm.weight
+        std::string freq_norm_name = "blk." + std::to_string(layer) + ".freq_norm.weight";
+        ggml_tensor* freq_norm_w = GetWeight(freq_norm_name);
+        if (!freq_norm_w) { std::cerr << "Missing: " << freq_norm_name << "\n"; return nullptr; }
+        
+        x_freq_packed = ggml_rms_norm(ctx, x_freq_packed, 1e-12f);
+        x_freq_packed = ggml_mul(ctx, x_freq_packed, freq_norm_w);
+        
+        x = ggml_reshape_4d(ctx, x_freq_packed, D, F, T, B);
+        
+        if (skip_connection_) {
+            skip_outputs.push_back(x);
+        }
+    }
+    
+    return x;
+}
+
+/**
+ * Build the MaskEstimator subgraph.
+ *
+ * For every stem and every band, the band's feature slice is pushed through a
+ * three-stage MLP (mul_mat + bias, tanh, mul_mat + bias, tanh, mul_mat + bias)
+ * followed by a GLU (first half * sigmoid(second half)). Each band result is
+ * copied into its slice of one shared output tensor.
+ *
+ * @param ctx      Computation context used to create the graph tensors
+ * @param input    Input tensor [dim, num_bands, n_frames, batch]
+ * @param gf       Graph that receives the copy nodes and the output node
+ * @param n_frames Number of time frames (dimension 2 of input)
+ * @param batch    Batch size (dimension 3 of input)
+ * @return Output tensor [total_out_dim, num_stems, n_frames, batch], or
+ *         nullptr if any required weight is missing
+ */
+ggml_tensor* MelBandRoformer::BuildMaskEstimatorGraph(
+    ggml_context* ctx,
+    ggml_tensor* input,
+    ggml_cgraph* gf,
+    int n_frames,
+    int batch
+) {
+    const int DIM = dim_;
+    const int NUM_BANDS = num_bands_;
+    const int NUM_STEMS = num_stems_;
+
+    // Per-band output width, derived from the last MLP layer of stem 0.
+    // This assumes every stem uses the same architecture.
+    std::vector<int> band_out_dims(NUM_BANDS);
+    int total_out_dim = 0;
+
+    for (int b = 0; b < NUM_BANDS; ++b) {
+        std::string w4_name = "mask_est.0.freq." + std::to_string(b) + ".mlp.4.weight";
+        ggml_tensor* w4 = GetWeight(w4_name);
+        if (!w4) {
+            std::cerr << "Missing weight for dim check: " << w4_name << std::endl;
+            return nullptr;
+        }
+        band_out_dims[b] = static_cast<int>(w4->ne[1]) / 2;  // GLU halves the dimension
+        total_out_dim += band_out_dims[b];
+    }
+
+    ggml_tensor* x = input;  // [D, F, T, B]
+
+    // Destination for all stems/bands: [total_out_dim, num_stems, n_frames, batch].
+    // It is not zero-initialised; every element is overwritten by the copies below.
+    ggml_tensor* mask_output = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, total_out_dim, NUM_STEMS, n_frames, batch);
+
+    for (int s = 0; s < NUM_STEMS; ++s) {
+        size_t mask_offset_elements = 0;
+
+        for (int b = 0; b < NUM_BANDS; ++b) {
+            // View of this band's features: [DIM, n_frames, batch]
+            ggml_tensor* band_in = ggml_view_3d(ctx, x,
+                                                DIM, n_frames, batch,
+                                                x->nb[2], x->nb[3],
+                                                b * x->nb[1]);
+
+            // mask_est.{s}.freq.{b}.mlp...
+            std::string prefix = "mask_est." + std::to_string(s) + ".freq." + std::to_string(b) + ".mlp.";
+
+            // MLP Layer 0
+            ggml_tensor* w0 = GetWeight(prefix + "0.weight");
+            ggml_tensor* bias0 = GetWeight(prefix + "0.bias");
+            if (!w0 || !bias0) { std::cerr << "Missing mask weights s=" << s << " b=" << b << "\n"; return nullptr; }
+
+            ggml_tensor* layer0 = ggml_mul_mat(ctx, w0, band_in);
+            layer0 = ggml_add(ctx, layer0, bias0);
+            layer0 = ggml_tanh(ctx, layer0);
+
+            // MLP Layer 2 (checked like layer 0 so a missing tensor is reported, not dereferenced)
+            ggml_tensor* w2 = GetWeight(prefix + "2.weight");
+            ggml_tensor* bias2 = GetWeight(prefix + "2.bias");
+            if (!w2 || !bias2) { std::cerr << "Missing mask weights (mlp.2) s=" << s << " b=" << b << "\n"; return nullptr; }
+
+            ggml_tensor* layer2 = ggml_mul_mat(ctx, w2, layer0);
+            layer2 = ggml_add(ctx, layer2, bias2);
+            layer2 = ggml_tanh(ctx, layer2);
+
+            // MLP Layer 4
+            ggml_tensor* w4 = GetWeight(prefix + "4.weight");
+            ggml_tensor* bias4 = GetWeight(prefix + "4.bias");
+            if (!w4 || !bias4) { std::cerr << "Missing mask weights (mlp.4) s=" << s << " b=" << b << "\n"; return nullptr; }
+
+            ggml_tensor* mlp_out = ggml_mul_mat(ctx, w4, layer2);
+            mlp_out = ggml_add(ctx, mlp_out, bias4);
+
+            // GLU: split dimension 0 into two halves, a * sigmoid(b)
+            int dim_out = band_out_dims[b];
+
+            ggml_tensor* glu_a = ggml_view_3d(ctx, mlp_out,
+                                              dim_out, n_frames, batch,
+                                              mlp_out->nb[1], mlp_out->nb[2], 0);
+            ggml_tensor* glu_b = ggml_view_3d(ctx, mlp_out,
+                                              dim_out, n_frames, batch,
+                                              mlp_out->nb[1], mlp_out->nb[2],
+                                              dim_out * sizeof(float));
+
+            glu_a = ggml_cont(ctx, glu_a);
+            glu_b = ggml_cont(ctx, glu_b);
+
+            ggml_tensor* glu_b_sig = ggml_sigmoid(ctx, glu_b);
+            ggml_tensor* band_out = ggml_mul(ctx, glu_a, glu_b_sig);
+
+            // Destination slice mask_output[offset:offset+dim_out, s, :, :]:
+            // the byte offset combines the position inside dimension 0 with the stem index.
+            size_t dest_offset_bytes = (mask_offset_elements * sizeof(float)) + (s * mask_output->nb[1]);
+
+            ggml_tensor* dst_view = ggml_view_3d(ctx, mask_output,
+                                                 dim_out, n_frames, batch,
+                                                 mask_output->nb[2], // Time stride
+                                                 mask_output->nb[3], // Batch stride
+                                                 dest_offset_bytes); // Offset to correct freq-bin and stem
+
+            ggml_build_forward_expand(gf, ggml_cpy(ctx, band_out, dst_view));
+
+            mask_offset_elements += dim_out;
+        }
+    }
+
+    // Expose a duplicate of the filled tensor as the graph output.
+    ggml_tensor* mask_check = ggml_dup(ctx, mask_output);
+    ggml_set_output(mask_check);
+    ggml_build_forward_expand(gf, mask_check);
+
+    return mask_check;
+}

+ 157 - 0
src/model.h

@@ -0,0 +1,157 @@
+#pragma once
+#include <string>
+#include <vector>
+#include <memory>
+#include <ggml.h>
+#include <ggml-backend.h>
+#include <ggml-alloc.h>
+
+// Forward declarations
+struct ggml_context;
+struct ggml_cgraph;
+struct gguf_context;
+
+/**
+ * MelBandRoformer Model
+ * 
+ * This class handles:
+ * 1. Loading weights from GGUF file
+ * 2. Providing access to weights and buffers
+ * 3. Building GGML computation graphs for each component
+ * 
+ * Execution is handled by test/inference code using these graphs.
+ *
+ * NOTE(review): the class stores raw ggml context/backend/buffer pointers and
+ * declares a destructor but no copy operations; presumably instances must not
+ * be copied — confirm against the destructor in the implementation file.
+ */
+class MelBandRoformer {
+public:
+    MelBandRoformer();
+    ~MelBandRoformer();
+
+    // Initialize model from GGUF file
+    void Initialize(const std::string& model_path);
+    
+    // ========== Accessors for weights and config ==========
+    
+    // Get weight tensor by name.
+    // Callers in this codebase treat a null result as "weight missing".
+    ggml_tensor* GetWeight(const std::string& name) const;
+    
+    // Get backend
+    ggml_backend_t GetBackend() const { return backend_; }
+    
+    // Get weights context (for creating tensors from weights)
+    ggml_context* GetWeightsContext() const { return ctx_weights_; }
+    
+    // ========== Model Config Accessors ==========
+    int GetDim() const { return dim_; }
+    int GetDepth() const { return depth_; }
+    int GetNumBands() const { return num_bands_; }
+    int GetNFFT() const { return n_fft_; }
+    int GetHopLength() const { return hop_length_; }
+    int GetWinLength() const { return win_length_; }
+    int GetNumStems() const { return num_stems_; }
+    bool GetSkipConnection() const { return skip_connection_; }
+    bool GetSTFTNormalized() const { return stft_normalized_; }
+    bool GetZeroDC() const { return zero_dc_; }
+    
+    // Inference defaults (from GGUF, can be overridden at runtime)
+    int GetDefaultChunkSize() const { return default_chunk_size_; }
+    int GetDefaultNumOverlap() const { return default_num_overlap_; }
+    
+    // ========== Buffer Accessors ==========
+    const std::vector<int>& GetFreqIndices() const { return freq_indices_; }
+    const std::vector<int>& GetNumBandsPerFreq() const { return num_bands_per_freq_; }
+    const std::vector<int>& GetNumFreqsPerBand() const { return num_freqs_per_band_; }
+    
+    // Calculate dim_inputs for each band (num_freqs * 4 for stereo complex)
+    std::vector<int> GetDimInputs() const;
+    int GetTotalDimInput() const;
+    
+    // ========== Graph Building Functions ==========
+    // These functions build GGML computation graph nodes.
+    // They don't execute - execution is done by caller with gallocr + backend_graph_compute.
+    
+    /**
+     * Build BandSplit subgraph
+     * @param ctx Computation context (must have no_alloc=true)
+     * @param input Input tensor [total_dim_input, n_frames, batch]
+     * @param gf Graph to add nodes to
+     * @param n_frames Number of frames (the n_frames dimension of input)
+     * @param batch Batch size (the batch dimension of input), defaults to 1
+     * @return Output tensor [dim, num_bands, n_frames, batch]
+     */
+    ggml_tensor* BuildBandSplitGraph(
+        ggml_context* ctx,
+        ggml_tensor* input,
+        ggml_cgraph* gf,
+        int n_frames,
+        int batch = 1
+    );
+    
+    /**
+     * Build Transformer layers subgraph (Time + Freq transformers)
+     * @param ctx Computation context
+     * @param input Input tensor [dim, num_bands, n_frames, batch]
+     * @param gf Graph to add nodes to
+     * @param pos_time_exp Expanded position tensor for time RoPE [T * F * B], with repeating [0..T-1] * (F*B) times
+     * @param pos_freq_exp Expanded position tensor for freq RoPE [F * T * B], with repeating [0..F-1] * (T*B) times
+     * @param n_frames Number of frames (the n_frames dimension of input)
+     * @param batch Batch size (the batch dimension of input), defaults to 1
+     * @return Output tensor [dim, num_bands, n_frames, batch]
+     */
+    ggml_tensor* BuildTransformersGraph(
+        ggml_context* ctx,
+        ggml_tensor* input,
+        ggml_cgraph* gf,
+        ggml_tensor* pos_time_exp,
+        ggml_tensor* pos_freq_exp,
+        int n_frames,
+        int batch = 1
+    );
+    
+    /**
+     * Build MaskEstimator subgraph
+     * @param ctx Computation context
+     * @param input Input tensor [dim, num_bands, n_frames, batch]
+     * @param gf Graph to add nodes to
+     * @param n_frames Number of frames (the n_frames dimension of input)
+     * @param batch Batch size (the batch dimension of input), defaults to 1
+     * @return Output tensor [total_mask_dim, num_stems, n_frames, batch]
+     */
+    ggml_tensor* BuildMaskEstimatorGraph(
+        ggml_context* ctx,
+        ggml_tensor* input,
+        ggml_cgraph* gf,
+        int n_frames,
+        int batch = 1
+    );
+
+private:
+    // GGML Contexts
+    ggml_context* ctx_weights_ = nullptr;
+
+    // Backend
+    ggml_backend_t backend_ = nullptr;
+    ggml_backend_buffer_t buffer_weights_ = nullptr;
+
+    // Model Config (in-class defaults; presumably replaced by values read from the GGUF file — confirm in LoadWeights)
+    int dim_ = 384;
+    int depth_ = 6;
+    int num_bands_ = 60;
+    int heads_ = 8;
+    int dim_head_ = 64;
+    int n_fft_ = 2048;
+    int hop_length_ = 441;
+    int win_length_ = 2048;
+    
+    // New Params
+    int num_stems_ = 1;
+    bool skip_connection_ = false;
+    bool stft_normalized_ = false;
+    bool zero_dc_ = false;
+    int mask_estimator_depth_ = 1;
+    
+    // Inference defaults
+    int default_chunk_size_ = 352800;
+    int default_num_overlap_ = 2;
+    
+    // Buffers loaded from GGUF
+    std::vector<int> freq_indices_;
+    std::vector<int> num_bands_per_freq_;
+    std::vector<int> num_freqs_per_band_;
+    
+    // Helper to load GGUF
+    void LoadWeights(const std::string& path);
+};

+ 371 - 0
src/stft.h

@@ -0,0 +1,371 @@
+#pragma once
+/**
+ * stft.h - STFT/ISTFT implementation
+ * 
+ * Implements:
+ * - Hann window generation
+ * - Center padding (reflect mode)
+ * - Frame extraction
+ * - Radix-2 Cooley-Tukey FFT
+ * - Real-to-complex FFT (rfft)
+ * - Inverse FFT (irfft)
+ * - Full STFT/ISTFT matching torch.stft/torch.istft
+ */
+
+#include <cmath>
+#include <vector>
+#include <complex>
+#include <cstring>
+#include <algorithm> // for std::swap
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace stft {
+
+// Complex number type
+using Complex = std::complex<float>;
+
+//=============================================================================
+// Window Functions
+//=============================================================================
+
+/**
+ * Generate Hann window matching torch.hann_window()
+ * PyTorch uses periodic=True by default for STFT compatibility
+ * Periodic formula: 0.5 * (1 - cos(2*pi*n / N))
+ * Symmetric formula: 0.5 * (1 - cos(2*pi*n / (N-1)))
+ *
+ * @param out      Destination array with room for `size` floats
+ * @param size     Window length; nothing is written when size <= 0
+ * @param periodic Selects the periodic (true) or symmetric (false) formula
+ */
+inline void hann_window(float* out, int size, bool periodic = true) {
+    if (size <= 0) {
+        return;
+    }
+    if (size == 1) {
+        // A one-sample window is defined as [1] in torch.hann_window. The closed
+        // form would give 0 (periodic) or divide by zero (symmetric, N-1 == 0).
+        out[0] = 1.0f;
+        return;
+    }
+    int divisor = periodic ? size : (size - 1);
+    for (int i = 0; i < size; ++i) {
+        out[i] = 0.5f * (1.0f - std::cos(2.0f * static_cast<float>(M_PI) * i / divisor));
+    }
+}
+
+//=============================================================================
+// FFT Implementation (Cooley-Tukey Radix-2)
+//=============================================================================
+
+/**
+ * Reorder `data` into bit-reversed index order (in place).
+ * This is the permutation step that fft_radix2 runs before its butterflies.
+ * @param data Complex array of size n
+ * @param n Size of array
+ */
+inline void bit_reverse(Complex* data, int n) {
+    int reversed = 0;
+    for (int idx = 0; idx < n - 1; ++idx) {
+        // Swap each pair once, when the mirrored index is the larger one.
+        if (reversed > idx) {
+            std::swap(data[idx], data[reversed]);
+        }
+        // Advance `reversed` as a counter that carries from the top bit down.
+        int bit = n >> 1;
+        for (; reversed >= bit && bit > 0; bit >>= 1) {
+            reversed -= bit;
+        }
+        reversed += bit;
+    }
+}
+
+/**
+ * Iterative in-place radix-2 Cooley-Tukey FFT.
+ * @param data Complex array of size n (must be power of 2)
+ * @param n Size of array
+ * @param inverse If true, compute the inverse transform (result divided by n)
+ */
+inline void fft_radix2(Complex* data, int n, bool inverse = false) {
+    bit_reverse(data, n);
+
+    // Butterfly passes over spans of 2, 4, 8, ... n
+    for (int span = 2; span <= n; span <<= 1) {
+        const int half = span / 2;
+        const float theta = (inverse ? 2.0f : -2.0f) * static_cast<float>(M_PI) / span;
+        const Complex step(std::cos(theta), std::sin(theta));
+
+        for (int base = 0; base < n; base += span) {
+            Complex twiddle(1.0f, 0.0f);
+            for (int k = 0; k < half; ++k) {
+                Complex& lower = data[base + k];
+                Complex& upper = data[base + k + half];
+                const Complex even = lower;
+                const Complex odd = twiddle * upper;
+                lower = even + odd;
+                upper = even - odd;
+                twiddle *= step;
+            }
+        }
+    }
+
+    if (!inverse) {
+        return;
+    }
+
+    // The inverse transform is scaled by 1/n
+    for (Complex* it = data; it < data + n; ++it) {
+        *it /= static_cast<float>(n);
+    }
+}
+
+/**
+ * One-sided FFT of a real signal (counterpart of torch.fft.rfft).
+ * @param input Real input array of size n
+ * @param output Complex output array of size n/2+1
+ * @param n Size of input (must be power of 2)
+ */
+inline void rfft(const float* input, Complex* output, int n) {
+    // Promote the real samples to complex values with zero imaginary part
+    std::vector<Complex> spectrum(n);
+    std::transform(input, input + n, spectrum.begin(),
+                   [](float sample) { return Complex(sample, 0.0f); });
+
+    // Full-length forward transform
+    fft_radix2(spectrum.data(), n, false);
+
+    // Only bins 0..n/2 are returned
+    const int kept_bins = n / 2 + 1;
+    std::copy(spectrum.begin(), spectrum.begin() + kept_bins, output);
+}
+
+/**
+ * Inverse of rfft: one-sided complex spectrum back to a real signal
+ * (counterpart of torch.fft.irfft).
+ * @param input Complex input array of size n/2+1
+ * @param output Real output array of size n
+ * @param n_out Size of output (must be power of 2)
+ */
+inline void irfft(const Complex* input, float* output, int n_out) {
+    const int half_bins = n_out / 2 + 1;
+
+    // Lower half comes from the input; the upper half is its mirrored conjugate
+    std::vector<Complex> full(n_out);
+    std::copy(input, input + half_bins, full.begin());
+    for (int k = half_bins; k < n_out; ++k) {
+        full[k] = std::conj(full[n_out - k]);
+    }
+
+    fft_radix2(full.data(), n_out, true);
+
+    // Keep only the real component of each sample
+    std::transform(full.begin(), full.end(), output,
+                   [](const Complex& value) { return value.real(); });
+}
+//=============================================================================
+// STFT Implementation
+//=============================================================================
+
+/**
+ * Short-Time Fourier Transform matching torch.stft
+ * 
+ * @param audio Input audio [n_samples]
+ * @param n_samples Number of samples
+ * @param n_fft FFT size
+ * @param hop_length Hop between frames
+ * @param win_length Window length
+ * @param window Window function [win_length]
+ * @param center If true, pad signal on both sides
+ * @param output Output complex spectrogram [n_freq, n_frames, 2] (real, imag pairs)
+ * @param n_frames_out Output parameter: number of frames
+ *
+ * When the parameters are degenerate (non-positive sizes) or the signal is too
+ * short to hold one full frame, *n_frames_out is set to 0 and output is left
+ * untouched.
+ */
+inline void compute_stft(
+    const float* audio,
+    int n_samples,
+    int n_fft,
+    int hop_length,
+    int win_length,
+    const float* window,
+    bool center,
+    float* output,
+    int* n_frames_out
+) {
+    *n_frames_out = 0;
+
+    // Nothing can be framed from empty input or non-positive sizes. This also
+    // keeps the reflect padding below from indexing an empty signal.
+    if (n_samples <= 0 || n_fft <= 0 || hop_length <= 0 || win_length <= 0) {
+        return;
+    }
+
+    // Center padding
+    const int pad_amount = center ? n_fft / 2 : 0;
+    const int padded_len = n_samples + 2 * pad_amount;
+
+    // Fewer samples than one frame: truncating integer division would otherwise
+    // report one frame and read past the end of the padded buffer.
+    if (padded_len < n_fft) {
+        return;
+    }
+
+    std::vector<float> padded(padded_len);
+
+    if (center) {
+        // Left pad (reflect); indices are clamped when the signal is shorter than the pad
+        for (int i = 0; i < pad_amount; ++i) {
+            int src_idx = pad_amount - i;
+            if (src_idx >= n_samples) src_idx = n_samples - 1;
+            padded[i] = audio[src_idx];
+        }
+        // Center (copy)
+        std::memcpy(padded.data() + pad_amount, audio, n_samples * sizeof(float));
+        // Right pad (reflect), clamped the same way
+        for (int i = 0; i < pad_amount; ++i) {
+            int src_idx = n_samples - 2 - i;
+            if (src_idx < 0) src_idx = 0;
+            padded[pad_amount + n_samples + i] = audio[src_idx];
+        }
+    } else {
+        std::memcpy(padded.data(), audio, n_samples * sizeof(float));
+    }
+
+    // Number of frames: (L - N) / H + 1
+    const int n_frames = 1 + (padded_len - n_fft) / hop_length;
+    *n_frames_out = n_frames;
+
+    // Number of output frequency bins
+    const int n_freq = n_fft / 2 + 1;
+
+    // Window centred inside an n_fft-long buffer when win_length < n_fft;
+    // otherwise the first n_fft window samples are used.
+    std::vector<float> window_padded(n_fft, 0.0f);
+    if (win_length < n_fft) {
+        int left = (n_fft - win_length) / 2;
+        std::memcpy(window_padded.data() + left, window, win_length * sizeof(float));
+    } else {
+        std::memcpy(window_padded.data(), window, n_fft * sizeof(float));
+    }
+
+    // Process each frame
+    #ifdef USE_OPENMP
+    #pragma omp parallel for
+    #endif
+    for (int f = 0; f < n_frames; ++f) {
+        const int start = f * hop_length;
+
+        // Scratch buffers are declared inside the loop body so each iteration
+        // (and therefore each OpenMP thread) works on its own storage.
+        std::vector<float> frame(n_fft);
+        std::vector<Complex> fft_out(n_freq);
+
+        // Extract and window frame
+        for (int i = 0; i < n_fft; ++i) {
+            frame[i] = padded[start + i] * window_padded[i];
+        }
+
+        // Compute FFT
+        rfft(frame.data(), fft_out.data(), n_fft);
+
+        // Store in output [n_freq, n_frames, 2] format; the index is computed
+        // in size_t so long signals cannot overflow int.
+        for (int k = 0; k < n_freq; ++k) {
+            const size_t base = (static_cast<size_t>(k) * n_frames + f) * 2;
+            output[base + 0] = fft_out[k].real();
+            output[base + 1] = fft_out[k].imag();
+        }
+    }
+}
+
+/**
+ * Inverse Short-Time Fourier Transform matching torch.istft
+ * 
+ * @param stft_data Input complex spectrogram [n_freq, n_frames, 2]
+ * @param n_freq Number of frequency bins
+ * @param n_frames Number of frames
+ * @param n_fft FFT size
+ * @param hop_length Hop between frames
+ * @param win_length Window length
+ * @param window Window function [win_length]
+ * @param center If true, signal was centered
+ * @param length Expected output length (or 0 for auto)
+ * @param output Output audio
+ *
+ * With no frames or non-positive sizes, an explicitly requested `length` is
+ * filled with zeros and nothing else is written.
+ */
+inline void compute_istft(
+    const float* stft_data,
+    int n_freq,
+    int n_frames,
+    int n_fft,
+    int hop_length,
+    int win_length,
+    const float* window,
+    bool center,
+    int length,
+    float* output
+) {
+    // Degenerate input: there is no signal to reconstruct. Returning early also
+    // avoids sizing the buffers below from a negative length.
+    if (n_frames <= 0 || n_freq <= 0 || n_fft <= 0 || hop_length <= 0 || win_length <= 0) {
+        for (int i = 0; i < length; ++i) {
+            output[i] = 0.0f;
+        }
+        return;
+    }
+
+    // Length of the overlap-added signal before the center padding is removed
+    const size_t frame_len = static_cast<size_t>(n_fft);
+    const size_t expected_len = frame_len + static_cast<size_t>(hop_length) * (n_frames - 1);
+    const size_t pad_amount = center ? frame_len / 2 : 0;
+    const size_t output_len = (length > 0) ? static_cast<size_t>(length)
+                                           : (expected_len - 2 * pad_amount);
+
+    // Prepare padded window
+    std::vector<float> window_padded(n_fft, 0.0f);
+    if (win_length < n_fft) {
+        int left = (n_fft - win_length) / 2;
+        std::memcpy(window_padded.data() + left, window, win_length * sizeof(float));
+    } else {
+        std::memcpy(window_padded.data(), window, n_fft * sizeof(float));
+    }
+
+    // Step 1: inverse FFT of every frame. Frames are independent, so this part
+    // can run in parallel; the overlap-add below writes to shared samples and
+    // stays serial.
+    std::vector<float> frames_time_domain(static_cast<size_t>(n_frames) * frame_len);
+
+    // irfft reads n_fft/2+1 bins. Bins the caller did not supply stay zero, and
+    // surplus bins are ignored, so a mismatched n_freq cannot read out of bounds.
+    const int n_bins = n_fft / 2 + 1;
+    const int usable_bins = std::min(n_freq, n_bins);
+
+    #ifdef USE_OPENMP
+    #pragma omp parallel for
+    #endif
+    for (int f = 0; f < n_frames; ++f) {
+        std::vector<Complex> fft_in(n_bins);
+
+        // Extract complex spectrum
+        for (int k = 0; k < usable_bins; ++k) {
+            const size_t base = (static_cast<size_t>(k) * n_frames + f) * 2;
+            fft_in[k] = Complex(stft_data[base + 0], stft_data[base + 1]);
+        }
+
+        // IFFT straight into this frame's slot
+        irfft(fft_in.data(), &frames_time_domain[static_cast<size_t>(f) * frame_len], n_fft);
+    }
+
+    // Step 2: Overlap Add (Serial)
+    std::vector<float> y(expected_len, 0.0f);
+    std::vector<float> window_sum(expected_len, 0.0f);
+
+    for (int f = 0; f < n_frames; ++f) {
+        const size_t start = static_cast<size_t>(f) * hop_length;
+        const float* frame_ptr = &frames_time_domain[static_cast<size_t>(f) * frame_len];
+
+        for (size_t i = 0; i < frame_len; ++i) {
+            y[start + i] += frame_ptr[i] * window_padded[i];
+            window_sum[start + i] += window_padded[i] * window_padded[i];
+        }
+    }
+
+    // Normalize by window sum (avoid division by zero)
+    for (size_t i = 0; i < expected_len; ++i) {
+        if (window_sum[i] > 1e-8f) {
+            y[i] /= window_sum[i];
+        }
+    }
+
+    // Remove center padding and copy to output; samples past the reconstructed
+    // signal are zero-filled.
+    for (size_t i = 0; i < output_len; ++i) {
+        output[i] = (pad_amount + i < expected_len) ? y[pad_amount + i] : 0.0f;
+    }
+}
+
+} // namespace stft

+ 268 - 0
src/utils.cpp

@@ -0,0 +1,268 @@
+#include "utils.h"
+#include <ggml-backend.h>
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <filesystem>
+#include <sstream>
+
+namespace fs = std::filesystem;
+
+namespace utils {
+
+// Load a NumPy .npy file (simplified parser - assumes float32, C-order, and a
+// little-endian host, since the length fields and data are read as raw bytes).
+//
+// Returns {data, shape} where data was allocated with new[] (release it with
+// free_npy_data / delete[]). On any failure - unreadable file, bad magic,
+// truncated header or data, missing or malformed shape - an error is written
+// to stderr and {nullptr, {}} is returned.
+std::pair<float*, std::vector<size_t>> load_npy(const std::string& filepath) {
+    std::ifstream file(filepath, std::ios::binary);
+    if (!file) {
+        std::cerr << "Failed to open: " << filepath << std::endl;
+        return {nullptr, {}};
+    }
+
+    // Read magic string
+    char magic[6];
+    file.read(magic, 6);
+    if (!file || std::string(magic, 6) != "\x93NUMPY") {
+        std::cerr << "Invalid NPY file: " << filepath << std::endl;
+        return {nullptr, {}};
+    }
+
+    // Read version
+    uint8_t major = 0, minor = 0;
+    file.read(reinterpret_cast<char*>(&major), 1);
+    file.read(reinterpret_cast<char*>(&minor), 1);
+
+    // Read header length: 16-bit in format version 1, 32-bit afterwards.
+    // It is kept in a 32-bit variable so long headers are not truncated.
+    uint32_t header_len = 0;
+    if (major == 1) {
+        uint16_t header_len_16 = 0;
+        file.read(reinterpret_cast<char*>(&header_len_16), 2);
+        header_len = header_len_16;
+    } else {
+        file.read(reinterpret_cast<char*>(&header_len), 4);
+    }
+    if (!file) {
+        std::cerr << "Invalid NPY file: " << filepath << std::endl;
+        return {nullptr, {}};
+    }
+
+    // Read header
+    std::string header(header_len, ' ');
+    file.read(&header[0], header_len);
+    if (!file) {
+        std::cerr << "Truncated NPY header: " << filepath << std::endl;
+        return {nullptr, {}};
+    }
+
+    // Parse shape from header
+    size_t shape_start = header.find("'shape': (");
+    if (shape_start == std::string::npos) {
+        shape_start = header.find("\"shape\": (");
+    }
+    const size_t shape_end = (shape_start == std::string::npos)
+                                 ? std::string::npos
+                                 : header.find(')', shape_start);
+    if (shape_end == std::string::npos) {
+        std::cerr << "NPY header has no shape: " << filepath << std::endl;
+        return {nullptr, {}};
+    }
+
+    std::vector<size_t> shape;
+    // 10 == length of the "'shape': (" prefix
+    std::istringstream ss(header.substr(shape_start + 10, shape_end - shape_start - 10));
+    std::string token;
+    try {
+        while (std::getline(ss, token, ',')) {
+            // Remove spaces
+            token.erase(std::remove(token.begin(), token.end(), ' '), token.end());
+            if (!token.empty()) {
+                shape.push_back(std::stoull(token));
+            }
+        }
+    } catch (...) {
+        // std::stoull throws on non-numeric or out-of-range tokens
+        std::cerr << "Malformed NPY shape: " << filepath << std::endl;
+        return {nullptr, {}};
+    }
+
+    // Calculate total elements (an empty shape is a scalar: one element)
+    size_t nelements = 1;
+    for (size_t dim : shape) {
+        nelements *= dim;
+    }
+
+    // Read data; a short read means the file does not hold the advertised shape
+    float* data = new float[nelements];
+    file.read(reinterpret_cast<char*>(data), nelements * sizeof(float));
+    if (static_cast<size_t>(file.gcount()) != nelements * sizeof(float)) {
+        std::cerr << "Truncated NPY data: " << filepath << std::endl;
+        delete[] data;
+        return {nullptr, {}};
+    }
+
+    return {data, shape};
+}
+
+// Load every "*.npy" file found directly inside "<debug_dir>/weights".
+// The map key is the file name without its extension; files that load_npy
+// cannot read are left out. Each data pointer is owned by the caller.
+std::map<std::string, std::pair<float*, std::vector<size_t>>> load_all_weights(const std::string& debug_dir) {
+    const std::string weights_dir = debug_dir + "/weights";
+    std::map<std::string, std::pair<float*, std::vector<size_t>>> loaded;
+
+    for (const auto& item : fs::directory_iterator(weights_dir)) {
+        const fs::path& file = item.path();
+        if (file.extension() != ".npy") {
+            continue;
+        }
+
+        const std::pair<float*, std::vector<size_t>> tensor = load_npy(file.string());
+        if (tensor.first == nullptr) {
+            continue;
+        }
+
+        loaded[file.stem().string()] = tensor;
+    }
+
+    return loaded;
+}
+
+// Load "<debug_dir>/activations/<name>.npy" through load_npy.
+std::pair<float*, std::vector<size_t>> load_activation(const std::string& debug_dir, const std::string& name) {
+    std::string npy_path(debug_dir);
+    npy_path += "/activations/";
+    npy_path += name;
+    npy_path += ".npy";
+    return load_npy(npy_path);
+}
+
+// Compare a reference array (NumPy order) against a ggml tensor.
+//
+// Shapes are compared after dropping size-1 dimensions. If they differ, or if
+// `actual` cannot be read as float32 data of the expected size, the result has
+// match == false and all three diff fields set to -1.
+//
+// Otherwise the diff statistics are filled in and match is true when the
+// largest absolute difference is within atol OR the largest relative
+// difference is within rtol. Any NaN difference forces a mismatch and reports
+// infinite diffs.
+TensorComparison compare_tensors(
+    const std::string& name,
+    const float* expected,
+    const std::vector<size_t>& expected_shape,
+    const ggml_tensor* actual,
+    float atol,
+    float rtol
+) {
+    TensorComparison result;
+    result.name = name;
+    result.shape_expected = expected_shape;
+
+    // Extract actual shape: keep dimension 0 and every dimension larger than 1,
+    // then reverse so the order lines up with the NumPy shape.
+    std::vector<size_t> actual_shape;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (actual->ne[i] > 1 || i == 0) {
+            actual_shape.push_back(actual->ne[i]);
+        }
+    }
+    std::reverse(actual_shape.begin(), actual_shape.end());
+    result.shape_actual = actual_shape;
+
+    // Squeezed comparison: ignore all size-1 dimensions on both sides
+    const auto squeeze = [](const std::vector<size_t>& shape) {
+        std::vector<size_t> kept;
+        for (size_t dim : shape) {
+            if (dim > 1) kept.push_back(dim);
+        }
+        return kept;
+    };
+    const bool shape_match = squeeze(expected_shape) == squeeze(actual_shape);
+
+    const size_t nelements = shape_nelements(expected_shape);
+
+    // The values are copied as raw bytes into a float buffer, so the tensor
+    // must be float32 and exactly nelements long; anything else would overrun
+    // the buffer or be misread.
+    bool layout_ok = true;
+    if (shape_match &&
+        (actual->type != GGML_TYPE_F32 || ggml_nbytes(actual) != nelements * sizeof(float))) {
+        std::cerr << "compare_tensors: '" << name << "' is not contiguous float32 data of the expected size\n";
+        layout_ok = false;
+    }
+
+    if (!shape_match || !layout_ok) {
+        result.match = false;
+        result.max_abs_diff = -1;
+        result.mean_abs_diff = -1;
+        result.max_rel_diff = -1;
+        return result;
+    }
+
+    // Safe data access for Backend/CUDA (Copy to CPU first)
+    std::vector<float> actual_data_vec(nelements);
+    ggml_backend_tensor_get(const_cast<ggml_tensor*>(actual), actual_data_vec.data(), 0, ggml_nbytes(actual));
+    const float* actual_data = actual_data_vec.data();
+
+    float max_abs = 0.0f;
+    double sum_abs = 0.0;  // double keeps the running sum accurate for large tensors
+    float max_rel = 0.0f;
+    bool saw_nan = false;
+
+    for (size_t i = 0; i < nelements; ++i) {
+        // Equal values (including equal infinities) differ by exactly 0
+        const float diff = (expected[i] == actual_data[i])
+                               ? 0.0f
+                               : std::abs(expected[i] - actual_data[i]);
+        if (std::isnan(diff)) {
+            // std::max would silently drop a NaN, so record it explicitly
+            saw_nan = true;
+            continue;
+        }
+        max_abs = std::max(max_abs, diff);
+        sum_abs += diff;
+
+        float rel_diff = 0.0f;
+        if (std::abs(expected[i]) > 1e-8) {
+            rel_diff = diff / std::abs(expected[i]);
+        }
+        max_rel = std::max(max_rel, rel_diff);
+    }
+
+    if (saw_nan) {
+        result.max_abs_diff = INFINITY;
+        result.mean_abs_diff = INFINITY;
+        result.max_rel_diff = INFINITY;
+        result.match = false;
+        return result;
+    }
+
+    result.max_abs_diff = max_abs;
+    result.mean_abs_diff = (nelements > 0) ? static_cast<float>(sum_abs / nelements) : 0.0f;
+    result.max_rel_diff = max_rel;
+    result.match = (max_abs <= atol) || (max_rel <= rtol);
+
+    return result;
+}
+
+// Print a comparison report to stdout: name, both shapes, and either the diff
+// statistics with a match verdict or a shape-mismatch notice when
+// max_abs_diff is not >= 0. `verbose` is currently unused.
+void print_comparison(const TensorComparison& cmp, bool verbose) {
+    // Writes `opening` followed by the comma-separated dims and a closing "]"
+    const auto emit_shape = [](const char* opening, const std::vector<size_t>& dims) {
+        std::cout << opening;
+        for (size_t i = 0; i < dims.size(); ++i) {
+            std::cout << dims[i];
+            if (i + 1 < dims.size()) std::cout << ", ";
+        }
+        std::cout << "]" << std::endl;
+    };
+
+    std::cout << "\n[Comparison] " << cmp.name << std::endl;
+
+    emit_shape("  Expected shape: [", cmp.shape_expected);
+    emit_shape("  Actual shape:   [", cmp.shape_actual);
+
+    // The negated form keeps the original outcome for a NaN max_abs_diff
+    if (!(cmp.max_abs_diff >= 0)) {
+        std::cout << "  Status:        ✗ SHAPE MISMATCH" << std::endl;
+        return;
+    }
+
+    std::cout << "  Max abs diff:  " << cmp.max_abs_diff << std::endl;
+    std::cout << "  Mean abs diff: " << cmp.mean_abs_diff << std::endl;
+    std::cout << "  Max rel diff:  " << cmp.max_rel_diff << std::endl;
+    const char* verdict = cmp.match ? "✓ MATCH" : "✗ MISMATCH";
+    std::cout << "  Status:        " << verdict << std::endl;
+}
+
+// Create an F32 ggml tensor in `ctx` and copy `data` into it.
+//
+// `shape` is in NumPy order and is reversed into ggml's dimension order. A
+// rank-0 (scalar) shape becomes a one-element 1-D tensor.
+//
+// Returns nullptr, with a message on stderr, when the rank exceeds
+// GGML_MAX_DIMS or when the new tensor has no data buffer to copy into
+// (presumably a no_alloc context - confirm with the caller).
+ggml_tensor* create_tensor_from_numpy(
+    ggml_context* ctx,
+    const float* data,
+    const std::vector<size_t>& shape
+) {
+    if (shape.size() > GGML_MAX_DIMS) {
+        // Reversing a longer shape would write outside ne[]
+        std::cerr << "create_tensor_from_numpy: rank " << shape.size()
+                  << " exceeds GGML_MAX_DIMS\n";
+        return nullptr;
+    }
+
+    // GGML uses reversed dimension order
+    int64_t ne[GGML_MAX_DIMS] = {1, 1, 1, 1};
+    for (size_t i = 0; i < shape.size(); ++i) {
+        ne[shape.size() - 1 - i] = static_cast<int64_t>(shape[i]);
+    }
+
+    // A scalar has rank 0; pass at least one dimension
+    const int n_dims = shape.empty() ? 1 : static_cast<int>(shape.size());
+
+    ggml_tensor* tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, n_dims, ne);
+    if (!tensor || !tensor->data) {
+        std::cerr << "create_tensor_from_numpy: tensor has no data buffer\n";
+        return nullptr;
+    }
+
+    memcpy(tensor->data, data, shape_nelements(shape) * sizeof(float));
+
+    return tensor;
+}
+
+// Product of all dimensions in `shape`; an empty shape yields 1.
+size_t shape_nelements(const std::vector<size_t>& shape) {
+    size_t product = 1;
+    for (std::vector<size_t>::const_iterator it = shape.begin(); it != shape.end(); ++it) {
+        product *= *it;
+    }
+    return product;
+}
+
+// Print "<name> shape: [d3, d2, d1, d0]" to stdout, outermost ggml dimension
+// first. Dimensions of size 1 are left out, except dimension 0, which is
+// always printed last.
+void print_tensor_shape(const std::string& name, const ggml_tensor* tensor) {
+    std::cout << name << " shape: [";
+
+    // Higher dimensions: printed only when larger than 1, each followed by ", "
+    int axis = GGML_MAX_DIMS;
+    while (axis-- > 1) {
+        if (tensor->ne[axis] <= 1) {
+            continue;
+        }
+        std::cout << tensor->ne[axis] << ", ";
+    }
+
+    // Dimension 0 closes the list unconditionally
+    std::cout << tensor->ne[0];
+    std::cout << "]" << std::endl;
+}
+
+// Release an array allocated with new[] (as returned by load_npy).
+// A null pointer is accepted and ignored.
+void free_npy_data(float* data) {
+    if (data == nullptr) {
+        return;
+    }
+    delete[] data;
+}
+
+} // namespace utils

+ 82 - 0
src/utils.h

@@ -0,0 +1,82 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <map>
+#include <cstdint>
+#include <ggml.h>
+
+// Tensor comparison result.
+// Returned by utils::compare_tensors() and printed by
+// utils::print_comparison().
+struct TensorComparison {
+    // Label of the compared tensor.
+    std::string name;
+    // Overall verdict; print_comparison() shows it as MATCH / MISMATCH.
+    bool match;
+    // Maximum absolute difference. print_comparison() treats a negative
+    // value as "shape mismatch" and prints no statistics in that case.
+    float max_abs_diff;
+    // Mean absolute difference.
+    float mean_abs_diff;
+    // Maximum relative difference.
+    float max_rel_diff;
+    // Shape of the reference data, printed in stored order.
+    std::vector<size_t> shape_expected;
+    // Shape of the computed tensor, printed in stored order.
+    std::vector<size_t> shape_actual;
+};
+
+// Utility functions
+namespace utils {
+
+/**
+ * Load a NumPy .npy file into memory as float data.
+ *
+ * @param filepath Path of the .npy file.
+ * @return (data pointer, shape). The caller owns the buffer and must free it;
+ *         the test suite releases it with free_npy_data() and treats a null
+ *         pointer as "could not be loaded".
+ */
+std::pair<float*, std::vector<size_t>> load_npy(const std::string& filepath);
+
+/**
+ * Load all weights from the debug_tensors/weights/ directory.
+ *
+ * @param debug_dir Base directory of the exported debug tensors.
+ * @return Map of tensor name -> (data pointer, shape).
+ *         NOTE(review): each pointer presumably has to be released by the
+ *         caller like a load_npy() result - confirm against the implementation.
+ */
+std::map<std::string, std::pair<float*, std::vector<size_t>>> load_all_weights(const std::string& debug_dir);
+
+/**
+ * Load one activation tensor from the activations/ subdirectory.
+ *
+ * @param debug_dir Base directory that contains activations/.
+ * @param name      Tensor name, without the .npy extension.
+ * @return (data pointer, shape), with the same ownership rules as load_npy().
+ */
+std::pair<float*, std::vector<size_t>> load_activation(const std::string& debug_dir, const std::string& name);
+
+/**
+ * Compare two tensors (expected from numpy, actual from ggml).
+ *
+ * @param name           Label stored in the result.
+ * @param expected       Reference values.
+ * @param expected_shape Shape of the reference values.
+ * @param actual         Computed ggml tensor.
+ * @param atol           Absolute tolerance (default 1e-4).
+ * @param rtol           Relative tolerance (default 1e-3).
+ * @return Comparison result with detailed statistics.
+ */
+TensorComparison compare_tensors(
+    const std::string& name,
+    const float* expected,
+    const std::vector<size_t>& expected_shape,
+    const ggml_tensor* actual,
+    float atol = 1e-4,
+    float rtol = 1e-3
+);
+
+/**
+ * Print a comparison result to stdout: both shapes, then either the
+ * difference statistics or a shape-mismatch status.
+ */
+void print_comparison(const TensorComparison& cmp, bool verbose = false);
+
+/**
+ * Create an F32 ggml tensor from numpy data.
+ *
+ * `shape` is in NumPy order; the dimensions are reversed for ggml and the
+ * values are copied into the new tensor.
+ */
+ggml_tensor* create_tensor_from_numpy(
+    ggml_context* ctx,
+    const float* data,
+    const std::vector<size_t>& shape
+);
+
+/**
+ * Get the total number of elements in a shape (product of all dimensions;
+ * 1 for an empty shape).
+ */
+size_t shape_nelements(const std::vector<size_t>& shape);
+
+/**
+ * Print a tensor's shape to stdout for debugging, outermost dimension first.
+ */
+void print_tensor_shape(const std::string& name, const ggml_tensor* tensor);
+
+/**
+ * Free a numpy data pointer with delete[].
+ */
+void free_npy_data(float* data);
+
+} // namespace utils

+ 47 - 0
tests/CMakeLists.txt

@@ -0,0 +1,47 @@
+# tests/CMakeLists.txt
+# Test suite for MelBandRoformer
+
+#================================================
+# Test Infrastructure
+#================================================
+
+# Static helper library linked by the test executables. Its PUBLIC properties
+# pass the core library and this directory's headers on to every test.
+add_library(test_common STATIC test_common.cpp)
+target_include_directories(test_common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(test_common PUBLIC mel_band_roformer)
+
+#================================================
+# Test Registration
+#================================================
+
+# Helper: register <name>.cpp from this directory as an executable and as a
+# CTest test of the same name, linked against test_common.
+# A missing source file is tolerated and the test is left out, but the skip
+# is reported at configure time.
+function(mbr_add_test name)
+    if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${name}.cpp")
+        # Say so explicitly: otherwise a typo in the test name silently
+        # removes the test from the suite.
+        message(STATUS "mbr_add_test: ${name}.cpp not found, skipping test '${name}'")
+        return()
+    endif()
+
+    add_executable(${name} ${name}.cpp)
+    target_link_libraries(${name} PRIVATE test_common)
+    add_test(NAME ${name} COMMAND ${name})
+
+    # Copy DLLs on Windows
+    mbr_copy_ggml_dlls(${name})
+endfunction()
+
+# Core tests (no external data required)
+# test_audio is registered by hand instead of through mbr_add_test() because
+# it also compiles ../src/audio.cpp and needs extra include directories.
+add_executable(test_audio test_audio.cpp ../src/audio.cpp)
+target_link_libraries(test_audio PRIVATE test_common)
+target_include_directories(test_audio PRIVATE ../src ../third_party)
+add_test(NAME test_audio COMMAND test_audio)
+mbr_copy_ggml_dlls(test_audio)
+mbr_add_test(test_component_stft)
+
+# Component tests (require model + test data)
+mbr_add_test(test_component_bandsplit)
+mbr_add_test(test_component_layers)
+mbr_add_test(test_component_mask)
+
+# Integration tests
+mbr_add_test(test_inference)
+mbr_add_test(test_chunking_logic)

+ 128 - 0
tests/README.md

@@ -0,0 +1,128 @@
+# MelBandRoformer Tests
+
+This directory contains the test suite for the MelBandRoformer C++ implementation.
+
+## Test Overview
+
+| Test Name | Description | Requires External Data |
+|-----------|-------------|------------------------|
+| `test_audio` | Audio I/O functionality | ❌ |
+| `test_component_stft` | STFT/ISTFT component verification | ❌ |
+| `test_component_bandsplit` | BandSplit layer verification | ✅ |
+| `test_component_layers` | Transformer layers verification | ✅ |
+| `test_component_mask` | MaskEstimator verification | ✅ |
+| `test_inference` | End-to-end inference verification | ✅ |
+| `test_chunking_logic` | Chunking/overlap-add logic verification | ✅ |
+
+## Quick Start
+
+### 1. Build Tests
+
+```powershell
+# Configure with tests enabled
+cmake -B build -DGGML_CUDA=ON -DMBR_BUILD_TESTS=ON
+
+# Build
+cmake --build build --config Release --parallel 14
+```
+
+### 2. Generate Test Data
+
+First, clone the original PyTorch inference code repository:
+
+```bash
+git clone https://github.com/ZFTurbo/Music-Source-Separation-Training.git
+```
+
+Then use the script to generate test data:
+
+```powershell
+python scripts/generate_test_data.py `
+    --model-repo "path/to/Music-Source-Separation-Training" `
+    --audio "test_segment.wav" `
+    --checkpoint "MelBandRoformer.ckpt" `
+    --output "test_data"
+```
+
+> **Note:** 
+> - `MelBandRoformer.ckpt` is the original PyTorch model weights file.
+> - By default, the script extracts audio from **2.0s to 5.0s**. Use `--start` and `--end` to select a different range.
+
+### 3. Run Tests
+
+Set environment variables and run:
+
+```powershell
+# Set environment variables
+$env:MBR_MODEL_PATH = "path/to/model.gguf"
+$env:MBR_TEST_DATA_DIR = "path/to/test_data"
+
+# Run all tests
+ctest --test-dir build -C Release
+
+# Run specific test
+ctest --test-dir build -C Release -R test_inference
+
+# Show the output of failing tests
+ctest --test-dir build -C Release --output-on-failure
+```
+
+## Environment Variables
+
+| Variable | Description | Example |
+|----------|-------------|---------|
+| `MBR_MODEL_PATH` | Path to GGUF model file | `models/MelBandRoformer-228M-Vocals-v1-FP16.gguf` |
+| `MBR_TEST_DATA_DIR` | Test data directory (containing `activations/` subdirectory) | `test_data` |
+| `MBR_TEST_ATOL` | Absolute tolerance (optional) | `0.01` |
+| `MBR_TEST_RTOL` | Relative tolerance (optional) | `0.01` |
+
+## Test Data Structure
+
+```
+test_data/
+├── chunk_in.npy              # Chunking test input
+├── chunk_out.npy             # Chunking test output
+└── activations/
+    ├── input_audio.npy       # Input audio [1, 2, N]
+    ├── output_audio.npy      # Output audio [1, 2, N]
+    ├── band_split_in.npy     # BandSplit input
+    ├── after_band_split.npy  # BandSplit output
+    ├── before_mask_est.npy   # Transformer output
+    └── mask_est0.npy         # MaskEstimator output
+```
+
+## Verification Standards
+
+| Model Type | Expected Max Abs Diff | Expected Mean Abs Diff |
+|------------|----------------------|------------------------|
+| FP32 | < 1e-4 | < 1e-5 |
+| FP16 | < 5e-4 | < 5e-5 |
+| Q8_0 | < 5e-3 | < 5e-4 |
+| Q5_x | < 2e-2 | < 3e-3 |
+| Q4_x | < 5e-2 | < 5e-3 |
+
+## Adding New Tests
+
+1. Create `test_xxx.cpp` in `tests/` directory
+2. Use utilities from `test_common.h`
+3. Add to `tests/CMakeLists.txt`:
+   ```cmake
+   mbr_add_test(test_xxx)
+   ```
+
+## Troubleshooting
+
+### Test fails: Model file not found
+
+Ensure `MBR_MODEL_PATH` points to a valid `.gguf` file.
+
+### Test fails: Test data not found
+
+Ensure `MBR_TEST_DATA_DIR` points to a directory containing the `activations/` subdirectory.
+
+### Numerical mismatch
+
+For quantized models, relax the tolerance:
+```powershell
+$env:MBR_TEST_ATOL = "0.05"
+```

+ 76 - 0
tests/test_audio.cpp

@@ -0,0 +1,76 @@
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include "mel_band_roformer/audio.h"
+
+/**
+ * Round-trip test for audio I/O: generate a 1-second stereo sine tone, save
+ * it with AudioFile::Save, load it back with AudioFile::Load and compare.
+ *
+ * Returns 0 on success and 1 on any I/O error or mismatch.
+ */
+int main() {
+    std::cout << "Test: Audio I/O with dr_wav" << std::endl;
+
+    const std::string test_file = "test_tone.wav";
+    const int sample_rate = 44100;
+    const int duration_sec = 1;
+    const int channels = 2;
+    const int total_samples = sample_rate * duration_sec * channels;
+
+    // 1. Generate Stereo Sine Wave (440Hz Left, 880Hz Right)
+    AudioBuffer gen_buffer;
+    gen_buffer.channels = channels;
+    gen_buffer.sampleRate = sample_rate;
+    gen_buffer.samples = total_samples;
+    gen_buffer.data.resize(total_samples);
+
+    for (int i = 0; i < sample_rate * duration_sec; ++i) {
+        float t = (float)i / sample_rate;
+        float val_left = std::sin(2.0f * 3.14159f * 440.0f * t);
+        float val_right = std::sin(2.0f * 3.14159f * 880.0f * t);
+
+        // Interleaved layout: left sample followed by right sample
+        gen_buffer.data[i * channels + 0] = val_left;
+        gen_buffer.data[i * channels + 1] = val_right;
+    }
+
+    // 2. Write to File
+    std::cout << "Writing " << test_file << "..." << std::endl;
+    try {
+        AudioFile::Save(test_file, gen_buffer);
+    } catch (const std::exception& e) {
+        std::cerr << "Error writing: " << e.what() << std::endl;
+        return 1;
+    }
+
+    // 3. Read Back
+    std::cout << "Reading " << test_file << "..." << std::endl;
+    AudioBuffer read_buffer;
+    try {
+        read_buffer = AudioFile::Load(test_file);
+    } catch (const std::exception& e) {
+        std::cerr << "Error reading: " << e.what() << std::endl;
+        return 1;
+    }
+
+    // 4. Verify
+    if (read_buffer.sampleRate != sample_rate) {
+        std::cerr << "FAILED: Sample rate mismatch " << read_buffer.sampleRate << " != " << sample_rate << std::endl;
+        return 1;
+    }
+
+    if (read_buffer.channels != channels) {
+         std::cerr << "FAILED: Channel count mismatch " << read_buffer.channels << " != " << channels << std::endl;
+         return 1;
+    }
+
+    // Without this check an empty or truncated read would pass vacuously,
+    // and a longer one would index past the end of gen_buffer.data below.
+    if (read_buffer.data.size() != gen_buffer.data.size()) {
+        std::cerr << "FAILED: Sample count mismatch " << read_buffer.data.size() << " != " << gen_buffer.data.size() << std::endl;
+        return 1;
+    }
+
+    float max_diff = 0.0f;
+    for (size_t i = 0; i < read_buffer.data.size(); ++i) {
+        float diff = std::abs(read_buffer.data[i] - gen_buffer.data[i]);
+        if (diff > max_diff) max_diff = diff;
+    }
+
+    std::cout << "Max diff: " << max_diff << std::endl;
+
+    if (max_diff > 1e-4) {
+        std::cerr << "FAILED: Data mismatch (diff > 1e-4)" << std::endl;
+        return 1;
+    }
+
+    std::cout << "PASSED: Audio I/O Verified." << std::endl;
+    return 0;
+}

+ 97 - 0
tests/test_chunking_logic.cpp

@@ -0,0 +1,97 @@
+#include "test_common.h"
+#include "mel_band_roformer/inference.h"
+#include <cstring>
+
+// We need to test the static helper OR the pipeline.
+// Check if Inference::ProcessOverlapAdd is still available and public.
+// Yes, it is in inference.h and implemented in inference.cpp.
+
+/**
+ * Verifies Inference::ProcessOverlapAdd against golden data using an identity
+ * "model", so only the chunking / overlap-add logic is exercised.
+ *
+ * Inputs are <data_dir>/chunk_in.npy and <data_dir>/chunk_out.npy, where
+ * data_dir comes from GetTestDataDir(); argv[1] / argv[2] override the two
+ * paths. If a file is not found there, <data_dir>/activations/ is tried as an
+ * alternative location. When the data is unavailable the test is skipped
+ * (exit code 0).
+ *
+ * Returns 0 on pass or skip, 1 on failure.
+ */
+int main(int argc, char* argv[]) {
+    std::cout << "Test: Chunking Logic (Overlap-Add) Verification" << std::endl;
+
+    std::string data_dir = GetTestDataDir();
+
+    // The chunking files are loaded with load_npy directly because they are
+    // not expected to live in the activations/ subdirectory.
+    std::string in_path = data_dir + "/chunk_in.npy";
+    std::string out_path = data_dir + "/chunk_out.npy";
+
+    if (argc > 1) in_path = argv[1];
+    if (argc > 2) out_path = argv[2];
+
+    auto [in_ptr, in_shape] = utils::load_npy(in_path);
+    if (!in_ptr) {
+         // Try checking if it's in the 'activations' subdir (legacy/alternative structure)
+         std::string alt_in = data_dir + "/activations/chunk_in.npy";
+         auto res = utils::load_npy(alt_in);
+         if (res.first) {
+             in_ptr = res.first; in_shape = res.second;
+             in_path = alt_in;
+         }
+    }
+
+    if (!in_ptr) {
+        // Just print absolute path hint for debugging
+        std::cout << "[SKIP] chunk_in.npy not found in " << data_dir << " or " << in_path << std::endl;
+        return 0;
+    }
+
+    auto [out_ptr, out_shape] = utils::load_npy(out_path);
+    if (!out_ptr) {
+         std::string alt_out = data_dir + "/activations/chunk_out.npy";
+         auto res = utils::load_npy(alt_out);
+         if (res.first) {
+             out_ptr = res.first; out_shape = res.second;
+         }
+    }
+
+    if (!out_ptr) {
+         std::cout << "[SKIP] chunk_out.npy not found" << std::endl;
+         utils::free_npy_data(in_ptr);
+         return 0;
+    }
+
+    // Copy into vectors so the raw buffers can be released right away
+    std::vector<float> input_vec(utils::shape_nelements(in_shape));
+    std::memcpy(input_vec.data(), in_ptr, input_vec.size()*sizeof(float));
+    utils::free_npy_data(in_ptr);
+
+    // Expected
+    std::vector<float> expected_vec(utils::shape_nelements(out_shape));
+    std::memcpy(expected_vec.data(), out_ptr, expected_vec.size()*sizeof(float));
+    utils::free_npy_data(out_ptr);
+
+    // Run Logic
+    int chunk_size = 352800;
+    int num_overlap = 2;
+
+    std::cout << "  Input size: " << input_vec.size() << std::endl;
+
+    // Identity Model
+    auto identity = [](const std::vector<float>& chunk) { return std::vector<std::vector<float>>{chunk}; };
+
+    // We test the STATIC legacy method because we can't easily mock the pipeline
+    // inside Inference class without refactoring it to accept an abstract Model interface.
+    auto actual_stems = Inference::ProcessOverlapAdd(input_vec, chunk_size, num_overlap, identity);
+
+    // Indexing stem 0 of an empty result would be undefined behaviour
+    if (actual_stems.empty()) {
+        std::cerr << "  ProcessOverlapAdd returned no stems" << std::endl;
+        LOG_FAIL();
+        return 1;
+    }
+    const std::vector<float>& actual = actual_stems[0];
+
+    bool pass = CompareAndReport("OverlapAdd Logic",
+                                  expected_vec.data(), expected_vec.size(),
+                                  actual.data(), actual.size(),
+                                  1e-4f, 1e-4f);
+
+    if (pass) LOG_PASS(); else LOG_FAIL();
+    return pass ? 0 : 1;
+}

+ 145 - 0
tests/test_common.cpp

@@ -0,0 +1,145 @@
+#include "test_common.h"
+
+//======================================================
+// TestContext
+//======================================================
+// Creates a metadata-only ggml context (no_alloc = true) of `mem_size` bytes
+// and an empty compute graph for `m`. The process exits with status 1 if
+// the model is null or the context cannot be created.
+TestContext::TestContext(MelBandRoformer* m, size_t mem_size) : model(m) {
+    if (!model) {
+        std::cerr << "FATAL: Model is null in TestContext" << std::endl;
+        exit(1);
+    }
+
+    struct ggml_init_params ctx_params = {
+        /*.mem_size   = */ mem_size,
+        /*.mem_buffer = */ nullptr,
+        /*.no_alloc   = */ true,
+    };
+    ctx = ggml_init(ctx_params);
+    if (!ctx) {
+        // Building a graph on a null context would crash further down
+        std::cerr << "FATAL: ggml_init failed in TestContext" << std::endl;
+        exit(1);
+    }
+    gf = ggml_new_graph_custom(ctx, 16384, false); // Sufficiently large graph
+}
+
+// Frees the graph allocator (if one was created) and then the ggml context.
+TestContext::~TestContext() {
+    if (allocr != nullptr) {
+        ggml_gallocr_free(allocr);
+    }
+    if (ctx != nullptr) {
+        ggml_free(ctx);
+    }
+}
+
+// Allocates memory for the graph `gf`, creating the graph allocator on the
+// model backend's default buffer type on first use.
+// Returns the result of ggml_gallocr_alloc_graph().
+bool TestContext::AllocateGraph() {
+    if (allocr == nullptr) {
+        auto buffer_type = ggml_backend_get_default_buffer_type(model->GetBackend());
+        allocr = ggml_gallocr_new(buffer_type);
+    }
+    const bool allocated = ggml_gallocr_alloc_graph(allocr, gf);
+    return allocated;
+}
+
+// Runs the graph `gf` on the model's backend. A failed computation is
+// reported on stderr so a later mismatch is not mistaken for a numerical
+// problem; the signature stays void, so callers are unaffected.
+void TestContext::Compute() {
+    const auto status = ggml_backend_graph_compute(model->GetBackend(), gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        std::cerr << "ERROR: ggml_backend_graph_compute failed with status "
+                  << static_cast<int>(status) << std::endl;
+    }
+}
+
+// Copies the contents of an F32 tensor from its backend buffer into a host
+// vector holding ggml_nelements(t) floats.
+// Returns an empty vector (and logs to stderr) for a null or non-F32 tensor,
+// whose bytes cannot be interpreted as one float per element.
+std::vector<float> TestContext::ReadTensor(ggml_tensor* t) {
+    if (t == nullptr || t->type != GGML_TYPE_F32) {
+        std::cerr << "ERROR: ReadTensor expects a non-null F32 tensor" << std::endl;
+        return {};
+    }
+
+    std::vector<float> buffer(static_cast<size_t>(ggml_nelements(t)));
+    // Size the read from the destination buffer so it can never be overrun
+    ggml_backend_tensor_get(t, buffer.data(), 0, buffer.size() * sizeof(float));
+    return buffer;
+}
+
+//======================================================
+// GoldenTensor
+//======================================================
+// Loads activation `n` from `dir` via utils::load_activation(); `data` keeps
+// whatever pointer the loader returned (valid() reports whether it is null).
+GoldenTensor::GoldenTensor(const std::string& dir, const std::string& n) : name(n) {
+    auto loaded = utils::load_activation(dir, name);
+    data = loaded.first;
+    shape = std::move(loaded.second);
+}
+
+// Releases the owned buffer, if any.
+GoldenTensor::~GoldenTensor() {
+    if (data == nullptr) {
+        return;
+    }
+    utils::free_npy_data(data);
+    data = nullptr;
+}
+
+// Move constructor: takes over the buffer and leaves `o` without one.
+GoldenTensor::GoldenTensor(GoldenTensor&& o) noexcept
+    : data(std::exchange(o.data, nullptr)),
+      shape(std::move(o.shape)),
+      name(std::move(o.name)) {}
+
+// Move assignment: frees the current buffer, then takes over the one held by
+// `o`. Self-assignment is a no-op.
+GoldenTensor& GoldenTensor::operator=(GoldenTensor&& o) noexcept {
+    if (this == &o) {
+        return *this;
+    }
+    if (data != nullptr) {
+        utils::free_npy_data(data);
+    }
+    data = std::exchange(o.data, nullptr);
+    shape = std::move(o.shape);
+    name = std::move(o.name);
+    return *this;
+}
+
+// Number of elements described by `shape`; 0 when the shape is empty.
+size_t GoldenTensor::nelements() const {
+    size_t count = shape.empty() ? 0 : 1;
+    for (size_t k = 0; k < shape.size(); ++k) {
+        count *= shape[k];
+    }
+    return count;
+}
+
+// Prints "<prefix><name> shape: [d0, d1, ...]" to stdout.
+void GoldenTensor::PrintShape(const std::string& prefix) const {
+    std::cout << prefix << name << " shape: [";
+    const char* separator = "";
+    for (size_t dim : shape) {
+        std::cout << separator << dim;
+        separator = ", ";
+    }
+    std::cout << "]" << std::endl;
+}
+
+//======================================================
+// Helper
+//======================================================
+/**
+ * Compare `actual` against `expected` element-wise and print a short report.
+ *
+ * @param name       Label printed in the report.
+ * @param expected   Reference values (n_expected elements).
+ * @param actual     Computed values (n_actual elements).
+ * @param atol       Limit for the maximum absolute difference; a negative
+ *                   value selects GetToleranceAtol().
+ * @param rtol       Limit for the maximum relative difference; a negative
+ *                   value selects GetToleranceRtol().
+ * @return true when the sizes agree, every value is finite, and the maximum
+ *         absolute difference is within atol OR the maximum relative
+ *         difference is within rtol.
+ */
+bool CompareAndReport(
+    const std::string& name,
+    const float* expected, size_t n_expected,
+    const float* actual, size_t n_actual,
+    float atol,
+    float rtol
+) {
+    std::cout << "[Compare] " << name << std::endl;
+
+    if (n_expected != n_actual) {
+        std::cerr << "  SIZE MISMATCH: Expected " << n_expected << ", Actual " << n_actual << std::endl;
+        return false;
+    }
+
+    if (n_expected > 0 && (expected == nullptr || actual == nullptr)) {
+        std::cerr << "  NULL DATA: cannot compare " << n_expected << " elements" << std::endl;
+        return false;
+    }
+
+    // Resolve tolerances
+    if (atol < 0) atol = GetToleranceAtol();
+    if (rtol < 0) rtol = GetToleranceRtol();
+
+    float max_diff = 0.0f;
+    double sum_diff = 0.0;  // double keeps the mean accurate for large tensors
+    float max_rel_diff = 0.0f;
+    size_t n_nonfinite = 0;
+
+    for (size_t i = 0; i < n_expected; ++i) {
+        // A NaN difference never wins std::max, so NaN/Inf output would
+        // otherwise be ignored and the comparison would pass.
+        if (!std::isfinite(expected[i]) || !std::isfinite(actual[i])) {
+            ++n_nonfinite;
+            continue;
+        }
+
+        float diff = std::abs(expected[i] - actual[i]);
+        max_diff = std::max(max_diff, diff);
+        sum_diff += diff;
+
+        // Skip near-zero references to avoid meaningless relative errors
+        if (std::abs(expected[i]) > 1e-8f) {
+            float rel = diff / std::abs(expected[i]);
+            max_rel_diff = std::max(max_rel_diff, rel);
+        }
+    }
+
+    float mean_diff = n_expected > 0 ? static_cast<float>(sum_diff / n_expected) : 0.0f;
+
+    std::cout << "  max_diff: " << max_diff << " (limit " << atol << ")" << std::endl;
+    std::cout << "  mean_diff: " << mean_diff << std::endl;
+    std::cout << "  max_rel_diff: " << max_rel_diff << " (limit " << rtol << ")" << std::endl;
+    if (n_nonfinite > 0) {
+        std::cout << "  non-finite values: " << n_nonfinite << std::endl;
+    }
+
+    bool match = (n_nonfinite == 0) && ((max_diff <= atol) || (max_rel_diff <= rtol));
+
+    if (match) {
+        std::cout << "  ✓ OK" << std::endl;
+    } else {
+        std::cout << "  ✗ MISMATCH" << std::endl;
+    }
+
+    return match;
+}

+ 126 - 0
tests/test_common.h

@@ -0,0 +1,126 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <algorithm>
+#include <ggml.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+#include "../src/model.h"
+#include "../src/utils.h"
+
+//======================================================
+// Configuration lookup
+//======================================================
+// Test data directory: $MBR_TEST_DATA_DIR, or "." when the variable is unset.
+inline std::string GetTestDataDir() {
+    if (const char* dir = std::getenv("MBR_TEST_DATA_DIR")) {
+        return dir;
+    }
+    return ".";
+}
+
+// Model file: $MBR_MODEL_PATH, or "mel_band_roformer.gguf" when unset.
+inline std::string GetModelPath() {
+    if (const char* path = std::getenv("MBR_MODEL_PATH")) {
+        return path;
+    }
+    return "mel_band_roformer.gguf";
+}
+
+// Absolute tolerance: $MBR_TEST_ATOL parsed as a float, default 1e-3.
+// An unset, unparsable, negative or non-finite value falls back to the
+// default instead of throwing out of the test (as std::stof would).
+inline float GetToleranceAtol() {
+    const float fallback = 1e-3f;
+    const char* env = std::getenv("MBR_TEST_ATOL");
+    if (!env) return fallback;
+
+    char* end = nullptr;
+    const float value = std::strtof(env, &end);
+    // end == env means no characters were converted
+    if (end == env || !std::isfinite(value) || value < 0.0f) return fallback;
+    return value;
+}
+
+// Relative tolerance: $MBR_TEST_RTOL parsed as a float, default 1e-2.
+// An unset, unparsable, negative or non-finite value falls back to the
+// default instead of throwing out of the test (as std::stof would).
+inline float GetToleranceRtol() {
+    const float fallback = 1e-2f;
+    const char* env = std::getenv("MBR_TEST_RTOL");
+    if (!env) return fallback;
+
+    char* end = nullptr;
+    const float value = std::strtof(env, &end);
+    // end == env means no characters were converted
+    if (end == env || !std::isfinite(value) || value < 0.0f) return fallback;
+    return value;
+}
+
+//======================================================
+// RAII test context (TestContext)
+//======================================================
+// Owns a ggml context, a compute graph and (once AllocateGraph() has run) a
+// graph allocator for a single test. The model is borrowed, not owned.
+struct TestContext {
+    ggml_context* ctx = nullptr;
+    ggml_cgraph* gf = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    MelBandRoformer* model = nullptr;
+
+    // Initialize the context and the graph
+    TestContext(MelBandRoformer* m, size_t mem_size = 512 * 1024 * 1024);
+
+    // The destructor releases the owned resources
+    ~TestContext();
+
+    // Non-copyable: a copy would free the same context and allocator twice
+    TestContext(const TestContext&) = delete;
+    TestContext& operator=(const TestContext&) = delete;
+
+    // Allocate memory for the graph (VRAM/RAM)
+    bool AllocateGraph();
+
+    // Run the computation
+    void Compute();
+
+    // Read tensor data back into host memory (handles the GPU->CPU copy)
+    std::vector<float> ReadTensor(ggml_tensor* t);
+};
+
+//======================================================
+// RAII golden data loader
+//======================================================
+// Move-only owner of one reference tensor loaded from disk.
+struct GoldenTensor {
+    float* data = nullptr;
+    std::vector<size_t> shape;
+    std::string name;
+
+    // --- construction / destruction ---
+    GoldenTensor() = default;
+
+    // Load from dir/activations/{name}.npy
+    GoldenTensor(const std::string& dir, const std::string& name);
+
+    ~GoldenTensor();
+
+    // --- move-only semantics ---
+    // Moving is allowed
+    GoldenTensor(GoldenTensor&& o) noexcept;
+    GoldenTensor& operator=(GoldenTensor&& o) noexcept;
+
+    // Copying is forbidden
+    GoldenTensor(const GoldenTensor&) = delete;
+    GoldenTensor& operator=(const GoldenTensor&) = delete;
+
+    // --- queries ---
+    size_t nelements() const;
+    bool valid() const { return data != nullptr; }
+
+    // Print the shape
+    void PrintShape(const std::string& prefix = "") const;
+};
+
+//======================================================
+// Assertion macros
+//======================================================
+// If `cond` is false, print `msg` and the source location to stderr and
+// `return 1` from the enclosing function, so it is only usable inside
+// functions that return int (such as main). `msg` is streamed with
+// operator<<.
+#define TEST_ASSERT(cond, msg) \
+    do { \
+        if (!(cond)) { \
+            std::cerr << "\n[ASSERT FAILED] " << msg << std::endl; \
+            std::cerr << "  File: " << __FILE__ << ":" << __LINE__ << std::endl; \
+            return 1; \
+        } \
+    } while(0)
+
+// Assert that `tensor` (anything with a valid() member) was loaded.
+// `name` must be a string literal: it is concatenated with the neighbouring
+// literals. The message reports GetTestDataDir(), which can differ from the
+// directory actually used when a test overrides it on the command line.
+#define TEST_ASSERT_LOAD(tensor, name) \
+    TEST_ASSERT((tensor).valid(), "Failed to load " name ".npy from " + GetTestDataDir())
+
+//======================================================
+// Helper functions
+//======================================================
+
+// Compare the results and print a report.
+// Returns false when the element counts differ or the differences exceed
+// the tolerances. A negative atol / rtol selects GetToleranceAtol() /
+// GetToleranceRtol().
+bool CompareAndReport(
+    const std::string& name,
+    const float* expected, size_t n_expected,
+    const float* actual, size_t n_actual,
+    float atol = -1.0f, // < 0 means use default/env
+    float rtol = -1.0f
+);
+
+// Logging macros
+// LOG_STEP prints "[step/total] msg" on a new line; LOG_PASS / LOG_FAIL print
+// the final verdict. All three write to stdout.
+#define LOG_STEP(step, total, msg) \
+    std::cout << "\n[" << step << "/" << total << "] " << msg << std::endl
+
+#define LOG_PASS() std::cout << "\n✓ PASSED" << std::endl
+#define LOG_FAIL() std::cout << "\n✗ FAILED" << std::endl

+ 92 - 0
tests/test_component_bandsplit.cpp

@@ -0,0 +1,92 @@
+#include "test_common.h"
+
+/**
+ * Verifies the BandSplit graph against golden tensors.
+ *
+ * Usage: test_component_bandsplit [model_path] [data_dir]
+ * Defaults come from GetModelPath() / GetTestDataDir().
+ *
+ * Feeds band_split_in.npy through MelBandRoformer::BuildBandSplitGraph and
+ * compares the result with after_band_split.npy.
+ * Returns 0 on pass, 1 on failure.
+ */
+int main(int argc, char* argv[]) {
+    std::cout << "Test: BandSplit Component Verification" << std::endl;
+
+    // 1. Resolve resources
+    std::string model_path = GetModelPath();
+    std::string data_dir = GetTestDataDir();
+
+    if (argc > 1) model_path = argv[1];
+    if (argc > 2) data_dir = argv[2];
+
+    LOG_STEP(1, 4, "Loading model from " + model_path);
+    MelBandRoformer model;
+    model.Initialize(model_path);
+
+    LOG_STEP(2, 4, "Loading golden tensors from " + data_dir);
+    GoldenTensor input(data_dir, "band_split_in");
+    GoldenTensor expected(data_dir, "after_band_split");
+
+    // Report the directory that was actually used (argv[2] may override the
+    // environment default that TEST_ASSERT_LOAD would print).
+    TEST_ASSERT(input.valid(), "Failed to load band_split_in.npy from " + data_dir);
+    TEST_ASSERT(expected.valid(), "Failed to load after_band_split.npy from " + data_dir);
+
+    input.PrintShape("Input");
+    expected.PrintShape("Expected");
+
+    // band_split_in is expected to be [batch, time, features] in NumPy order
+    // (per the export script - TODO confirm). Check the rank before indexing.
+    TEST_ASSERT(input.shape.size() == 3, "band_split_in must be 3-D [batch, time, features]");
+
+    int batch = static_cast<int>(input.shape[0]);
+    int n_frames = static_cast<int>(input.shape[1]);
+    int total_dim = static_cast<int>(input.shape[2]);
+
+    // 3. Build Graph
+    LOG_STEP(3, 4, "Building computation graph");
+    TestContext tc(&model);
+
+    // GGML Tensor shape: [dim, n_frames, batch]
+    ggml_tensor* in_tensor = ggml_new_tensor_3d(tc.ctx, GGML_TYPE_F32, total_dim, n_frames, batch);
+    ggml_set_input(in_tensor);
+
+    ggml_tensor* out = model.BuildBandSplitGraph(tc.ctx, in_tensor, tc.gf, n_frames, batch);
+    TEST_ASSERT(out, "BuildBandSplitGraph returned nullptr");
+
+    // Mark output for computation
+    ggml_build_forward_expand(tc.gf, out);
+
+    // 4. Exec
+    LOG_STEP(4, 4, "Executing");
+    if (!tc.AllocateGraph()) {
+        std::cerr << "Graph allocation failed" << std::endl;
+        return 1;
+    }
+
+    // Copy input (NumPy [B, T, D] -> GGML [D, T, B]).
+    // NumPy C-contiguous data varies fastest along the last axis (D); in ggml
+    // ne0 is the fastest-moving dimension and ne0 = D here, so both layouts
+    // are identical in memory and the buffer can be copied as is.
+    ggml_backend_tensor_set(in_tensor, input.data, 0, ggml_nbytes(in_tensor));
+    tc.Compute();
+
+    // 5. Compare
+    auto output = tc.ReadTensor(out);
+
+    bool pass = CompareAndReport("BandSplit",
+                                  expected.data, expected.nelements(),
+                                  output.data(), output.size());
+
+    if (pass) LOG_PASS(); else LOG_FAIL();
+    return pass ? 0 : 1;
+}

+ 219 - 0
tests/test_component_layers.cpp

@@ -0,0 +1,219 @@
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <ggml.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+#include "../src/model.h"
+#include "../src/utils.h"
+
+/**
+ * test_component_layers.cpp
+ * 
+ * Verifies Transformer layers against golden tensors from export_debug.py
+ * Copied from tests_old/test_component_layers.cpp with env var support
+ */
+
+// Model file: $MBR_MODEL_PATH, or "mel_band_roformer.gguf" when unset.
+std::string GetModelPath() {
+    if (const char* path = std::getenv("MBR_MODEL_PATH")) {
+        return path;
+    }
+    return "mel_band_roformer.gguf";
+}
+
+// Test data directory: $MBR_TEST_DATA_DIR, or "." when the variable is unset.
+std::string GetTestDataDir() {
+    if (const char* dir = std::getenv("MBR_TEST_DATA_DIR")) {
+        return dir;
+    }
+    return ".";
+}
+
+/**
+ * Verifies the Transformer stack against golden tensors.
+ *
+ * Usage: test_component_layers [model_path] [data_dir]
+ * Defaults come from GetModelPath() / GetTestDataDir().
+ *
+ * Feeds after_band_split.npy through MelBandRoformer::BuildTransformersGraph
+ * and compares the result with before_mask_est.npy. The test passes when all
+ * values are finite, the element counts agree, and the max abs diff is
+ * <= 3e-2 OR the mean abs diff is <= 3e-3.
+ * Returns 0 on pass, 1 on failure.
+ */
+int main(int argc, char* argv[]) {
+    std::cout << "========================================" << std::endl;
+    std::cout << "Test: Transformer Layers Verification" << std::endl;
+    std::cout << "========================================" << std::endl;
+
+    std::string model_path = GetModelPath();
+    std::string debug_dir = GetTestDataDir();
+
+    if (argc > 1) model_path = argv[1];
+    if (argc > 2) debug_dir = argv[2];
+
+    try {
+        // 1. Load Model
+        std::cout << "\n[1/6] Loading model..." << std::endl;
+        MelBandRoformer model;
+        model.Initialize(model_path);
+
+        // 2. Load golden tensors
+        std::cout << "\n[2/6] Loading golden tensors..." << std::endl;
+
+        // Load after_band_split (input to Transformers)
+        auto [input_data, input_shape] = utils::load_activation(debug_dir, "after_band_split");
+        if (!input_data) {
+            std::cerr << "Failed to load after_band_split.npy" << std::endl;
+            return 1;
+        }
+        std::cout << "  Input (after_band_split) shape: [";
+        for (size_t i = 0; i < input_shape.size(); ++i) {
+            std::cout << input_shape[i];
+            if (i < input_shape.size() - 1) std::cout << ", ";
+        }
+        std::cout << "]" << std::endl;
+
+        // Load before_mask_est (expected output after all 6 layers)
+        auto [expected_data, expected_shape] = utils::load_activation(debug_dir, "before_mask_est");
+        if (!expected_data) {
+            std::cerr << "Failed to load before_mask_est.npy" << std::endl;
+            utils::free_npy_data(input_data);
+            return 1;
+        }
+        std::cout << "  Expected (before_mask_est) shape: [";
+        for (size_t i = 0; i < expected_shape.size(); ++i) {
+            std::cout << expected_shape[i];
+            if (i < expected_shape.size() - 1) std::cout << ", ";
+        }
+        std::cout << "]" << std::endl;
+
+        // The shape is indexed up to [3] below, so check the rank first
+        if (input_shape.size() != 4) {
+            std::cerr << "FAILED: after_band_split must be 4-D [batch, time, bands, dim]" << std::endl;
+            utils::free_npy_data(input_data);
+            utils::free_npy_data(expected_data);
+            return 1;
+        }
+
+        // Extract dimensions from shapes
+        // PyTorch: [batch, time, bands, dim]
+        int batch = static_cast<int>(input_shape[0]);
+        int n_frames = static_cast<int>(input_shape[1]);
+        int n_bands = static_cast<int>(input_shape[2]);
+        int dim = static_cast<int>(input_shape[3]);
+
+        std::cout << "  batch=" << batch << ", n_frames=" << n_frames
+                  << ", n_bands=" << n_bands << ", dim=" << dim << std::endl;
+
+        // 3. Build computation graph
+        std::cout << "\n[3/6] Building computation graph..." << std::endl;
+
+        size_t mem_size = 1024 * 1024 * 1024;  // 1GB for Transformers
+        struct ggml_init_params ctx_params = {
+            /*.mem_size   = */ mem_size,
+            /*.mem_buffer = */ nullptr,
+            /*.no_alloc   = */ true,
+        };
+        ggml_context* ctx = ggml_init(ctx_params);
+        if (!ctx) {
+            std::cerr << "FAILED: ggml_init returned nullptr" << std::endl;
+            utils::free_npy_data(input_data);
+            utils::free_npy_data(expected_data);
+            return 1;
+        }
+
+        // Expanded position tensors for CUDA RoPE compatibility:
+        // pos_time_exp: size [T * F * B], repeating [0..T-1] for each F*B batch
+        // pos_freq_exp: size [F * T * B], repeating [0..F-1] for each T*B batch
+        int time_exp_size = n_frames * n_bands * batch;  // T * F * B
+        int freq_exp_size = n_bands * n_frames * batch;  // F * T * B
+
+        std::vector<int32_t> pos_time_exp_data(time_exp_size);
+        for (int i = 0; i < time_exp_size; ++i) {
+            pos_time_exp_data[i] = i % n_frames;  // Repeat [0..T-1]
+        }
+
+        std::vector<int32_t> pos_freq_exp_data(freq_exp_size);
+        for (int i = 0; i < freq_exp_size; ++i) {
+            pos_freq_exp_data[i] = i % n_bands;  // Repeat [0..F-1]
+        }
+
+        ggml_cgraph* gf = ggml_new_graph_custom(ctx, 32768, false);
+
+        // Create input tensor: [dim, bands, time, batch] (GGML order)
+        ggml_tensor* input = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,
+                                                 dim, n_bands, n_frames, batch);
+        ggml_set_input(input);
+
+        // Create expanded position tensors for RoPE
+        ggml_tensor* pos_time_exp = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, time_exp_size);
+        ggml_set_input(pos_time_exp);
+
+        ggml_tensor* pos_freq_exp = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, freq_exp_size);
+        ggml_set_input(pos_freq_exp);
+
+        // Build Transformers graph
+        ggml_tensor* x = model.BuildTransformersGraph(ctx, input, gf, pos_time_exp, pos_freq_exp, n_frames, batch);
+        if (!x) {
+            std::cerr << "FAILED: BuildTransformersGraph returned nullptr" << std::endl;
+            utils::free_npy_data(input_data);
+            utils::free_npy_data(expected_data);
+            ggml_free(ctx);
+            return 1;
+        }
+
+        // Mark output
+        ggml_tensor* output = ggml_dup(ctx, x);
+        ggml_set_output(output);
+        ggml_build_forward_expand(gf, output);
+
+        std::cout << "  Graph built with " << ggml_graph_n_nodes(gf) << " nodes" << std::endl;
+
+        // 4. Allocate
+        std::cout << "\n[4/6] Allocating graph..." << std::endl;
+
+        ggml_gallocr_t allocr = ggml_gallocr_new(
+            ggml_backend_get_default_buffer_type(model.GetBackend())
+        );
+
+        if (!ggml_gallocr_alloc_graph(allocr, gf)) {
+            std::cerr << "FAILED: Failed to allocate graph" << std::endl;
+            utils::free_npy_data(input_data);
+            utils::free_npy_data(expected_data);
+            ggml_gallocr_free(allocr);
+            ggml_free(ctx);
+            return 1;
+        }
+
+        // 5. Execute
+        std::cout << "\n[5/6] Executing graph..." << std::endl;
+
+        // Copy input data
+        ggml_backend_tensor_set(input, input_data, 0, ggml_nbytes(input));
+
+        // Copy expanded position tensors
+        ggml_backend_tensor_set(pos_time_exp, pos_time_exp_data.data(), 0, ggml_nbytes(pos_time_exp));
+        ggml_backend_tensor_set(pos_freq_exp, pos_freq_exp_data.data(), 0, ggml_nbytes(pos_freq_exp));
+
+        // Compute
+        const auto status = ggml_backend_graph_compute(model.GetBackend(), gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            std::cerr << "FAILED: Graph computation failed with status "
+                      << static_cast<int>(status) << std::endl;
+            utils::free_npy_data(input_data);
+            utils::free_npy_data(expected_data);
+            ggml_gallocr_free(allocr);
+            ggml_free(ctx);
+            return 1;
+        }
+
+        // 6. Compare results
+        std::cout << "\n[6/6] Comparing results..." << std::endl;
+
+        // Copy output from GPU to CPU for comparison
+        std::vector<float> output_data(ggml_nelements(output));
+        ggml_backend_tensor_get(output, output_data.data(), 0, ggml_nbytes(output));
+
+        // Compare element counts
+        size_t expected_nelements = utils::shape_nelements(expected_shape);
+        std::cout << "  Output elements: " << output_data.size() << std::endl;
+        std::cout << "  Expected elements: " << expected_nelements << std::endl;
+
+        // A size mismatch is a failure; comparing only the common prefix
+        // could let a wrongly shaped output pass.
+        if (output_data.size() != expected_nelements) {
+            std::cerr << "FAILED: Element count mismatch" << std::endl;
+            utils::free_npy_data(input_data);
+            utils::free_npy_data(expected_data);
+            ggml_gallocr_free(allocr);
+            ggml_free(ctx);
+            return 1;
+        }
+
+        // Compute comparison statistics directly
+        float max_abs = 0.0f;
+        float sum_abs = 0.0f;
+        size_t n_nonfinite = 0;
+        for (size_t i = 0; i < expected_nelements; ++i) {
+            // A NaN difference never wins std::max, so count it explicitly
+            if (!std::isfinite(expected_data[i]) || !std::isfinite(output_data[i])) {
+                ++n_nonfinite;
+                continue;
+            }
+            float diff = std::abs(expected_data[i] - output_data[i]);
+            max_abs = std::max(max_abs, diff);
+            sum_abs += diff;
+        }
+        float mean_abs = expected_nelements > 0 ? sum_abs / expected_nelements : 0.0f;
+
+        std::cout << "\n[Comparison] Transformers Output" << std::endl;
+        std::cout << "  Max abs diff:  " << max_abs << std::endl;
+        std::cout << "  Mean abs diff: " << mean_abs << std::endl;
+        if (n_nonfinite > 0) {
+            std::cout << "  Non-finite values: " << n_nonfinite << std::endl;
+        }
+
+        bool match = (n_nonfinite == 0) && (max_abs <= 3e-2f || mean_abs <= 3e-3f);
+
+        // Cleanup
+        utils::free_npy_data(input_data);
+        utils::free_npy_data(expected_data);
+        ggml_gallocr_free(allocr);
+        ggml_free(ctx);
+
+        if (match) {
+            std::cout << "\nPASSED: Transformers match PyTorch output" << std::endl;
+            return 0;
+        } else {
+            std::cout << "\nFAILED: Transformers do not match PyTorch output" << std::endl;
+            return 1;
+        }
+
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+}

+ 113 - 0
tests/test_component_mask.cpp

@@ -0,0 +1,113 @@
+#include "test_common.h"
+
+/**
+ * MaskEstimator component test.
+ *
+ * Feeds the golden "before_mask_est" activation through
+ * MelBandRoformer::BuildMaskEstimatorGraph and compares the result with the
+ * golden "mask_est0" tensor.
+ *
+ * Optional positional arguments override the defaults:
+ *   argv[1] = model path, argv[2] = golden data directory.
+ *
+ * Returns 0 when the comparison passes, 1 when it fails or the graph cannot
+ * be allocated.
+ */
+int main(int argc, char* argv[]) {
+    std::cout << "Test: MaskEstimator Component Verification" << std::endl;
+
+    // Defaults first, then let positional arguments override them.
+    std::string model_path = GetModelPath();
+    std::string data_dir = GetTestDataDir();
+
+    if (argc > 1) model_path = argv[1];
+    if (argc > 2) data_dir = argv[2];
+
+    LOG_STEP(1, 4, "Loading model from " + model_path);
+    MelBandRoformer model;
+    model.Initialize(model_path);
+
+    LOG_STEP(2, 4, "Loading golden tensors");
+    GoldenTensor input(data_dir, "before_mask_est");
+    GoldenTensor expected(data_dir, "mask_est0");
+
+    TEST_ASSERT_LOAD(input, "before_mask_est");
+    TEST_ASSERT_LOAD(expected, "mask_est0");
+
+    input.PrintShape("Input");
+    expected.PrintShape("Expected");
+
+    // The golden input is read as [Batch, Time, Bands, Dim]: per the notes on
+    // export_debug.py, x is rearranged 'b f t d -> b t f d' before it is
+    // dumped as before_mask_est.
+    int batch = input.shape[0];
+    int n_frames = input.shape[1];
+    int n_bands = input.shape[2];
+    int dim = input.shape[3];
+
+    // 3. Build Graph
+    LOG_STEP(3, 4, "Building computation graph");
+    TestContext tc(&model);
+
+    // GGML Input: [Dim, Bands, Frames, Batch] (ne0=Dim)
+    // Matches NumPy [B, T, Bands, Dim] layout directly
+    ggml_tensor* in_tensor = ggml_new_tensor_4d(tc.ctx, GGML_TYPE_F32, dim, n_bands, n_frames, batch);
+    ggml_set_input(in_tensor);
+
+    ggml_tensor* out = model.BuildMaskEstimatorGraph(tc.ctx, in_tensor, tc.gf, n_frames, batch);
+    TEST_ASSERT(out, "BuildMaskEstimatorGraph returned nullptr");
+
+    ggml_build_forward_expand(tc.gf, out);
+
+    // 4. Exec
+    LOG_STEP(4, 4, "Executing");
+    if (!tc.AllocateGraph()) return 1;
+
+    ggml_backend_tensor_set(in_tensor, input.data, 0, ggml_nbytes(in_tensor));
+    tc.Compute();
+
+    // 5. Compare
+    auto output = tc.ReadTensor(out);
+
+    // For multi-stem models (like Deux with 2 stems), the output will contain all stems.
+    // mask_est0.npy likely only contains the first stem (or the target stem).
+    // If output size > expected size, compare only the matching portion (first stem).
+    size_t expected_size = expected.nelements();
+    size_t actual_size = output.size();
+
+    bool pass = false;
+    // expected_size > 0 guards the modulo below against division by zero when
+    // the golden tensor is empty.
+    if (expected_size > 0 && actual_size > expected_size && actual_size % expected_size == 0) {
+        // De-interleave Stem 0.
+        // Data layout: [Freqs, Stems, Frames, Batch] (ne0, ne1, ne2, ne3), so
+        // each (frame, batch) row holds Freqs * Stems values with stem 0 first.
+        // Rows are counted over frames AND batch so the extraction stays
+        // correct for batch > 1; for batch == 1 this is the per-frame walk.
+        const size_t num_stems = actual_size / expected_size;
+        const size_t n_rows = static_cast<size_t>(n_frames) * static_cast<size_t>(batch);
+        const size_t n_freqs = (n_rows > 0) ? expected_size / n_rows : 0; // inferred Freqs per row
+        const size_t row_stride = n_freqs * num_stems;
+
+        std::cout << "Detected multi-stem output (" << num_stems << " stems). Verifying Stem 0..." << std::endl;
+
+        // Verify assumption: the inferred layout must account for every output element.
+        if (row_stride * n_rows != actual_size) {
+            std::cerr << "Warning: Shape mismatch calculation in verification logic." << std::endl;
+        }
+
+        std::vector<float> stem0_output;
+        stem0_output.reserve(expected_size);
+
+        for (size_t row = 0; row < n_rows; ++row) {
+            const size_t row_start = row * row_stride; // stem 0 sits at offset 0 of the row
+            for (size_t f = 0; f < n_freqs; ++f) {
+                // Never read past the output buffer; a short result then
+                // surfaces as a size mismatch in the comparison below.
+                if (row_start + f < actual_size) {
+                    stem0_output.push_back(output[row_start + f]);
+                }
+            }
+        }
+
+        pass = CompareAndReport("MaskEstimator (Stem 0)",
+                                expected.data, expected_size,
+                                stem0_output.data(), stem0_output.size());
+    } else {
+        pass = CompareAndReport("MaskEstimator", 
+                                expected.data, expected.nelements(),
+                                output.data(), output.size());
+    }
+
+    if (pass) LOG_PASS(); else LOG_FAIL();
+    return pass ? 0 : 1;
+}

+ 95 - 0
tests/test_component_stft.cpp

@@ -0,0 +1,95 @@
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <iomanip>
+#include "../src/stft.h"
+
+/**
+ * STFT -> ISTFT round-trip test.
+ *
+ * Synthesizes two seconds of a 440 Hz + 880 Hz sine mixture, runs it through
+ * stft::compute_stft and stft::compute_istft, and checks that the
+ * reconstruction matches the input sample by sample.
+ *
+ * Returns 0 on success, 1 when no frames are produced, the frame count
+ * exceeds the scratch buffer, the reconstruction contains non-finite values,
+ * or the maximum absolute error is above 1e-4.
+ */
+int main() {
+    std::cout << "Test: Component STFT/ISTFT" << std::endl;
+
+    // Parameters
+    const int sample_rate = 44100;
+    const int n_fft = 2048;
+    const int hop_length = 441;
+    const int win_length = 2048;
+    const int n_freq = n_fft / 2 + 1;
+    const int n_samples = 44100 * 2; // 2 seconds
+
+    // Capacity (in frames) of the STFT scratch buffer; generous for the
+    // roughly n_samples / hop_length frames this signal is expected to yield.
+    const int max_frames = 500;
+
+    // M_PI is not part of standard C++ (MSVC only defines it when
+    // _USE_MATH_DEFINES is set), so use a local double constant instead.
+    const double pi = 3.14159265358979323846;
+
+    // 1. Generate Signal (Sine wave mixture)
+    std::vector<float> input(n_samples);
+    for (int i = 0; i < n_samples; ++i) {
+        float t = static_cast<float>(i) / sample_rate;
+        input[i] = std::sin(2.0f * pi * 440.0f * t) +
+                   0.5f * std::sin(2.0f * pi * 880.0f * t);
+    }
+
+    // 2. Generate Window
+    std::vector<float> window(win_length);
+    stft::hann_window(window.data(), win_length);
+
+    // 3. Compute STFT
+    int n_frames = 0;
+    // Buffer sized as n_freq * max_frames * 2 floats (the factor 2 follows the
+    // original size estimate; presumably real/imag parts — confirm in stft.h).
+    std::vector<float> stft_out(n_freq * max_frames * 2);
+
+    stft::compute_stft(
+        input.data(),
+        n_samples,
+        n_fft,
+        hop_length,
+        win_length,
+        window.data(),
+        true, // center
+        stft_out.data(),
+        &n_frames
+    );
+
+    std::cout << "STFT Computed: " << n_frames << " frames" << std::endl;
+
+    if (n_frames <= 0) {
+        std::cerr << "Failed: 0 frames" << std::endl;
+        return 1;
+    }
+
+    // A frame count above the buffer capacity means compute_stft wrote past
+    // stft_out; nothing computed after that point can be trusted.
+    if (n_frames > max_frames) {
+        std::cerr << "Failed: " << n_frames << " frames exceed the STFT buffer capacity of "
+                  << max_frames << " frames" << std::endl;
+        return 1;
+    }
+
+    // 4. Compute ISTFT
+    std::vector<float> output(n_samples);
+
+    stft::compute_istft(
+        stft_out.data(),
+        n_freq,
+        n_frames,
+        n_fft,
+        hop_length,
+        win_length,
+        window.data(),
+        true, // center
+        n_samples,
+        output.data()
+    );
+
+    // 5. Verify Reconstruction (max error / MAE)
+    float max_diff = 0.0f;
+    float mae = 0.0f;
+    bool all_finite = true;
+
+    for (int i = 0; i < n_samples; ++i) {
+        float diff = std::abs(input[i] - output[i]);
+        // NaN never compares greater than max_diff, so it has to be detected
+        // explicitly; otherwise a NaN-filled reconstruction would pass.
+        if (!std::isfinite(diff)) all_finite = false;
+        if (diff > max_diff) max_diff = diff;
+        mae += diff;
+    }
+    mae /= n_samples;
+
+    std::cout << "Reconstruction Error:" << std::endl;
+    std::cout << "  Max Diff: " << max_diff << std::endl;
+    std::cout << "  MAE:      " << mae << std::endl;
+
+    if (!all_finite) {
+        std::cerr << "FAILED: Reconstruction contains non-finite values" << std::endl;
+        return 1;
+    }
+
+    // Hann window with heavy overlap (2048 / 441 = ~4.6 windows per sample).
+    // The hop does not divide the window length, so near-perfect
+    // reconstruction presumably relies on compute_istft normalizing by the
+    // overlap-added window — TODO confirm against src/stft.h.
+    if (max_diff > 1e-4) {
+        std::cerr << "FAILED: Reconstruction error too high (> 1e-4)" << std::endl;
+        return 1;
+    }
+
+    std::cout << "PASSED" << std::endl;
+    return 0;
+}

+ 114 - 0
tests/test_inference.cpp

@@ -0,0 +1,114 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "mel_band_roformer/inference.h"
+#include "../src/utils.h"
+
+/**
+ * test_inference.cpp
+ * 
+ * Verifies Inference class against golden tensors from export_debug.py
+ * Copied from tests_old/test_inference.cpp with env var support
+ */
+
+/**
+ * Returns the GGUF model path used by the test.
+ *
+ * The MBR_MODEL_PATH environment variable wins when it is set (even if it is
+ * empty); otherwise the default "mel_band_roformer.gguf" is returned.
+ */
+std::string GetModelPath() {
+    if (const char* override_path = std::getenv("MBR_MODEL_PATH")) {
+        return std::string(override_path);
+    }
+    return std::string("mel_band_roformer.gguf");
+}
+
+/**
+ * Returns the directory holding the golden test tensors.
+ *
+ * The MBR_TEST_DATA_DIR environment variable wins when it is set (even if it
+ * is empty); otherwise the current directory "." is returned.
+ */
+std::string GetTestDataDir() {
+    if (const char* override_dir = std::getenv("MBR_TEST_DATA_DIR")) {
+        return std::string(override_dir);
+    }
+    return std::string(".");
+}
+
+/**
+ * End-to-end check of the Inference class.
+ *
+ * Loads the golden "input_audio" activation, runs it through
+ * Inference::ProcessChunk and compares stem 0 with the golden "output_audio"
+ * activation.
+ *
+ * Optional positional arguments override the defaults:
+ *   argv[1] = model path, argv[2] = golden data directory.
+ *
+ * Returns 0 when the mean absolute difference is below 0.1. Returns 1 when
+ * it is not, when a golden file is missing or too small, when no output was
+ * produced, or when an exception is thrown.
+ */
+int main(int argc, char* argv[]) {
+    std::cout << "========================================" << std::endl;
+    std::cout << "Test: Inference Class Verification" << std::endl;
+    std::cout << "========================================" << std::endl;
+
+    std::string model_path = GetModelPath();
+    std::string debug_dir = GetTestDataDir();
+
+    if (argc > 1) model_path = argv[1];
+    if (argc > 2) debug_dir = argv[2];
+
+    try {
+        // 1. Initialize Inference
+        std::cout << "\n[1/3] Initializing Inference Engine..." << std::endl;
+        Inference engine(model_path);
+
+        // 2. Load Input Audio
+        std::cout << "\n[2/3] Loading Input Audio..." << std::endl;
+        auto [input_audio_ptr, input_audio_shape] = utils::load_activation(debug_dir, "input_audio");
+        if (!input_audio_ptr) return 1;
+
+        // Convert to vector (input_audio.npy is [batch, channels, samples] interleaved)
+        // input_audio_shape: [1, 2, 132300]
+        size_t total_samples = input_audio_shape[0] * input_audio_shape[1] * input_audio_shape[2];
+        std::vector<float> input_audio(input_audio_ptr, input_audio_ptr + total_samples);
+
+        // The vector owns a copy now, so release the npy buffer right away;
+        // no later return path (or exception) can leak it.
+        utils::free_npy_data(input_audio_ptr);
+
+        // 3. Process
+        std::cout << "\n[3/3] Processing Audio..." << std::endl;
+        // Use ProcessChunk to verify raw model output without Overlap-Add windowing/padding
+        // This matches the generation of output_audio.npy
+        std::vector<std::vector<float>> output_stems = engine.ProcessChunk(input_audio);
+        if (output_stems.empty()) {
+            std::cerr << "ProcessChunk returned no stems" << std::endl;
+            return 1;
+        }
+        // Bind by reference: stem 0 can be large and is only read below.
+        const std::vector<float>& output_audio = output_stems[0];
+
+        std::cout << "  Input size: " << input_audio.size() << std::endl;
+        std::cout << "  Output size: " << output_audio.size() << std::endl;
+
+        // Verify against output_audio.npy
+        std::cout << "\n[Verification] Comparing against golden output..." << std::endl;
+        auto [expected_output, expected_shape] = utils::load_activation(debug_dir, "output_audio");
+        if (!expected_output) {
+             std::cerr << "Golden output not found" << std::endl;
+             return 1;
+        }
+
+        // expected_output: [batch=1, channels=2, samples=132300] (Planar/C-contiguous)
+        // output_audio: interleaved [ch0, ch1, ch0, ch1...]
+
+        const int channels = 2;
+        const int samples = static_cast<int>(input_audio_shape[2]); // 132300
+        const size_t required = static_cast<size_t>(channels) * static_cast<size_t>(samples);
+
+        // The loop below indexes the golden buffer up to channels * samples;
+        // refuse to run when the file holds fewer elements than that.
+        const size_t golden_count = utils::shape_nelements(expected_shape);
+        if (golden_count < required) {
+            std::cerr << "Golden output too small: " << golden_count
+                      << " elements, need " << required << std::endl;
+            utils::free_npy_data(expected_output);
+            return 1;
+        }
+
+        float max_diff = 0.0f;
+        float sum_diff = 0.0f;
+        size_t valid_samples = 0;
+        const size_t available = output_audio.size();
+
+        for (int i = 0; i < samples; ++i) {
+            for (int ch = 0; ch < channels; ++ch) {
+                // Actual: i * channels + ch (interleaved)
+                const size_t actual_idx = static_cast<size_t>(i) * channels + ch;
+                if (actual_idx >= available) continue;
+
+                // Expected: ch * samples + i (planar)
+                const size_t expected_idx = static_cast<size_t>(ch) * samples + i;
+
+                float diff = std::abs(expected_output[expected_idx] - output_audio[actual_idx]);
+                max_diff = std::max(max_diff, diff);
+                sum_diff += diff;
+                valid_samples++;
+            }
+        }
+
+        utils::free_npy_data(expected_output);
+
+        // Nothing compared means nothing verified: report a failure instead
+        // of a mean error of zero.
+        if (valid_samples == 0) {
+            std::cerr << "No output samples available for comparison" << std::endl;
+            std::cout << "FAILED" << std::endl;
+            return 1;
+        }
+        if (valid_samples < required) {
+            std::cerr << "Warning: compared only " << valid_samples << " of "
+                      << required << " expected samples" << std::endl;
+        }
+
+        const float mean_diff = sum_diff / valid_samples;
+
+        std::cout << "  Max abs diff: " << max_diff << std::endl;
+        std::cout << "  Mean abs diff: " << mean_diff << std::endl;
+
+        bool pass = mean_diff < 0.1f;
+        if (pass) std::cout << "PASSED" << std::endl;
+        else std::cout << "FAILED" << std::endl;
+
+        return pass ? 0 : 1;
+
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+}

+ 9044 - 0
third_party/dr_libs/dr_wav.h

@@ -0,0 +1,9044 @@
+/*
+WAV audio loader and writer. Choice of public domain or MIT-0. See license statements at the end of this file.
+dr_wav - v0.14.3 - 2025-12-14
+
+David Reid - mackron@gmail.com
+
+GitHub: https://github.com/mackron/dr_libs
+*/
+
+/*
+Introduction
+============
+This is a single file library. To use it, do something like the following in one .c file.
+
+    ```c
+    #define DR_WAV_IMPLEMENTATION
+    #include "dr_wav.h"
+    ```
+
+You can then #include this file in other parts of the program as you would with any other header file. Do something like the following to read audio data:
+
+    ```c
+    drwav wav;
+    if (!drwav_init_file(&wav, "my_song.wav", NULL)) {
+        // Error opening WAV file.
+    }
+
+    drwav_int32* pDecodedInterleavedPCMFrames = malloc(wav.totalPCMFrameCount * wav.channels * sizeof(drwav_int32));
+    size_t numberOfSamplesActuallyDecoded = drwav_read_pcm_frames_s32(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames);
+
+    ...
+
+    drwav_uninit(&wav);
+    ```
+
+If you just want to quickly open and read the audio data in a single operation you can do something like this:
+
+    ```c
+    unsigned int channels;
+    unsigned int sampleRate;
+    drwav_uint64 totalPCMFrameCount;
+    float* pSampleData = drwav_open_file_and_read_pcm_frames_f32("my_song.wav", &channels, &sampleRate, &totalPCMFrameCount, NULL);
+    if (pSampleData == NULL) {
+        // Error opening and reading WAV file.
+    }
+
+    ...
+
+    drwav_free(pSampleData, NULL);
+    ```
+
+The examples above use versions of the API that convert the audio data to a consistent format (32-bit signed PCM, in this case), but you can still output the
+audio data in its internal format (see notes below for supported formats):
+
+    ```c
+    size_t framesRead = drwav_read_pcm_frames(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames);
+    ```
+
+You can also read the raw bytes of audio data, which could be useful if dr_wav does not have native support for a particular data format:
+
+    ```c
+    size_t bytesRead = drwav_read_raw(&wav, bytesToRead, pRawDataBuffer);
+    ```
+
+dr_wav can also be used to output WAV files. This does not currently support compressed formats. To use this, look at `drwav_init_write()`,
+`drwav_init_file_write()`, etc. Use `drwav_write_pcm_frames()` to write samples, or `drwav_write_raw()` to write raw data in the "data" chunk.
+
+    ```c
+    drwav_data_format format;
+    format.container = drwav_container_riff;     // <-- drwav_container_riff = normal WAV files, drwav_container_w64 = Sony Wave64.
+    format.format = DR_WAVE_FORMAT_PCM;          // <-- Any of the DR_WAVE_FORMAT_* codes.
+    format.channels = 2;
+    format.sampleRate = 44100;
+    format.bitsPerSample = 16;
+    drwav_init_file_write(&wav, "data/recording.wav", &format, NULL);
+
+    ...
+
+    drwav_uint64 framesWritten = drwav_write_pcm_frames(pWav, frameCount, pSamples);
+    ```
+
+Note that writing to AIFF or RIFX is not supported.
+
+dr_wav has support for decoding from a number of different encapsulation formats. See below for details.
+
+
+Build Options
+=============
+#define these options before including this file.
+
+#define DR_WAV_NO_CONVERSION_API
+  Disables conversion APIs such as `drwav_read_pcm_frames_f32()` and `drwav_s16_to_f32()`.
+
+#define DR_WAV_NO_STDIO
+  Disables APIs that initialize a decoder from a file such as `drwav_init_file()`, `drwav_init_file_write()`, etc.
+
+#define DR_WAV_NO_WCHAR
+  Disables all functions ending with `_w`. Use this if your compiler does not provide wchar.h. Not required if DR_WAV_NO_STDIO is also defined.
+
+
+Supported Encapsulations
+========================
+- RIFF (Regular WAV)
+- RIFX (Big-Endian)
+- AIFF (Does not currently support ADPCM)
+- RF64
+- W64
+
+Note that AIFF and RIFX do not support write mode, nor do they support reading of metadata.
+
+
+Supported Encodings
+===================
+- Unsigned 8-bit PCM
+- Signed 12-bit PCM
+- Signed 16-bit PCM
+- Signed 24-bit PCM
+- Signed 32-bit PCM
+- IEEE 32-bit floating point
+- IEEE 64-bit floating point
+- A-law and u-law
+- Microsoft ADPCM
+- IMA ADPCM (DVI, format code 0x11)
+
+8-bit PCM encodings are always assumed to be unsigned. Signed 8-bit encoding can only be read with `drwav_read_raw()`.
+
+Note that ADPCM is not currently supported with AIFF. Contributions welcome.
+
+
+Notes
+=====
+- Samples are always interleaved.
+- The default read function does not do any data conversion. Use `drwav_read_pcm_frames_f32()`, `drwav_read_pcm_frames_s32()` and `drwav_read_pcm_frames_s16()`
+  to read and convert audio data to 32-bit floating point, signed 32-bit integer and signed 16-bit integer samples respectively.
+- dr_wav will try to read the WAV file as best it can, even if it's not strictly conformant to the WAV format.
+*/
+
+#ifndef dr_wav_h
+#define dr_wav_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DRWAV_STRINGIFY(x)      #x
+#define DRWAV_XSTRINGIFY(x)     DRWAV_STRINGIFY(x)
+
+#define DRWAV_VERSION_MAJOR     0
+#define DRWAV_VERSION_MINOR     14
+#define DRWAV_VERSION_REVISION  3
+#define DRWAV_VERSION_STRING    DRWAV_XSTRINGIFY(DRWAV_VERSION_MAJOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_MINOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_REVISION)
+
+#include <stddef.h> /* For size_t. */
+
+/* Sized Types */
+typedef   signed char           drwav_int8;
+typedef unsigned char           drwav_uint8;
+typedef   signed short          drwav_int16;
+typedef unsigned short          drwav_uint16;
+typedef   signed int            drwav_int32;
+typedef unsigned int            drwav_uint32;
+#if defined(_MSC_VER) && !defined(__clang__)
+    typedef   signed __int64    drwav_int64;
+    typedef unsigned __int64    drwav_uint64;
+#else
+    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+        #pragma GCC diagnostic push
+        #pragma GCC diagnostic ignored "-Wlong-long"
+        #if defined(__clang__)
+            #pragma GCC diagnostic ignored "-Wc++11-long-long"
+        #endif
+    #endif
+    typedef   signed long long  drwav_int64;
+    typedef unsigned long long  drwav_uint64;
+    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+        #pragma GCC diagnostic pop
+    #endif
+#endif
+#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__powerpc64__)
+    typedef drwav_uint64        drwav_uintptr;
+#else
+    typedef drwav_uint32        drwav_uintptr;
+#endif
+typedef drwav_uint8             drwav_bool8;
+typedef drwav_uint32            drwav_bool32;
+#define DRWAV_TRUE              1
+#define DRWAV_FALSE             0
+/* End Sized Types */
+
+/* Decorations */
+#if !defined(DRWAV_API)
+    #if defined(DRWAV_DLL)
+        #if defined(_WIN32)
+            #define DRWAV_DLL_IMPORT  __declspec(dllimport)
+            #define DRWAV_DLL_EXPORT  __declspec(dllexport)
+            #define DRWAV_DLL_PRIVATE static
+        #else
+            #if defined(__GNUC__) && __GNUC__ >= 4
+                #define DRWAV_DLL_IMPORT  __attribute__((visibility("default")))
+                #define DRWAV_DLL_EXPORT  __attribute__((visibility("default")))
+                #define DRWAV_DLL_PRIVATE __attribute__((visibility("hidden")))
+            #else
+                #define DRWAV_DLL_IMPORT
+                #define DRWAV_DLL_EXPORT
+                #define DRWAV_DLL_PRIVATE static
+            #endif
+        #endif
+
+        #if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION)
+            #define DRWAV_API  DRWAV_DLL_EXPORT
+        #else
+            #define DRWAV_API  DRWAV_DLL_IMPORT
+        #endif
+        #define DRWAV_PRIVATE DRWAV_DLL_PRIVATE
+    #else
+        #define DRWAV_API extern
+        #define DRWAV_PRIVATE static
+    #endif
+#endif
+/* End Decorations */
+
+/* Result Codes */
+typedef drwav_int32 drwav_result;
+#define DRWAV_SUCCESS                        0
+#define DRWAV_ERROR                         -1   /* A generic error. */
+#define DRWAV_INVALID_ARGS                  -2
+#define DRWAV_INVALID_OPERATION             -3
+#define DRWAV_OUT_OF_MEMORY                 -4
+#define DRWAV_OUT_OF_RANGE                  -5
+#define DRWAV_ACCESS_DENIED                 -6
+#define DRWAV_DOES_NOT_EXIST                -7
+#define DRWAV_ALREADY_EXISTS                -8
+#define DRWAV_TOO_MANY_OPEN_FILES           -9
+#define DRWAV_INVALID_FILE                  -10
+#define DRWAV_TOO_BIG                       -11
+#define DRWAV_PATH_TOO_LONG                 -12
+#define DRWAV_NAME_TOO_LONG                 -13
+#define DRWAV_NOT_DIRECTORY                 -14
+#define DRWAV_IS_DIRECTORY                  -15
+#define DRWAV_DIRECTORY_NOT_EMPTY           -16
+#define DRWAV_END_OF_FILE                   -17
+#define DRWAV_NO_SPACE                      -18
+#define DRWAV_BUSY                          -19
+#define DRWAV_IO_ERROR                      -20
+#define DRWAV_INTERRUPT                     -21
+#define DRWAV_UNAVAILABLE                   -22
+#define DRWAV_ALREADY_IN_USE                -23
+#define DRWAV_BAD_ADDRESS                   -24
+#define DRWAV_BAD_SEEK                      -25
+#define DRWAV_BAD_PIPE                      -26
+#define DRWAV_DEADLOCK                      -27
+#define DRWAV_TOO_MANY_LINKS                -28
+#define DRWAV_NOT_IMPLEMENTED               -29
+#define DRWAV_NO_MESSAGE                    -30
+#define DRWAV_BAD_MESSAGE                   -31
+#define DRWAV_NO_DATA_AVAILABLE             -32
+#define DRWAV_INVALID_DATA                  -33
+#define DRWAV_TIMEOUT                       -34
+#define DRWAV_NO_NETWORK                    -35
+#define DRWAV_NOT_UNIQUE                    -36
+#define DRWAV_NOT_SOCKET                    -37
+#define DRWAV_NO_ADDRESS                    -38
+#define DRWAV_BAD_PROTOCOL                  -39
+#define DRWAV_PROTOCOL_UNAVAILABLE          -40
+#define DRWAV_PROTOCOL_NOT_SUPPORTED        -41
+#define DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED -42
+#define DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED  -43
+#define DRWAV_SOCKET_NOT_SUPPORTED          -44
+#define DRWAV_CONNECTION_RESET              -45
+#define DRWAV_ALREADY_CONNECTED             -46
+#define DRWAV_NOT_CONNECTED                 -47
+#define DRWAV_CONNECTION_REFUSED            -48
+#define DRWAV_NO_HOST                       -49
+#define DRWAV_IN_PROGRESS                   -50
+#define DRWAV_CANCELLED                     -51
+#define DRWAV_MEMORY_ALREADY_MAPPED         -52
+#define DRWAV_AT_END                        -53
+/* End Result Codes */
+
+/* Common data formats. */
+#define DR_WAVE_FORMAT_PCM          0x1
+#define DR_WAVE_FORMAT_ADPCM        0x2
+#define DR_WAVE_FORMAT_IEEE_FLOAT   0x3
+#define DR_WAVE_FORMAT_ALAW         0x6
+#define DR_WAVE_FORMAT_MULAW        0x7
+#define DR_WAVE_FORMAT_DVI_ADPCM    0x11
+#define DR_WAVE_FORMAT_EXTENSIBLE   0xFFFE
+
+/* Flags to pass into drwav_init_ex(), etc. */
+#define DRWAV_SEQUENTIAL            0x00000001
+#define DRWAV_WITH_METADATA         0x00000002
+
+DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision);
+DRWAV_API const char* drwav_version_string(void);
+
+/* Allocation Callbacks */
+/*
+Set of caller-supplied memory allocation hooks. Each callback receives a `pUserData` argument as its last parameter.
+NOTE(review): presumably this is the `pUserData` member below, forwarded unchanged - confirm in the implementation section.
+*/
+typedef struct
+{
+    void* pUserData;                                            /* Opaque caller data. */
+    void* (* onMalloc)(size_t sz, void* pUserData);             /* Takes a size in bytes, returns a pointer. */
+    void* (* onRealloc)(void* p, size_t sz, void* pUserData);   /* Takes an existing pointer and a new size, returns a pointer. */
+    void  (* onFree)(void* p, void* pUserData);                 /* Takes a pointer to release. */
+} drwav_allocation_callbacks;
+/* End Allocation Callbacks */
+
+/* Origin passed to the seek callback (see drwav_seek_proc): the reference point the offset is relative to. */
+typedef enum
+{
+    DRWAV_SEEK_SET,     /* Relative to the start of the stream. */
+    DRWAV_SEEK_CUR,     /* Relative to the current position. */
+    DRWAV_SEEK_END      /* NOTE(review): presumably relative to the end of the stream - confirm against the implementation. */
+} drwav_seek_origin;
+
+/*
+Encapsulation (container) formats, matching the "Supported Encapsulations" list at the top of this file. As noted
+there, AIFF and RIFX do not support write mode.
+*/
+typedef enum
+{
+    drwav_container_riff,   /* RIFF (regular WAV files). */
+    drwav_container_rifx,   /* RIFX (big-endian). */
+    drwav_container_w64,    /* W64 (Sony Wave64). */
+    drwav_container_rf64,   /* RF64. */
+    drwav_container_aiff    /* AIFF. */
+} drwav_container;
+
+/* Basic header information about a chunk, as handed to the chunk callback (see drwav_chunk_proc). */
+typedef struct
+{
+    /*
+    Chunk identifier. Per the drwav_chunk_proc notes, use `fourcc` for drwav_container_riff and
+    drwav_container_rf64, and `guid` otherwise.
+    */
+    union
+    {
+        drwav_uint8 fourcc[4];
+        drwav_uint8 guid[16];
+    } id;
+
+    /* The size in bytes of the chunk. */
+    drwav_uint64 sizeInBytes;
+
+    /*
+    RIFF = 2 byte alignment.
+    W64  = 8 byte alignment.
+    */
+    unsigned int paddingSize;
+} drwav_chunk_header;
+
+typedef struct
+{
+    /*
+    The format tag exactly as specified in the wave file's "fmt" chunk. This can be used by applications
+    that require support for data formats not natively supported by dr_wav.
+    */
+    drwav_uint16 formatTag;
+
+    /* The number of channels making up the audio data. When this is set to 1 it is mono, 2 is stereo, etc. */
+    drwav_uint16 channels;
+
+    /* The sample rate. Usually set to something like 44100. */
+    drwav_uint32 sampleRate;
+
+    /* Average bytes per second. You probably don't need this, but it's left here for informational purposes. */
+    drwav_uint32 avgBytesPerSec;
+
+    /* Block align. This is equal to the number of channels * bytes per sample. */
+    drwav_uint16 blockAlign;
+
+    /* Bits per sample. */
+    drwav_uint16 bitsPerSample;
+
+    /* The size of the extended data. Only used internally for validation, but left here for informational purposes. */
+    drwav_uint16 extendedSize;
+
+    /*
+    The number of valid bits per sample. When <formatTag> is equal to WAVE_FORMAT_EXTENSIBLE, <bitsPerSample>
+    is always rounded up to the nearest multiple of 8. This variable contains information about exactly how
+    many bits are valid per sample. Mainly used for informational purposes.
+    */
+    drwav_uint16 validBitsPerSample;
+
+    /* The channel mask. Not used at the moment. */
+    drwav_uint32 channelMask;
+
+    /* The sub-format, exactly as specified by the wave file. */
+    drwav_uint8 subFormat[16];
+} drwav_fmt;
+
+DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT);
+
+
+/*
+Callback for when data is read. Return value is the number of bytes actually read.
+
+pUserData   [in]  The user data that was passed to drwav_init() and family.
+pBufferOut  [out] The output buffer.
+bytesToRead [in]  The number of bytes to read.
+
+Returns the number of bytes actually read.
+
+A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until
+either the entire bytesToRead is filled or you have reached the end of the stream.
+*/
+typedef size_t (* drwav_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
+
+/*
+Callback for when data is written. Return value is the number of bytes actually written.
+
+pUserData    [in]  The user data that was passed to drwav_init_write() and family.
+pData        [in]  A pointer to the data to write.
+bytesToWrite [in]  The number of bytes to write.
+
+Returns the number of bytes actually written.
+
+If the return value differs from bytesToWrite, it indicates an error.
+*/
+typedef size_t (* drwav_write_proc)(void* pUserData, const void* pData, size_t bytesToWrite);
+
+/*
+Callback for when data needs to be seeked.
+
+pUserData [in] The user data that was passed to drwav_init() and family.
+offset    [in] The number of bytes to move, relative to the origin. Will never be negative.
+origin    [in] The origin of the seek - the current position or the start of the stream.
+
+Returns whether or not the seek was successful.
+
+Whether or not it is relative to the beginning or current position is determined by the "origin" parameter which will be either DRWAV_SEEK_SET or
+DRWAV_SEEK_CUR.
+*/
+typedef drwav_bool32 (* drwav_seek_proc)(void* pUserData, int offset, drwav_seek_origin origin);
+
+/*
+Callback for when the current position in the stream needs to be retrieved.
+
+pUserData [in]  The user data that was passed to drwav_init() and family.
+pCursor   [out] A pointer to a variable to receive the current position in the stream.
+
+Returns whether or not the operation was successful.
+*/
+typedef drwav_bool32 (* drwav_tell_proc)(void* pUserData, drwav_int64* pCursor);
+
+/*
+Callback for when drwav_init_ex() finds a chunk.
+
+pChunkUserData    [in] The user data that was passed to the pChunkUserData parameter of drwav_init_ex() and family.
+onRead            [in] A pointer to the function to call when reading.
+onSeek            [in] A pointer to the function to call when seeking.
+pReadSeekUserData [in] The user data that was passed to the pReadSeekUserData parameter of drwav_init_ex() and family.
+pChunkHeader      [in] A pointer to an object containing basic header information about the chunk. Use this to identify the chunk.
+container         [in] Whether or not the WAV file is a RIFF or Wave64 container. If you're unsure of the difference, assume RIFF.
+pFMT              [in] A pointer to the object containing the contents of the "fmt" chunk.
+
+Returns the number of bytes read + seeked.
+
+To read data from the chunk, call onRead(), passing in pReadSeekUserData as the first parameter. Do the same for seeking with onSeek(). The return value must
+be the total number of bytes you have read _plus_ seeked.
+
+Use the `container` argument to discriminate the fields in `pChunkHeader->id`. If the container is `drwav_container_riff` or `drwav_container_rf64` you should
+use `id.fourcc`, otherwise you should use `id.guid`.
+
+The `pFMT` parameter can be used to determine the data format of the wave file. Use `drwav_fmt_get_format()` to get the sample format, which will be one of the
+`DR_WAVE_FORMAT_*` identifiers.
+
+The read pointer will be sitting on the first byte after the chunk's header. You must not attempt to read beyond the boundary of the chunk.
+*/
+typedef drwav_uint64 (* drwav_chunk_proc)(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_chunk_header* pChunkHeader, drwav_container container, const drwav_fmt* pFMT);
+
+
+/* Structure for internal use. Only used for loaders opened with drwav_init_memory(). */
+typedef struct
+{
+    const drwav_uint8* data;    /* Read-only byte buffer. NOTE(review): presumably the caller's in-memory WAV data - confirm. */
+    size_t dataSize;            /* NOTE(review): presumably the size of `data` in bytes - confirm. */
+    size_t currentReadPos;      /* NOTE(review): presumably the byte offset of the next read - confirm. */
+} drwav__memory_stream;
+
+/* Structure for internal use. Only used for writers opened with drwav_init_memory_write(). */
+typedef struct
+{
+    void** ppData;              /* NOTE(review): presumably where the output buffer pointer is published to the caller - confirm. */
+    size_t* pDataSize;          /* NOTE(review): presumably where the output size is published to the caller - confirm. */
+    size_t dataSize;            /* NOTE(review): presumably the number of bytes written so far - confirm. */
+    size_t dataCapacity;        /* NOTE(review): presumably the allocated size of the output buffer - confirm. */
+    size_t currentWritePos;     /* NOTE(review): presumably the byte offset of the next write - confirm. */
+} drwav__memory_stream_write;
+
+/*
+Describes the format of a file to be written. Filled in by the caller and passed to the write initializers such as
+drwav_init_file_write() (see the writing example in the introduction at the top of this file).
+*/
+typedef struct
+{
+    drwav_container container;  /* RIFF, W64. */
+    drwav_uint32 format;        /* DR_WAVE_FORMAT_* */
+    drwav_uint32 channels;      /* Channel count, e.g. 2 in the introduction example. */
+    drwav_uint32 sampleRate;    /* Sample rate, e.g. 44100 in the introduction example. */
+    drwav_uint32 bitsPerSample; /* Bits per sample, e.g. 16 in the introduction example. */
+} drwav_data_format;
+
+typedef enum  /* Bit flags identifying metadata kinds; also stored in drwav_metadata::type to select the valid union member. */
+{
+    drwav_metadata_type_none                        = 0,
+
+    /*
+    Unknown simply means a chunk that drwav does not handle specifically. You can still ask to
+    receive these chunks as metadata objects. It is then up to you to interpret the chunk's data.
+    You can also write unknown metadata to a wav file. Be careful writing unknown chunks if you
+    have also edited the audio data. The unknown chunks could represent offsets/sizes that no
+    longer correctly correspond to the audio data.
+    */
+    drwav_metadata_type_unknown                     = 1 << 0,
+
+    /* Only 1 of each of these metadata items is allowed in a wav file. */
+    drwav_metadata_type_smpl                        = 1 << 1,
+    drwav_metadata_type_inst                        = 1 << 2,
+    drwav_metadata_type_cue                         = 1 << 3,
+    drwav_metadata_type_acid                        = 1 << 4,
+    drwav_metadata_type_bext                        = 1 << 5,
+
+    /*
+    Wav files often have a LIST chunk. This is a chunk that contains a set of subchunks. For this
+    higher-level metadata API, we don't make a distinction between a regular chunk and a LIST
+    subchunk. Instead, they are all just 'metadata' items.
+
+    There can be multiple of these metadata items in a wav file.
+    */
+    drwav_metadata_type_list_label                  = 1 << 6,
+    drwav_metadata_type_list_note                   = 1 << 7,
+    drwav_metadata_type_list_labelled_cue_region    = 1 << 8,
+
+    drwav_metadata_type_list_info_software          = 1 << 9,
+    drwav_metadata_type_list_info_copyright         = 1 << 10,
+    drwav_metadata_type_list_info_title             = 1 << 11,
+    drwav_metadata_type_list_info_artist            = 1 << 12,
+    drwav_metadata_type_list_info_comment           = 1 << 13,
+    drwav_metadata_type_list_info_date              = 1 << 14,
+    drwav_metadata_type_list_info_genre             = 1 << 15,
+    drwav_metadata_type_list_info_album             = 1 << 16,
+    drwav_metadata_type_list_info_tracknumber       = 1 << 17,
+    drwav_metadata_type_list_info_location          = 1 << 18,
+    drwav_metadata_type_list_info_organization      = 1 << 19,
+    drwav_metadata_type_list_info_keywords          = 1 << 20,
+    drwav_metadata_type_list_info_medium            = 1 << 21,
+    drwav_metadata_type_list_info_description       = 1 << 22,
+
+    /*
+    Other type constants for convenience. Every individual type above occupies its own bit, so
+    the two groups below are plain bitwise ORs of the corresponding values.
+    */
+    drwav_metadata_type_list_all_info_strings       = drwav_metadata_type_list_info_software
+                                                    | drwav_metadata_type_list_info_copyright
+                                                    | drwav_metadata_type_list_info_title
+                                                    | drwav_metadata_type_list_info_artist
+                                                    | drwav_metadata_type_list_info_comment
+                                                    | drwav_metadata_type_list_info_date
+                                                    | drwav_metadata_type_list_info_genre
+                                                    | drwav_metadata_type_list_info_album
+                                                    | drwav_metadata_type_list_info_tracknumber
+                                                    | drwav_metadata_type_list_info_location
+                                                    | drwav_metadata_type_list_info_organization
+                                                    | drwav_metadata_type_list_info_keywords
+                                                    | drwav_metadata_type_list_info_medium
+                                                    | drwav_metadata_type_list_info_description,
+
+    drwav_metadata_type_list_all_adtl               = drwav_metadata_type_list_label
+                                                    | drwav_metadata_type_list_note
+                                                    | drwav_metadata_type_list_labelled_cue_region,
+
+    drwav_metadata_type_all                         = -2,   /* Written as a negative literal; intended value is 0xFFFFFFFF & ~drwav_metadata_type_unknown (every bit except bit 0). */
+    drwav_metadata_type_all_including_unknown       = -1    /* Written as a negative literal; intended value is 0xFFFFFFFF (every bit set). */
+} drwav_metadata_type;
+
+/*
+Sampler Metadata
+
+The sampler chunk contains information about how a sound should be played in the context of a whole
+audio production, and when used in a sampler. See https://en.wikipedia.org/wiki/Sample-based_synthesis.
+
+The enum directly below lists the loop types that drwav_smpl_loop::type refers to.
+*/
+typedef enum
+{
+    drwav_smpl_loop_type_forward  = 0,
+    drwav_smpl_loop_type_pingpong = 1,
+    drwav_smpl_loop_type_backward = 2
+} drwav_smpl_loop_type;
+
+typedef struct  /* A single sampler loop; this is the element type of drwav_smpl::pLoops. */
+{
+    /*
+    The ID of the associated cue point, see drwav_cue and drwav_cue_point. As with all cue point
+    IDs, this can correspond to a label chunk to give this loop a name, see drwav_list_label_or_note.
+    */
+    drwav_uint32 cuePointId;
+
+    /* See drwav_smpl_loop_type. Stored as a plain 32-bit integer rather than as the enum type. */
+    drwav_uint32 type;
+
+    /* The offset of the first sample to be played in the loop. */
+    drwav_uint32 firstSampleOffset;
+
+    /* The offset into the audio data of the last sample to be played in the loop. */
+    drwav_uint32 lastSampleOffset;
+
+    /*
+    A value to represent that playback should occur at a point between samples. This value ranges
+    from 0 to UINT32_MAX, where a value of 0 means no fraction, and a value of (UINT32_MAX / 2)
+    would mean half a sample.
+    */
+    drwav_uint32 sampleFraction;
+
+    /* Number of times to play the loop. 0 means loop infinitely. */
+    drwav_uint32 playCount;
+} drwav_smpl_loop;
+
+typedef struct  /* Sampler metadata; available as drwav_metadata::data.smpl. */
+{
+    /* IDs for a particular MIDI manufacturer. 0 if not used. */
+    drwav_uint32 manufacturerId;
+    drwav_uint32 productId;
+
+    /* The period of 1 sample in nanoseconds. */
+    drwav_uint32 samplePeriodNanoseconds;
+
+    /* The MIDI root note of this file. 0 to 127. */
+    drwav_uint32 midiUnityNote;
+
+    /*
+    The fraction of a semitone up from the given MIDI note. This is a value from 0 to UINT32_MAX,
+    where 0 means no change and (UINT32_MAX / 2) is half a semitone (AKA 50 cents).
+    */
+    drwav_uint32 midiPitchFraction;
+
+    /* Data relating to SMPTE standards which are used for syncing audio and video. 0 if not used. */
+    drwav_uint32 smpteFormat;
+    drwav_uint32 smpteOffset;
+
+    /* Number of drwav_smpl_loop loops. Presumably the element count of pLoops - confirm in the implementation. */
+    drwav_uint32 sampleLoopCount;
+
+    /* Size of the optional sampler-specific data. Presumably the byte length of pSamplerSpecificData - confirm in the implementation. */
+    drwav_uint32 samplerSpecificDataSizeInBytes;
+
+    drwav_smpl_loop* pLoops;            /* The loops themselves, see drwav_smpl_loop. */
+    drwav_uint8* pSamplerSpecificData;  /* The optional sampler-specific data. */
+} drwav_smpl;
+
+/*
+Instrument Metadata
+
+The inst metadata contains data about how a sound should be played as part of an instrument. This
+is commonly read by samplers. See https://en.wikipedia.org/wiki/Sample-based_synthesis.
+
+Every field is a signed 8-bit value; the documented ranges are listed next to each field.
+*/
+typedef struct
+{
+    drwav_int8 midiUnityNote;   /* The root note of the audio as a MIDI note number. 0 to 127. */
+    drwav_int8 fineTuneCents;   /* -50 to +50 */
+    drwav_int8 gainDecibels;    /* -64 to +64 */
+    drwav_int8 lowNote;         /* 0 to 127 */
+    drwav_int8 highNote;        /* 0 to 127 */
+    drwav_int8 lowVelocity;     /* 1 to 127 */
+    drwav_int8 highVelocity;    /* 1 to 127 */
+} drwav_inst;
+
+/*
+Cue Metadata
+
+Cue points are markers at specific points in the audio. They often come with an associated piece of
+drwav_list_label_or_note metadata which contains the text for the marker.
+
+drwav_cue_point describes one marker; drwav_cue holds the array of them.
+*/
+typedef struct
+{
+    /* Unique identification value. Other metadata refers back to it through their cuePointId fields. */
+    drwav_uint32 id;
+
+    /* Set to 0. This is only relevant if there is a 'playlist' chunk - which is not supported by dr_wav. */
+    drwav_uint32 playOrderPosition;
+
+    /*
+    Should always be "data". This represents the fourcc value of the chunk that this cue point
+    corresponds to. dr_wav only supports a single data chunk so this should always be "data".
+    */
+    drwav_uint8 dataChunkId[4];
+
+    /* Set to 0. This is only relevant if there is a wave list chunk. dr_wav, like lots of readers/writers, does not support this. */
+    drwav_uint32 chunkStart;
+
+    /* Set to 0 for uncompressed formats. Else the last byte in compressed wave data where decompression can begin to find the value of the corresponding sample value. */
+    drwav_uint32 blockStart;
+
+    /* For uncompressed formats this is the offset of the cue point into the audio data. For compressed formats this is relative to the block specified with blockStart. */
+    drwav_uint32 sampleOffset;
+} drwav_cue_point;
+
+typedef struct  /* Cue metadata: a set of drwav_cue_point markers; available as drwav_metadata::data.cue. */
+{
+    drwav_uint32 cuePointCount;     /* Presumably the number of entries in pCuePoints - confirm in the implementation. */
+    drwav_cue_point *pCuePoints;    /* The cue points, see drwav_cue_point. */
+} drwav_cue;
+
+/*
+Acid Metadata
+
+This chunk contains some information about the time signature and the tempo of the audio.
+
+The enum directly below lists the individual bits of the drwav_acid::flags bit-field. Each value is
+a distinct power of two, so they can be combined with bitwise OR.
+*/
+typedef enum
+{
+    drwav_acid_flag_one_shot      = 1,  /* If this is not set, then it is a loop instead of a one-shot. */
+    drwav_acid_flag_root_note_set = 2,  /* When set, drwav_acid::midiUnityNote is valid. */
+    drwav_acid_flag_stretch       = 4,
+    drwav_acid_flag_disk_based    = 8,
+    drwav_acid_flag_acidizer      = 16  /* Not sure what this means. */
+} drwav_acid_flag;
+
+typedef struct  /* Acid metadata (tempo and time signature); available as drwav_metadata::data.acid. */
+{
+    /* A bit-field, see drwav_acid_flag. */
+    drwav_uint32 flags;
+
+    /* Valid if flags contains drwav_acid_flag_root_note_set. It represents the MIDI root note of the file - a value from 0 to 127. */
+    drwav_uint16 midiUnityNote;
+
+    /* Reserved values that should probably be ignored. reserved1 seems to often be 128 and reserved2 is 0. */
+    drwav_uint16 reserved1;
+    float reserved2;
+
+    /* Number of beats. */
+    drwav_uint32 numBeats;
+
+    /* The time signature of the audio. Note that the denominator is declared before the numerator. */
+    drwav_uint16 meterDenominator;
+    drwav_uint16 meterNumerator;
+
+    /* Beats per minute of the track. Setting a value of 0 suggests that there is no tempo. */
+    float tempo;
+} drwav_acid;
+
+/*
+Cue Label or Note metadata
+
+These are 2 different types of metadata, but they have the exact same format. Labels tend to be the
+more common and represent a short name for a cue point. Notes might be used to represent a longer
+comment.
+
+This struct is the labelOrNote member of the drwav_metadata union.
+*/
+typedef struct
+{
+    /* The ID of a cue point that this label or note corresponds to, see drwav_cue_point::id. */
+    drwav_uint32 cuePointId;
+
+    /* Size of the string not including any null terminator. */
+    drwav_uint32 stringLength;
+
+    /* The string. The *init_with_metadata functions null terminate this for convenience. */
+    char* pString;
+} drwav_list_label_or_note;
+
+/*
+BEXT metadata, also known as Broadcast Wave Format (BWF)
+
+This metadata adds some extra description to an audio file. You must check the version field to
+determine if the UMID or the loudness fields are valid.
+*/
+typedef struct
+{
+    /*
+    These top 3 fields, and the umid field, are actually defined in the standard as statically
+    sized buffers. In order to reduce the size of this struct (and therefore the union in the
+    metadata struct), we instead store these as pointers.
+    */
+    char* pDescription;                 /* Can be NULL or a null-terminated string, must be <= 256 characters. */
+    char* pOriginatorName;              /* Can be NULL or a null-terminated string, must be <= 32 characters. */
+    char* pOriginatorReference;         /* Can be NULL or a null-terminated string, must be <= 32 characters. */
+    char  pOriginationDate[10];         /* ASCII "yyyy:mm:dd". The pattern fills all 10 bytes, so there is no room for a null terminator. */
+    char  pOriginationTime[8];          /* ASCII "hh:mm:ss". The pattern fills all 8 bytes, so there is no room for a null terminator. */
+    drwav_uint64 timeReference;         /* First sample count since midnight. */
+    drwav_uint16 version;               /* Version of the BWF, check this to see if the fields below are valid. */
+
+    /*
+    Unrestricted ASCII characters containing a collection of strings terminated by CR/LF. Each
+    string shall contain a description of a coding process applied to the audio data.
+    */
+    char* pCodingHistory;
+    drwav_uint32 codingHistorySize;     /* Presumably the size in bytes of pCodingHistory - confirm in the implementation. */
+
+    /* Fields below this point are only valid if the version is 1 or above. */
+    drwav_uint8* pUMID;                  /* Exactly 64 bytes of SMPTE UMID */
+
+    /* Fields below this point are only valid if the version is 2 or above. */
+    drwav_uint16 loudnessValue;         /* Integrated Loudness Value of the file in LUFS (multiplied by 100). */
+    drwav_uint16 loudnessRange;         /* Loudness Range of the file in LU (multiplied by 100). */
+    drwav_uint16 maxTruePeakLevel;      /* Maximum True Peak Level of the file expressed as dBTP (multiplied by 100). */
+    drwav_uint16 maxMomentaryLoudness;  /* Highest value of the Momentary Loudness Level of the file in LUFS (multiplied by 100). */
+    drwav_uint16 maxShortTermLoudness;  /* Highest value of the Short-Term Loudness Level of the file in LUFS (multiplied by 100). */
+} drwav_bext;
+
+/*
+Info Text Metadata
+
+There are many different types of information text that can be saved in this format. This is where
+things like the album name, the artists, the year it was produced, etc. are saved. See
+drwav_metadata_type for the full list of types that dr_wav supports.
+
+This struct is the infoText member of the drwav_metadata union and is shared by all of the list
+info types.
+*/
+typedef struct
+{
+    /* Size of the string not including any null terminator. */
+    drwav_uint32 stringLength;
+
+    /* The string. The *init_with_metadata functions null terminate this for convenience. */
+    char* pString;
+} drwav_list_info_text;
+
+/*
+Labelled Cue Region Metadata
+
+The labelled cue region metadata is used to associate some region of audio with text. The region
+starts at a cue point, and extends for the given number of samples.
+
+This struct is the labelledCueRegion member of the drwav_metadata union.
+*/
+typedef struct
+{
+    /* The ID of a cue point that this object corresponds to, see drwav_cue_point::id. */
+    drwav_uint32 cuePointId;
+
+    /* The number of samples from the cue point forwards that should be considered this region. */
+    drwav_uint32 sampleLength;
+
+    /* Four characters used to say what the purpose of this region is. */
+    drwav_uint8 purposeId[4];
+
+    /* Unsure of the exact meanings of these. It appears to be acceptable to set them all to 0. */
+    drwav_uint16 country;
+    drwav_uint16 language;
+    drwav_uint16 dialect;
+    drwav_uint16 codePage;
+
+    /* Size of the string not including any null terminator. */
+    drwav_uint32 stringLength;
+
+    /* The string. The *init_with_metadata functions null terminate this for convenience. */
+    char* pString;
+} drwav_list_labelled_cue_region;
+
+/*
+Unknown Metadata
+
+This chunk just represents a type of chunk that dr_wav does not understand.
+
+Unknown metadata has a location attached to it. This is because wav files can have a LIST chunk
+that contains subchunks. These LIST chunks can be one of two types: an adtl list, or an INFO
+list. This enum is used to specify the location of a chunk that dr_wav currently doesn't support.
+
+The enumerators have no explicit values, so drwav_metadata_location_invalid is 0 and the rest
+count upwards from there.
+*/
+typedef enum
+{
+    drwav_metadata_location_invalid,
+    drwav_metadata_location_top_level,
+    drwav_metadata_location_inside_info_list,
+    drwav_metadata_location_inside_adtl_list
+} drwav_metadata_location;
+
+typedef struct  /* A chunk dr_wav does not interpret; available as drwav_metadata::data.unknown. */
+{
+    drwav_uint8 id[4];                      /* Four bytes identifying the chunk. Presumably its fourcc - confirm in the implementation. */
+    drwav_metadata_location chunkLocation;  /* Where the chunk sits in the file, see drwav_metadata_location. */
+    drwav_uint32 dataSizeInBytes;           /* Presumably the size in bytes of pData - confirm in the implementation. */
+    drwav_uint8* pData;                     /* The chunk data; interpreting it is left to the caller. */
+} drwav_unknown_metadata;
+
+/*
+Metadata is saved as a union of all the supported types.
+
+The type field acts as the tag for the union. drwav::pMetadata points to an array of these
+objects, and the *_with_metadata write functions take such an array as input.
+*/
+typedef struct
+{
+    /* Determines which item in the union is valid. */
+    drwav_metadata_type type;
+
+    union
+    {
+        drwav_cue cue;
+        drwav_smpl smpl;
+        drwav_acid acid;
+        drwav_inst inst;
+        drwav_bext bext;
+        drwav_list_label_or_note labelOrNote;   /* List label or list note. */
+        drwav_list_labelled_cue_region labelledCueRegion;
+        drwav_list_info_text infoText;          /* Any of the list info types. */
+        drwav_unknown_metadata unknown;
+    } data;
+} drwav_metadata;
+
+typedef struct  /* State of one open WAV reader or writer. Set up by the drwav_init*() functions and released with drwav_uninit(). */
+{
+    /* A pointer to the function to call when more data is needed. */
+    drwav_read_proc onRead;
+
+    /* A pointer to the function to call when data needs to be written. Only used when the drwav object is opened in write mode. */
+    drwav_write_proc onWrite;
+
+    /* A pointer to the function to call when the wav file needs to be seeked. */
+    drwav_seek_proc onSeek;
+
+    /* A pointer to the function to call when the position of the stream needs to be retrieved. */
+    drwav_tell_proc onTell;
+
+    /* The user data to pass to callbacks. */
+    void* pUserData;
+
+    /* Allocation callbacks. */
+    drwav_allocation_callbacks allocationCallbacks;
+
+
+    /* Whether or not the WAV file is formatted as a standard RIFF file or W64. */
+    drwav_container container;
+
+
+    /* Structure containing format information exactly as specified by the wav file. */
+    drwav_fmt fmt;
+
+    /* The sample rate. Will be set to something like 44100. */
+    drwav_uint32 sampleRate;
+
+    /* The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. */
+    drwav_uint16 channels;
+
+    /* The bits per sample. Will be set to something like 16, 24, etc. */
+    drwav_uint16 bitsPerSample;
+
+    /* Equal to fmt.formatTag, or the value specified by fmt.subFormat if fmt.formatTag is equal to 65534 (WAVE_FORMAT_EXTENSIBLE). */
+    drwav_uint16 translatedFormatTag;
+
+    /* The total number of PCM frames making up the audio data. */
+    drwav_uint64 totalPCMFrameCount;
+
+
+    /* The size in bytes of the data chunk. */
+    drwav_uint64 dataChunkDataSize;
+
+    /* The position in the stream of the first data byte of the data chunk. This is used for seeking. */
+    drwav_uint64 dataChunkDataPos;
+
+    /* The number of bytes remaining in the data chunk. */
+    drwav_uint64 bytesRemaining;
+
+    /* The current read position in PCM frames. */
+    drwav_uint64 readCursorInPCMFrames;
+
+
+    /*
+    Only used in sequential write mode. Keeps track of the desired size of the "data" chunk at the point of initialization time. Always
+    set to 0 for non-sequential writes and when the drwav object is opened in read mode. Used for validation.
+    */
+    drwav_uint64 dataChunkDataSizeTargetWrite;
+
+    /* Keeps track of whether or not the wav writer was initialized in sequential mode. */
+    drwav_bool32 isSequentialWrite;
+
+
+    /*
+    An array of metadata. This is valid after the *init_with_metadata call returns. It will be valid until drwav_uninit() is called.
+    You can take ownership of this data with drwav_take_ownership_of_metadata().
+    */
+    drwav_metadata* pMetadata;
+    drwav_uint32 metadataCount;
+
+
+    /* A hack to avoid a DRWAV_MALLOC() when opening a decoder with drwav_init_memory(). */
+    drwav__memory_stream memoryStream;
+    drwav__memory_stream_write memoryStreamWrite;
+
+
+    /* Microsoft ADPCM specific data. */
+    struct
+    {
+        drwav_uint32 bytesRemainingInBlock;
+        drwav_uint16 predictor[2];
+        drwav_int32  delta[2];
+        drwav_int32  cachedFrames[4];  /* Samples are stored in this cache during decoding. */
+        drwav_uint32 cachedFrameCount;
+        drwav_int32  prevFrames[2][2]; /* The previous 2 samples for each channel (2 channels at most). */
+    } msadpcm;
+
+    /* IMA ADPCM specific data. */
+    struct
+    {
+        drwav_uint32 bytesRemainingInBlock;
+        drwav_int32  predictor[2];
+        drwav_int32  stepIndex[2];
+        drwav_int32  cachedFrames[16]; /* Samples are stored in this cache during decoding. */
+        drwav_uint32 cachedFrameCount;
+    } ima;
+
+    /* AIFF specific data. */
+    struct
+    {
+        drwav_bool8 isLE;   /* Will be set to true if the audio data is little-endian encoded. */
+        drwav_bool8 isUnsigned; /* Only used for 8-bit samples. When set to true, will be treated as unsigned. */
+    } aiff;
+} drwav;
+
+
+/*
+Initializes a pre-allocated drwav object for reading.
+
+pWav                             [out]          A pointer to the drwav object being initialized.
+onRead                           [in]           The function to call when data needs to be read from the client.
+onSeek                           [in]           The function to call when the read position of the client data needs to move.
+onTell                           [in]           The function to call when the position of the stream needs to be retrieved.
+onChunk                          [in, optional] The function to call when a chunk is enumerated at initialization time.
+pUserData, pReadSeekTellUserData [in, optional] A pointer to application defined data that will be passed to onRead, onSeek and onTell.
+pChunkUserData                   [in, optional] A pointer to application defined data that will be passed to onChunk.
+flags                            [in, optional] A set of flags for controlling how things are loaded.
+pAllocationCallbacks             [in]           Allocation callbacks for the drwav object.
+
+NOTE(review): whether onTell and pAllocationCallbacks may be NULL is not visible in this header - confirm in the implementation.
+
+Returns true if successful; false otherwise.
+
+Close the loader with drwav_uninit().
+
+This is the lowest level function for initializing a WAV file. You can also use drwav_init_file() and drwav_init_memory()
+to open the stream from a file or from a block of memory respectively.
+
+Possible values for flags:
+  DRWAV_SEQUENTIAL: Never perform a backwards seek while loading. This disables the chunk callback and will cause this function
+                    to return as soon as the data chunk is found. Any chunks after the data chunk will be ignored.
+
+drwav_init() is equivalent to "drwav_init_ex(pWav, onRead, onSeek, onTell, NULL, pUserData, NULL, 0, pAllocationCallbacks);".
+
+drwav_init_with_metadata() takes the same arguments as drwav_init() plus flags. The metadata it loads is made available through
+pWav->pMetadata and pWav->metadataCount.
+
+The onChunk callback is not called for the WAVE or FMT chunks. The contents of the FMT chunk can be read from pWav->fmt
+after the function returns.
+
+See also: drwav_init_file(), drwav_init_memory(), drwav_uninit()
+*/
+DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, drwav_chunk_proc onChunk, void* pReadSeekTellUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_with_metadata(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Initializes a pre-allocated drwav object for writing.
+
+pWav                     [out]          A pointer to the drwav object being initialized.
+pFormat                  [in]           The format of the data to be written, see drwav_data_format.
+onWrite                  [in]           The function to call when data needs to be written.
+onSeek                   [in]           The function to call when the write position needs to move.
+pUserData                [in, optional] A pointer to application defined data that will be passed to onWrite and onSeek.
+pMetadata, metadataCount [in, optional] An array of metadata objects that should be written to the file. The array is not edited. You are responsible for this metadata memory and it must remain valid until drwav_uninit() is called.
+
+Returns true if successful; false otherwise.
+
+Close the writer with drwav_uninit().
+
+This is the lowest level function for initializing a WAV file. You can also use drwav_init_file_write() and drwav_init_memory_write()
+to open the stream from a file or from a block of memory respectively.
+
+If the total sample count is known, you can use drwav_init_write_sequential(). This avoids the need for dr_wav to perform
+a post-processing step for storing the total sample count and the size of the data chunk which requires a backwards seek.
+Accordingly, the two sequential variants take no onSeek callback; the total is passed as totalSampleCount or as
+totalPCMFrameCount depending on the variant.
+
+See also: drwav_init_file_write(), drwav_init_memory_write(), drwav_uninit()
+*/
+DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_write_with_metadata(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks, drwav_metadata* pMetadata, drwav_uint32 metadataCount);
+
+/*
+Utility function to determine the target size of the entire data to be written (including all headers and chunks).
+
+pFormat                  [in]           The format of the data to be written, see drwav_data_format.
+totalFrameCount          [in]           The number of frames that will be written.
+pMetadata, metadataCount [in, optional] The metadata that will be written. pMetadata can be NULL, meaning no metadata exists.
+
+Returns the target size in bytes.
+
+Useful if the application needs to know the size to allocate.
+
+Only writing to the RIFF chunk and one data chunk is currently supported.
+
+See also: drwav_init_write(), drwav_init_file_write(), drwav_init_memory_write()
+*/
+DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalFrameCount, drwav_metadata* pMetadata, drwav_uint32 metadataCount);
+
+/*
+Take ownership of the metadata objects that were allocated via one of the init_with_metadata() function calls. The
+init_with_metadata() functions perform a single heap allocation for this metadata.
+
+Useful if you want the data to persist beyond the lifetime of the drwav object.
+
+You must free the data returned from this function using drwav_free().
+*/
+DRWAV_API drwav_metadata* drwav_take_ownership_of_metadata(drwav* pWav);
+
+/*
+Uninitializes the given drwav object.
+
+Use this only for objects initialized with drwav_init*() functions (drwav_init(), drwav_init_ex(), drwav_init_write(), drwav_init_write_sequential()).
+
+Returns a drwav_result code. Metadata exposed through pWav->pMetadata is only valid until this is called, unless ownership
+was taken with drwav_take_ownership_of_metadata().
+*/
+DRWAV_API drwav_result drwav_uninit(drwav* pWav);
+
+
+/*
+Reads raw audio data.
+
+This is the lowest level function for reading audio data. It simply reads the given number of
+bytes of the raw internal sample data.
+
+Consider using drwav_read_pcm_frames_s16(), drwav_read_pcm_frames_s32() or drwav_read_pcm_frames_f32() for
+reading sample data in a consistent format.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of bytes actually read.
+
+See also: drwav_read_pcm_frames(), drwav_write_raw()
+*/
+DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut);
+
+/*
+Reads up to the specified number of PCM frames from the WAV file.
+
+The output data will be in the file's internal format, converted to native-endian byte order. Use
+drwav_read_pcm_frames_s16/f32/s32() to read data in a specific format.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached or
+you have requested more PCM frames than can possibly fit in the output buffer.
+
+This function will only work when sample data is of a fixed size and uncompressed. If you are
+using a compressed format consider using drwav_read_raw() or drwav_read_pcm_frames_s16/s32/f32().
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+NOTE(review): the _le and _be variants presumably produce little-endian and big-endian output
+instead of native-endian - confirm in the implementation.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+
+/*
+Seeks to the given PCM frame.
+
+targetFrameIndex is the index of the PCM frame to seek to.
+
+Returns true if successful; false otherwise.
+*/
+DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex);
+
+/*
+Retrieves the current read position in PCM frames.
+
+The position is returned through pCursor; the function's own return value is a drwav_result code.
+*/
+DRWAV_API drwav_result drwav_get_cursor_in_pcm_frames(drwav* pWav, drwav_uint64* pCursor);
+
+/*
+Retrieves the length of the file in PCM frames.
+
+The length is returned through pLength; the function's own return value is a drwav_result code.
+*/
+DRWAV_API drwav_result drwav_get_length_in_pcm_frames(drwav* pWav, drwav_uint64* pLength);
+
+
+/*
+Writes raw audio data.
+
+Returns the number of bytes actually written. If this differs from bytesToWrite, it indicates an error.
+
+See also: drwav_write_pcm_frames(), drwav_read_raw()
+*/
+DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData);
+
+/*
+Writes PCM frames.
+
+Returns the number of PCM frames written.
+
+Input samples need to be in native-endian byte order. On big-endian architectures the input data will be converted to
+little-endian. Use drwav_write_raw() to write raw audio data without performing any conversion.
+
+NOTE(review): the _le and _be variants presumably expect the input to already be little-endian and big-endian
+respectively - confirm in the implementation.
+*/
+DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
+DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
+DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
+
+/* Conversion Utilities */
+#ifndef DR_WAV_NO_CONVERSION_API
+
+/*
+Reads a chunk of audio data and converts it to signed 16-bit PCM samples.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of PCM frames actually read.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached.
+
+NOTE(review): the s16le and s16be variants presumably fix the byte order of the output samples -
+confirm in the implementation.
+
+The low-level converters declared after the read functions do not take a drwav object. Each one
+takes an output pointer, an input pointer and a sample count.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+
+/* Low-level function for converting unsigned 8-bit PCM samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 24-bit PCM samples to signed 16-bit PCM samples. The input is passed as a byte pointer. */
+DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 32-bit PCM samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 32-bit floating point samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 64-bit floating point samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount);
+
+/* Low-level function for converting A-law samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting u-law samples to signed 16-bit PCM samples. */
+DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+
+/*
+Reads a chunk of audio data and converts it to IEEE 32-bit floating point samples.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of PCM frames actually read.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached.
+
+NOTE(review): the f32le and f32be variants presumably fix the byte order of the output samples -
+confirm in the implementation.
+
+The low-level converters declared after the read functions do not take a drwav object. Each one
+takes an output pointer, an input pointer and a sample count.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+
+/* Low-level function for converting unsigned 8-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 16-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 24-bit PCM samples to IEEE 32-bit floating point samples. The input is passed as a byte pointer. */
+DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 32-bit PCM samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 64-bit floating point samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount);
+
+/* Low-level function for converting A-law samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting u-law samples to IEEE 32-bit floating point samples. */
+DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+
+/*
+Reads a chunk of audio data and converts it to signed 32-bit PCM samples.
+
+pBufferOut can be NULL in which case a seek will be performed.
+
+Returns the number of PCM frames actually read.
+
+If the return value is less than <framesToRead> it means the end of the file has been reached.
+
+NOTE(review): the s32le and s32be variants presumably fix the byte order of the output samples -
+confirm in the implementation.
+
+The low-level converters declared after the read functions do not take a drwav object. Each one
+takes an output pointer, an input pointer and a sample count.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
+
+/* Low-level function for converting unsigned 8-bit PCM samples to signed 32-bit PCM samples. */
+DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 16-bit PCM samples to signed 32-bit PCM samples. */
+DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount);
+
+/* Low-level function for converting signed 24-bit PCM samples to signed 32-bit PCM samples. The input is passed as a byte pointer. */
+DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 32-bit floating point samples to signed 32-bit PCM samples. */
+DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount);
+
+/* Low-level function for converting IEEE 64-bit floating point samples to signed 32-bit PCM samples. */
+DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount);
+
+/* Low-level function for converting A-law samples to signed 32-bit PCM samples. */
+DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+/* Low-level function for converting u-law samples to signed 32-bit PCM samples. */
+DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
+
+#endif  /* DR_WAV_NO_CONVERSION_API */
+
+
+/* High-Level Convenience Helpers */
+
+#ifndef DR_WAV_NO_STDIO
+/*
+Helper for initializing a wave file for reading using stdio.
+
+This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav
+objects because the operating system may restrict the number of file handles an application can have open at
+any given time.
+*/
+DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_with_metadata(drwav* pWav, const char* filename, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_with_metadata_w(drwav* pWav, const wchar_t* filename, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+
+
+/*
+Helper for initializing a wave file for writing using stdio.
+
+This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav
+objects because the operating system may restrict the number of file handles an application can have open at
+any given time.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks);
+#endif  /* DR_WAV_NO_STDIO */
+
+/*
+Helper for initializing a loader from a pre-allocated memory buffer.
+
+This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
+the lifetime of the drwav object.
+
+The buffer should contain the contents of the entire wave file, not just the sample data.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_memory_with_metadata(drwav* pWav, const void* data, size_t dataSize, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
+
+/*
+Helper for initializing a writer which outputs data to a memory buffer.
+
+dr_wav will manage the memory allocations, however it is up to the caller to free the data with drwav_free().
+
+The buffer will remain allocated even after drwav_uninit() is called. The buffer should not be considered valid
+until after drwav_uninit() has been called.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks);
+
+
+#ifndef DR_WAV_NO_CONVERSION_API
+/*
+Opens and reads an entire wav file in a single operation.
+
+The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer.
+*/
+DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+#ifndef DR_WAV_NO_STDIO
+/*
+Opens and decodes an entire wav file in a single operation.
+
+The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer.
+*/
+DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+#endif
+/*
+Opens and decodes an entire wav file from a block of memory in a single operation.
+
+The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer.
+*/
+DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
+#endif
+
+/* Frees data that was allocated internally by dr_wav. */
+DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks);
+
+/* Converts bytes from a wav stream into a fixed-size value in native byte order. */
+DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data);
+DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data);
+DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data);
+DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data);
+DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data);
+DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data);
+DRWAV_API float drwav_bytes_to_f32(const drwav_uint8* data);
+
+/* Compares a GUID for the purpose of checking the type of a Wave64 chunk. */
+DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]);
+
+/* Compares a four-character-code for the purpose of checking the type of a RIFF chunk. */
+DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* dr_wav_h */
+
+
+/************************************************************************************************************************************************************
+ ************************************************************************************************************************************************************
+
+ IMPLEMENTATION
+
+ ************************************************************************************************************************************************************
+ ************************************************************************************************************************************************************/
+#if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION)
+#ifndef dr_wav_c
+#define dr_wav_c
+
+#ifdef __MRC__
+/* MrC currently doesn't compile dr_wav correctly with any optimizations enabled. */
+#pragma options opt off
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h> /* For INT_MAX */
+
+#ifndef DR_WAV_NO_STDIO
+#include <stdio.h>
+#ifndef DR_WAV_NO_WCHAR
+#include <wchar.h>
+#endif
+#endif
+
+/* Standard library stuff. */
+#ifndef DRWAV_ASSERT
+#include <assert.h>
+#define DRWAV_ASSERT(expression)           assert(expression)
+#endif
+#ifndef DRWAV_MALLOC
+#define DRWAV_MALLOC(sz)                   malloc((sz))
+#endif
+#ifndef DRWAV_REALLOC
+#define DRWAV_REALLOC(p, sz)               realloc((p), (sz))
+#endif
+#ifndef DRWAV_FREE
+#define DRWAV_FREE(p)                      free((p))
+#endif
+#ifndef DRWAV_COPY_MEMORY
+#define DRWAV_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
+#endif
+#ifndef DRWAV_ZERO_MEMORY
+#define DRWAV_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
+#endif
+/* Zeroes the object that p points to. The argument is parenthesized so that expressions such as (a + 1) are dereferenced as a whole. */
+#ifndef DRWAV_ZERO_OBJECT
+#define DRWAV_ZERO_OBJECT(p)               DRWAV_ZERO_MEMORY((p), sizeof(*(p)))
+#endif
+
+/* Number of elements in an array. Only valid for true arrays, not pointers. */
+#define drwav_countof(x)                   (sizeof(x) / sizeof((x)[0]))
+/* Rounds x up to the next multiple of a. */
+#define drwav_align(x, a)                  ((((x) + (a) - 1) / (a)) * (a))
+/* Smaller/larger of two values. Arguments are evaluated more than once, so avoid side effects. */
+#define drwav_min(a, b)                    (((a) < (b)) ? (a) : (b))
+#define drwav_max(a, b)                    (((a) > (b)) ? (a) : (b))
+/* Restricts x to the range [lo, hi]. */
+#define drwav_clamp(x, lo, hi)             (drwav_max((lo), drwav_min((hi), (x))))
+/* Byte-wise pointer offset. */
+#define drwav_offset_ptr(p, offset)        (((drwav_uint8*)(p)) + (offset))
+
+#define DRWAV_MAX_SIMD_VECTOR_SIZE         32
+
+/* Architecture Detection */
+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC))
+    #define DRWAV_X64
+#elif defined(__i386) || defined(_M_IX86)
+    #define DRWAV_X86
+#elif defined(__arm__) || defined(_M_ARM)
+    #define DRWAV_ARM
+#endif
+/* End Architecture Detection */
+
+/* Inline */
+#ifdef _MSC_VER
+    #define DRWAV_INLINE __forceinline
+#elif defined(__GNUC__)
+    /*
+    I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when
+    the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some
+    case where "__inline__" is not always defined, thus the compiler emitting these warnings. When using -std=c89 or -ansi on the
+    command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue
+    I am using "__inline__" only when we're compiling in strict ANSI mode.
+    */
+    #if defined(__STRICT_ANSI__)
+        #define DRWAV_GNUC_INLINE_HINT __inline__
+    #else
+        #define DRWAV_GNUC_INLINE_HINT inline
+    #endif
+
+    #if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)) || defined(__clang__)
+        #define DRWAV_INLINE DRWAV_GNUC_INLINE_HINT __attribute__((always_inline))
+    #else
+        #define DRWAV_INLINE DRWAV_GNUC_INLINE_HINT
+    #endif
+#elif defined(__WATCOMC__)
+    #define DRWAV_INLINE __inline
+#else
+    #define DRWAV_INLINE
+#endif
+/* End Inline */
+
+/* SIZE_MAX */
+#if defined(SIZE_MAX)
+    #define DRWAV_SIZE_MAX  SIZE_MAX
+#else
+    #if defined(_WIN64) || defined(_LP64) || defined(__LP64__)
+        #define DRWAV_SIZE_MAX  ((drwav_uint64)0xFFFFFFFFFFFFFFFF)
+    #else
+        #define DRWAV_SIZE_MAX  0xFFFFFFFF
+    #endif
+#endif
+/* End SIZE_MAX */
+
+/* Weird bit manipulation is for C89 compatibility (no direct support for 64-bit integers). */
+#define DRWAV_INT64_MIN ((drwav_int64) ((drwav_uint64)0x80000000 << 32))
+#define DRWAV_INT64_MAX ((drwav_int64)(((drwav_uint64)0x7FFFFFFF << 32) | 0xFFFFFFFF))
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+    #define DRWAV_HAS_BYTESWAP16_INTRINSIC
+    #define DRWAV_HAS_BYTESWAP32_INTRINSIC
+    #define DRWAV_HAS_BYTESWAP64_INTRINSIC
+#elif defined(__clang__)
+    #if defined(__has_builtin)
+        #if __has_builtin(__builtin_bswap16)
+            #define DRWAV_HAS_BYTESWAP16_INTRINSIC
+        #endif
+        #if __has_builtin(__builtin_bswap32)
+            #define DRWAV_HAS_BYTESWAP32_INTRINSIC
+        #endif
+        #if __has_builtin(__builtin_bswap64)
+            #define DRWAV_HAS_BYTESWAP64_INTRINSIC
+        #endif
+    #endif
+#elif defined(__GNUC__)
+    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+        #define DRWAV_HAS_BYTESWAP32_INTRINSIC
+        #define DRWAV_HAS_BYTESWAP64_INTRINSIC
+    #endif
+    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+        #define DRWAV_HAS_BYTESWAP16_INTRINSIC
+    #endif
+#endif
+
+/* Writes the compile-time version numbers into whichever output pointers are non-NULL. */
+DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision)
+{
+    if (pMajor != NULL) {
+        pMajor[0] = DRWAV_VERSION_MAJOR;
+    }
+    if (pMinor != NULL) {
+        pMinor[0] = DRWAV_VERSION_MINOR;
+    }
+    if (pRevision != NULL) {
+        pRevision[0] = DRWAV_VERSION_REVISION;
+    }
+}
+
+/* Returns the compile-time version string. */
+DRWAV_API const char* drwav_version_string(void)
+{
+    const char* pVersionString = DRWAV_VERSION_STRING;
+    return pVersionString;
+}
+
+/*
+These limits are used for basic validation when initializing the decoder. If you exceed these limits, first of all: what on Earth are
+you doing?! (Let me know, I'd be curious!) Second, you can adjust these by #define-ing them before the dr_wav implementation.
+*/
+#ifndef DRWAV_MAX_SAMPLE_RATE
+#define DRWAV_MAX_SAMPLE_RATE       384000
+#endif
+#ifndef DRWAV_MAX_CHANNELS
+#define DRWAV_MAX_CHANNELS          256
+#endif
+#ifndef DRWAV_MAX_BITS_PER_SAMPLE
+#define DRWAV_MAX_BITS_PER_SAMPLE   64
+#endif
+
+static const drwav_uint8 drwavGUID_W64_RIFF[16] = {0x72,0x69,0x66,0x66, 0x2E,0x91, 0xCF,0x11, 0xA5,0xD6, 0x28,0xDB,0x04,0xC1,0x00,0x00};    /* 66666972-912E-11CF-A5D6-28DB04C10000 */
+static const drwav_uint8 drwavGUID_W64_WAVE[16] = {0x77,0x61,0x76,0x65, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 65766177-ACF3-11D3-8CD1-00C04F8EDB8A */
+/*static const drwav_uint8 drwavGUID_W64_JUNK[16] = {0x6A,0x75,0x6E,0x6B, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};*/    /* 6B6E756A-ACF3-11D3-8CD1-00C04F8EDB8A */
+static const drwav_uint8 drwavGUID_W64_FMT [16] = {0x66,0x6D,0x74,0x20, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 20746D66-ACF3-11D3-8CD1-00C04F8EDB8A */
+static const drwav_uint8 drwavGUID_W64_FACT[16] = {0x66,0x61,0x63,0x74, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 74636166-ACF3-11D3-8CD1-00C04F8EDB8A */
+static const drwav_uint8 drwavGUID_W64_DATA[16] = {0x64,0x61,0x74,0x61, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 61746164-ACF3-11D3-8CD1-00C04F8EDB8A */
+/*static const drwav_uint8 drwavGUID_W64_SMPL[16] = {0x73,0x6D,0x70,0x6C, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};*/    /* 6C706D73-ACF3-11D3-8CD1-00C04F8EDB8A */
+
+
+/*
+Reports whether the host is little-endian. x86/x64 targets and targets whose __BYTE_ORDER macro equals
+__LITTLE_ENDIAN are answered at compile time; everything else is probed at run time.
+*/
+static DRWAV_INLINE int drwav__is_little_endian(void)
+{
+#if defined(DRWAV_X86) || defined(DRWAV_X64) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN)
+    return DRWAV_TRUE;
+#else
+    /* The first byte of the integer 1 in memory is 1 only on a little-endian host. */
+    int probe = 1;
+    char firstByte = *(char*)&probe;
+    return firstByte == 1;
+#endif
+}
+
+
+/* Copies the 16 bytes of a GUID from data into guid, one byte at a time. */
+static DRWAV_INLINE void drwav_bytes_to_guid(const drwav_uint8* data, drwav_uint8* guid)
+{
+    int remaining = 16;
+    while (remaining > 0) {
+        *guid = *data;
+        guid += 1;
+        data += 1;
+        remaining -= 1;
+    }
+}
+
+
+/*
+Swaps the two bytes of a 16-bit value.
+
+Uses the compiler intrinsic when DRWAV_HAS_BYTESWAP16_INTRINSIC is defined, otherwise falls back to
+masking and shifting.
+*/
+static DRWAV_INLINE drwav_uint16 drwav__bswap16(drwav_uint16 n)
+{
+#ifdef DRWAV_HAS_BYTESWAP16_INTRINSIC
+    #if defined(_MSC_VER)
+        return _byteswap_ushort(n);
+    #elif defined(__GNUC__) || defined(__clang__)
+        return __builtin_bswap16(n);
+    #else
+        #error "This compiler does not support the byte swap intrinsic."
+    #endif
+#else
+    /* Portable fallback: move the high byte down and the low byte up. */
+    return ((n & 0xFF00) >> 8) |
+           ((n & 0x00FF) << 8);
+#endif
+}
+
+/*
+Reverses the byte order of a 32-bit value.
+
+Uses the compiler intrinsic when DRWAV_HAS_BYTESWAP32_INTRINSIC is defined (with an inline-assembly "rev"
+path for 32-bit ARM, architecture version 6 and up, under GCC/Clang), otherwise falls back to masking and
+shifting.
+
+NOTE(review): the inner "#if defined(DRWAV_64BIT)" branch cannot be selected, because the enclosing
+condition requires !defined(DRWAV_64BIT). It is kept as-is since it is explicitly marked as untested.
+*/
+static DRWAV_INLINE drwav_uint32 drwav__bswap32(drwav_uint32 n)
+{
+#ifdef DRWAV_HAS_BYTESWAP32_INTRINSIC
+    #if defined(_MSC_VER)
+        return _byteswap_ulong(n);
+    #elif defined(__GNUC__) || defined(__clang__)
+        #if defined(DRWAV_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(DRWAV_64BIT)   /* <-- 64-bit inline assembly has not been tested, so disabling for now. */
+            /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */
+            drwav_uint32 r;
+            __asm__ __volatile__ (
+            #if defined(DRWAV_64BIT)
+                "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
+            #else
+                "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
+            #endif
+            );
+            return r;
+        #else
+            return __builtin_bswap32(n);
+        #endif
+    #else
+        #error "This compiler does not support the byte swap intrinsic."
+    #endif
+#else
+    /* Portable fallback: mirror the four bytes around the middle. */
+    return ((n & 0xFF000000) >> 24) |
+           ((n & 0x00FF0000) >>  8) |
+           ((n & 0x0000FF00) <<  8) |
+           ((n & 0x000000FF) << 24);
+#endif
+}
+
+/*
+Reverses the byte order of a 64-bit value.
+
+Uses the compiler intrinsic when DRWAV_HAS_BYTESWAP64_INTRINSIC is defined, otherwise falls back to
+masking and shifting each of the eight bytes into its mirrored position.
+*/
+static DRWAV_INLINE drwav_uint64 drwav__bswap64(drwav_uint64 n)
+{
+#ifdef DRWAV_HAS_BYTESWAP64_INTRINSIC
+    #if defined(_MSC_VER)
+        return _byteswap_uint64(n);
+    #elif defined(__GNUC__) || defined(__clang__)
+        return __builtin_bswap64(n);
+    #else
+        #error "This compiler does not support the byte swap intrinsic."
+    #endif
+#else
+    /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
+    return ((n & ((drwav_uint64)0xFF000000 << 32)) >> 56) |
+           ((n & ((drwav_uint64)0x00FF0000 << 32)) >> 40) |
+           ((n & ((drwav_uint64)0x0000FF00 << 32)) >> 24) |
+           ((n & ((drwav_uint64)0x000000FF << 32)) >>  8) |
+           ((n & ((drwav_uint64)0xFF000000      )) <<  8) |
+           ((n & ((drwav_uint64)0x00FF0000      )) << 24) |
+           ((n & ((drwav_uint64)0x0000FF00      )) << 40) |
+           ((n & ((drwav_uint64)0x000000FF      )) << 56);
+#endif
+}
+
+
+/* Byte-swaps a signed 16-bit value by routing it through the unsigned swap. */
+static DRWAV_INLINE drwav_int16 drwav__bswap_s16(drwav_int16 n)
+{
+    drwav_uint16 swapped = drwav__bswap16((drwav_uint16)n);
+    return (drwav_int16)swapped;
+}
+
+/* Byte-swaps sampleCount signed 16-bit samples in place. */
+static DRWAV_INLINE void drwav__bswap_samples_s16(drwav_int16* pSamples, drwav_uint64 sampleCount)
+{
+    drwav_uint64 remaining;
+    for (remaining = sampleCount; remaining > 0; remaining -= 1) {
+        *pSamples = drwav__bswap_s16(*pSamples);
+        pSamples += 1;
+    }
+}
+
+
+/* Byte-swaps one 3-byte sample in place: the outer bytes trade places and the middle byte stays put. */
+static DRWAV_INLINE void drwav__bswap_s24(drwav_uint8* p)
+{
+    drwav_uint8 first = p[0];
+    drwav_uint8 last  = p[2];
+    p[0] = last;
+    p[2] = first;
+}
+
+/* Byte-swaps sampleCount tightly packed 3-byte samples in place. */
+static DRWAV_INLINE void drwav__bswap_samples_s24(drwav_uint8* pSamples, drwav_uint64 sampleCount)
+{
+    drwav_uint8* pCursor = pSamples;
+    drwav_uint64 remaining;
+    for (remaining = sampleCount; remaining > 0; remaining -= 1) {
+        drwav__bswap_s24(pCursor);
+        pCursor += 3;   /* Each sample occupies three bytes. */
+    }
+}
+
+
+/* Byte-swaps a signed 32-bit value by routing it through the unsigned swap. */
+static DRWAV_INLINE drwav_int32 drwav__bswap_s32(drwav_int32 n)
+{
+    drwav_uint32 swapped = drwav__bswap32((drwav_uint32)n);
+    return (drwav_int32)swapped;
+}
+
+/* Byte-swaps sampleCount signed 32-bit samples in place. */
+static DRWAV_INLINE void drwav__bswap_samples_s32(drwav_int32* pSamples, drwav_uint64 sampleCount)
+{
+    drwav_uint64 remaining;
+    for (remaining = sampleCount; remaining > 0; remaining -= 1) {
+        *pSamples = drwav__bswap_s32(*pSamples);
+        pSamples += 1;
+    }
+}
+
+
+/* Byte-swaps a signed 64-bit value by routing it through the unsigned swap. */
+static DRWAV_INLINE drwav_int64 drwav__bswap_s64(drwav_int64 n)
+{
+    drwav_uint64 swapped = drwav__bswap64((drwav_uint64)n);
+    return (drwav_int64)swapped;
+}
+
+/* Byte-swaps sampleCount signed 64-bit samples in place. */
+static DRWAV_INLINE void drwav__bswap_samples_s64(drwav_int64* pSamples, drwav_uint64 sampleCount)
+{
+    drwav_uint64 remaining;
+    for (remaining = sampleCount; remaining > 0; remaining -= 1) {
+        *pSamples = drwav__bswap_s64(*pSamples);
+        pSamples += 1;
+    }
+}
+
+
+/* Byte-swaps a 32-bit float by reinterpreting its bits as an unsigned integer through a union. */
+static DRWAV_INLINE float drwav__bswap_f32(float n)
+{
+    union {
+        drwav_uint32 asBits;
+        float asFloat;
+    } pun;
+
+    pun.asFloat = n;
+    pun.asBits  = drwav__bswap32(pun.asBits);
+    return pun.asFloat;
+}
+
+/* Byte-swaps sampleCount 32-bit float samples in place. */
+static DRWAV_INLINE void drwav__bswap_samples_f32(float* pSamples, drwav_uint64 sampleCount)
+{
+    drwav_uint64 remaining;
+    for (remaining = sampleCount; remaining > 0; remaining -= 1) {
+        *pSamples = drwav__bswap_f32(*pSamples);
+        pSamples += 1;
+    }
+}
+
+
+/*
+Byte-swaps sampleCount samples in place, dispatching on the sample width in bytes.
+
+Widths of 2, 3, 4 and 8 bytes are swapped, a width of 1 needs no work, and any other width trips an
+assertion.
+*/
+static DRWAV_INLINE void drwav__bswap_samples(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample)
+{
+    if (bytesPerSample == 2) {
+        drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount);
+    } else if (bytesPerSample == 3) {
+        drwav__bswap_samples_s24((drwav_uint8*)pSamples, sampleCount);
+    } else if (bytesPerSample == 4) {
+        drwav__bswap_samples_s32((drwav_int32*)pSamples, sampleCount);
+    } else if (bytesPerSample == 8) {
+        drwav__bswap_samples_s64((drwav_int64*)pSamples, sampleCount);
+    } else if (bytesPerSample != 1) {
+        /* Unsupported format. Single-byte samples fall through untouched. */
+        DRWAV_ASSERT(DRWAV_FALSE);
+    }
+}
+
+
+
+/* Returns DRWAV_TRUE for the RIFX and AIFF containers, which are treated as big-endian, and DRWAV_FALSE otherwise. */
+DRWAV_PRIVATE DRWAV_INLINE drwav_bool32 drwav_is_container_be(drwav_container container)
+{
+    switch (container)
+    {
+        case drwav_container_rifx:
+        case drwav_container_aiff:
+            return DRWAV_TRUE;
+        default:
+            return DRWAV_FALSE;
+    }
+}
+
+
+/* Assembles a 16-bit value from two bytes stored least-significant byte first. */
+DRWAV_PRIVATE DRWAV_INLINE drwav_uint16 drwav_bytes_to_u16_le(const drwav_uint8* data)
+{
+    drwav_uint16 lo = (drwav_uint16)data[0];
+    drwav_uint16 hi = (drwav_uint16)data[1];
+    return (drwav_uint16)(lo | (hi << 8));
+}
+
+/* Assembles a 16-bit value from two bytes stored most-significant byte first. */
+DRWAV_PRIVATE DRWAV_INLINE drwav_uint16 drwav_bytes_to_u16_be(const drwav_uint8* data)
+{
+    drwav_uint16 hi = (drwav_uint16)data[0];
+    drwav_uint16 lo = (drwav_uint16)data[1];
+    return (drwav_uint16)(lo | (hi << 8));
+}
+
+/* Decodes a 16-bit value using the byte order implied by the container (big-endian when drwav_is_container_be() says so). */
+DRWAV_PRIVATE DRWAV_INLINE drwav_uint16 drwav_bytes_to_u16_ex(const drwav_uint8* data, drwav_container container)
+{
+    return drwav_is_container_be(container)
+        ? drwav_bytes_to_u16_be(data)
+        : drwav_bytes_to_u16_le(data);
+}
+
+
+/* Assembles a 32-bit value from four bytes stored least-significant byte first. */
+DRWAV_PRIVATE DRWAV_INLINE drwav_uint32 drwav_bytes_to_u32_le(const drwav_uint8* data)
+{
+    drwav_uint32 value = (drwav_uint32)data[0];
+    value |= (drwav_uint32)data[1] <<  8;
+    value |= (drwav_uint32)data[2] << 16;
+    value |= (drwav_uint32)data[3] << 24;
+    return value;
+}
+
+/* Assembles a 32-bit value from four bytes stored most-significant byte first. */
+DRWAV_PRIVATE DRWAV_INLINE drwav_uint32 drwav_bytes_to_u32_be(const drwav_uint8* data)
+{
+    drwav_uint32 value = (drwav_uint32)data[3];
+    value |= (drwav_uint32)data[2] <<  8;
+    value |= (drwav_uint32)data[1] << 16;
+    value |= (drwav_uint32)data[0] << 24;
+    return value;
+}
+
+/* Decodes a 32-bit value using the byte order implied by the container (big-endian when drwav_is_container_be() says so). */
+DRWAV_PRIVATE DRWAV_INLINE drwav_uint32 drwav_bytes_to_u32_ex(const drwav_uint8* data, drwav_container container)
+{
+    return drwav_is_container_be(container)
+        ? drwav_bytes_to_u32_be(data)
+        : drwav_bytes_to_u32_le(data);
+}
+
+
+
+/*
+Converts a 10-byte, big-endian extended-precision float (1 sign bit, 15-bit exponent biased by 16383,
+64-bit significand) into a signed 64-bit integer.
+
+The value is truncated toward zero. Magnitudes that do not fit in 63 bits, and the all-ones exponent,
+saturate to DRWAV_INT64_MIN or DRWAV_INT64_MAX depending on the sign.
+
+The unbiased exponent is held in a signed variable. With an unsigned exponent, subtracting the bias wraps
+around for every value below 1.0, which then saturated instead of returning 0.
+*/
+DRWAV_PRIVATE drwav_int64 drwav_aiff_extented_to_s64(const drwav_uint8* data)
+{
+    drwav_uint32 signAndExponent = ((drwav_uint32)data[0] << 8) | data[1];
+    drwav_uint64 hi = ((drwav_uint64)data[2] << 24) | ((drwav_uint64)data[3] << 16) | ((drwav_uint64)data[4] <<  8) | ((drwav_uint64)data[5] <<  0);
+    drwav_uint64 lo = ((drwav_uint64)data[6] << 24) | ((drwav_uint64)data[7] << 16) | ((drwav_uint64)data[8] <<  8) | ((drwav_uint64)data[9] <<  0);
+    drwav_uint64 significand = (hi << 32) | lo;
+    int sign = (int)(signAndExponent >> 15);
+    drwav_uint32 biasedExponent = signAndExponent & 0x7FFF;    /* Remove sign bit. */
+    int exponent;
+
+    /* Special cases. */
+    if (biasedExponent == 0 && significand == 0) {
+        return 0;
+    } else if (biasedExponent == 0x7FFF) {
+        return sign ? DRWAV_INT64_MIN : DRWAV_INT64_MAX;    /* Infinite. */
+    }
+
+    /* Signed on purpose: values below 1.0 have a negative unbiased exponent. */
+    exponent = (int)biasedExponent - 16383;
+
+    if (exponent > 62) {
+        return sign ? DRWAV_INT64_MIN : DRWAV_INT64_MAX;    /* Magnitude is at least 2^63. Too big for a 64-bit integer. */
+    } else if (exponent < 0) {
+        return 0;  /* Number is less than 1, so rounds down to 0. */
+    }
+
+    /* The shift is in the range [1, 63], so the result always fits in the positive range of drwav_int64. */
+    significand >>= (63 - exponent);
+
+    if (sign) {
+        return -(drwav_int64)significand;
+    } else {
+        return  (drwav_int64)significand;
+    }
+}
+
+
+/* Default allocation callback: forwards to DRWAV_MALLOC and ignores the user data. */
+DRWAV_PRIVATE void* drwav__malloc_default(size_t sz, void* pUserData)
+{
+    void* pBlock = DRWAV_MALLOC(sz);
+    (void)pUserData;
+    return pBlock;
+}
+
+/* Default reallocation callback: forwards to DRWAV_REALLOC and ignores the user data. */
+DRWAV_PRIVATE void* drwav__realloc_default(void* p, size_t sz, void* pUserData)
+{
+    void* pBlock = DRWAV_REALLOC(p, sz);
+    (void)pUserData;
+    return pBlock;
+}
+
+/* Default free callback: forwards to DRWAV_FREE and ignores the user data. */
+DRWAV_PRIVATE void drwav__free_default(void* p, void* pUserData)
+{
+    DRWAV_FREE(p);
+    (void)pUserData;
+}
+
+
+/*
+Allocates sz bytes through the supplied callbacks.
+
+onMalloc is preferred. When it is missing, onRealloc is called with a NULL pointer instead. Returns NULL
+when no callbacks are supplied or neither of the two is set.
+*/
+DRWAV_PRIVATE void* drwav__malloc_from_callbacks(size_t sz, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pAllocationCallbacks != NULL) {
+        if (pAllocationCallbacks->onMalloc != NULL) {
+            return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
+        }
+
+        /* Try using realloc(). */
+        if (pAllocationCallbacks->onRealloc != NULL) {
+            return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
+        }
+    }
+
+    return NULL;
+}
+
+/*
+Resizes an allocation through the supplied callbacks.
+
+onRealloc is used when available. Otherwise the resize is emulated with onMalloc + copy + onFree, which
+requires both of those callbacks. Returns NULL when no callbacks are supplied, when the required
+callbacks are missing, or when the allocation fails (in which case p is left untouched).
+
+szOld is the current size of p and is only used by the emulated path.
+*/
+DRWAV_PRIVATE void* drwav__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pAllocationCallbacks == NULL) {
+        return NULL;
+    }
+
+    if (pAllocationCallbacks->onRealloc != NULL) {
+        return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
+    }
+
+    /* Try emulating realloc() in terms of malloc()/free(). */
+    if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
+        void* p2;
+
+        p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
+        if (p2 == NULL) {
+            return NULL;
+        }
+
+        if (p != NULL) {
+            /* Never copy more than the new block can hold, otherwise shrinking would overrun p2. */
+            size_t szCopy = (szOld < szNew) ? szOld : szNew;
+
+            DRWAV_COPY_MEMORY(p2, p, szCopy);
+            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
+        }
+
+        return p2;
+    }
+
+    return NULL;
+}
+
+/* Releases p through onFree. Does nothing when p, the callbacks, or onFree itself is NULL. */
+DRWAV_PRIVATE void drwav__free_from_callbacks(void* p, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (p != NULL && pAllocationCallbacks != NULL && pAllocationCallbacks->onFree != NULL) {
+        pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
+    }
+}
+
+
+/*
+Returns a by-value copy of the caller's allocation callbacks, or a set pointing at the default
+malloc/realloc/free wrappers (with NULL user data) when none were supplied.
+*/
+DRWAV_PRIVATE drwav_allocation_callbacks drwav_copy_allocation_callbacks_or_defaults(const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_allocation_callbacks callbacks;
+
+    if (pAllocationCallbacks == NULL) {
+        /* Defaults. */
+        callbacks.pUserData = NULL;
+        callbacks.onMalloc  = drwav__malloc_default;
+        callbacks.onRealloc = drwav__realloc_default;
+        callbacks.onFree    = drwav__free_default;
+    } else {
+        /* Copy. */
+        callbacks = *pAllocationCallbacks;
+    }
+
+    return callbacks;
+}
+
+
+/* Non-zero when the format tag is one of the two ADPCM variants (DR_WAVE_FORMAT_ADPCM or DR_WAVE_FORMAT_DVI_ADPCM). */
+static DRWAV_INLINE drwav_bool32 drwav__is_compressed_format_tag(drwav_uint16 formatTag)
+{
+    if (formatTag == DR_WAVE_FORMAT_ADPCM) {
+        return 1;
+    }
+
+    return formatTag == DR_WAVE_FORMAT_DVI_ADPCM;
+}
+
+/* Padding for a RIFF-style chunk: one byte when the chunk size is odd, none when it is even. */
+DRWAV_PRIVATE unsigned int drwav__chunk_padding_size_riff(drwav_uint64 chunkSize)
+{
+    drwav_uint64 oddBit = chunkSize & 1;
+    return (unsigned int)oddBit;
+}
+
+/*
+Padding for a W64 chunk, computed as the chunk size modulo 8.
+
+NOTE(review): if the intent is to round the chunk up to an 8-byte boundary, the padding would be
+(8 - chunkSize % 8) % 8 rather than chunkSize % 8 - confirm against the Wave64 specification.
+*/
+DRWAV_PRIVATE unsigned int drwav__chunk_padding_size_w64(drwav_uint64 chunkSize)
+{
+    drwav_uint64 lowBits = chunkSize & 7;
+    return (unsigned int)lowBits;
+}
+
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
+DRWAV_PRIVATE drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount);
+
+/*
+Reads the next chunk header from the stream and fills in pHeaderOut (id, payload size and padding size).
+
+RIFF, RIFX, RF64 and AIFF headers are a 4-byte FourCC followed by a 32-bit size, decoded in the byte order
+of the container. W64 headers are a 16-byte GUID followed by a 64-bit size that includes the 24 header
+bytes, which are subtracted here so that sizeInBytes always describes the payload only.
+
+On success *pRunningBytesReadOut is advanced by the size of the header (8 or 24 bytes).
+
+Returns DRWAV_SUCCESS on success, DRWAV_AT_END when the chunk ID could not be read in full, and
+DRWAV_INVALID_FILE when the size could not be read, the container is not recognized, or a W64 chunk
+declares a size smaller than its own header.
+*/
+DRWAV_PRIVATE drwav_result drwav__read_chunk_header(drwav_read_proc onRead, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_chunk_header* pHeaderOut)
+{
+    if (container == drwav_container_riff || container == drwav_container_rifx || container == drwav_container_rf64 || container == drwav_container_aiff) {
+        drwav_uint8 sizeInBytes[4];
+
+        if (onRead(pUserData, pHeaderOut->id.fourcc, 4) != 4) {
+            return DRWAV_AT_END;
+        }
+
+        if (onRead(pUserData, sizeInBytes, 4) != 4) {
+            return DRWAV_INVALID_FILE;
+        }
+
+        pHeaderOut->sizeInBytes = drwav_bytes_to_u32_ex(sizeInBytes, container);
+        pHeaderOut->paddingSize = drwav__chunk_padding_size_riff(pHeaderOut->sizeInBytes);
+
+        *pRunningBytesReadOut += 8;
+    } else if (container == drwav_container_w64) {
+        drwav_uint8 sizeInBytes[8];
+        drwav_uint64 sizeIncludingHeader;
+
+        if (onRead(pUserData, pHeaderOut->id.guid, 16) != 16) {
+            return DRWAV_AT_END;
+        }
+
+        if (onRead(pUserData, sizeInBytes, 8) != 8) {
+            return DRWAV_INVALID_FILE;
+        }
+
+        /* A size smaller than the header would wrap around in the subtraction below and produce a huge chunk size. */
+        sizeIncludingHeader = drwav_bytes_to_u64(sizeInBytes);
+        if (sizeIncludingHeader < 24) {
+            return DRWAV_INVALID_FILE;
+        }
+
+        pHeaderOut->sizeInBytes = sizeIncludingHeader - 24;    /* <-- Subtract 24 because w64 includes the size of the header. */
+        pHeaderOut->paddingSize = drwav__chunk_padding_size_w64(pHeaderOut->sizeInBytes);
+        *pRunningBytesReadOut += 24;
+    } else {
+        return DRWAV_INVALID_FILE;
+    }
+
+    return DRWAV_SUCCESS;
+}
+
+/*
+Seeks forward from the current position by a 64-bit offset. The seek callback takes a signed
+32-bit offset, so the distance is covered in steps of at most 0x7FFFFFFF bytes.
+
+Returns DRWAV_TRUE if every step succeeded, DRWAV_FALSE as soon as one fails.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__seek_forward(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData)
+{
+    drwav_uint64 remaining;
+
+    for (remaining = offset; remaining > 0; ) {
+        int step = (remaining > 0x7FFFFFFF) ? 0x7FFFFFFF : (int)remaining;
+
+        if (!onSeek(pUserData, step, DRWAV_SEEK_CUR)) {
+            return DRWAV_FALSE;
+        }
+
+        remaining -= (drwav_uint64)step;
+    }
+
+    return DRWAV_TRUE;
+}
+
+/*
+Seeks to an absolute 64-bit offset. The seek callback takes a signed 32-bit offset, so offsets
+above 0x7FFFFFFF are reached with one absolute seek followed by relative seeks.
+
+Returns DRWAV_FALSE if an intermediate seek fails, otherwise the result of the final seek.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__seek_from_start(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData)
+{
+    drwav_uint64 remaining = offset;
+
+    if (remaining > 0x7FFFFFFF) {
+        /* First hop is absolute, every following hop is relative to the current position. */
+        if (!onSeek(pUserData, 0x7FFFFFFF, DRWAV_SEEK_SET)) {
+            return DRWAV_FALSE;
+        }
+        remaining -= 0x7FFFFFFF;
+
+        while (remaining > 0x7FFFFFFF) {
+            if (!onSeek(pUserData, 0x7FFFFFFF, DRWAV_SEEK_CUR)) {
+                return DRWAV_FALSE;
+            }
+            remaining -= 0x7FFFFFFF;
+        }
+
+        return onSeek(pUserData, (int)remaining, DRWAV_SEEK_CUR);
+    }
+
+    /* Fits in a single 32-bit seek. */
+    return onSeek(pUserData, (int)remaining, DRWAV_SEEK_SET);
+}
+
+
+
+/*
+Calls the read callback and adds the number of bytes it returned to *pCursor.
+
+Returns the number of bytes reported by the callback.
+*/
+DRWAV_PRIVATE size_t drwav__on_read(drwav_read_proc onRead, void* pUserData, void* pBufferOut, size_t bytesToRead, drwav_uint64* pCursor)
+{
+    size_t bytesObtained;
+
+    DRWAV_ASSERT(onRead != NULL);
+    DRWAV_ASSERT(pCursor != NULL);
+
+    bytesObtained = onRead(pUserData, pBufferOut, bytesToRead);
+    *pCursor = *pCursor + bytesObtained;
+
+    return bytesObtained;
+}
+
+/*
+Compiled out by the #if 0 guard below.
+
+Seek counterpart of drwav__on_read(): performs the seek through the callback and, on success,
+updates *pCursor (set to the offset for DRWAV_SEEK_SET, otherwise advanced by the offset).
+*/
+#if 0
+DRWAV_PRIVATE drwav_bool32 drwav__on_seek(drwav_seek_proc onSeek, void* pUserData, int offset, drwav_seek_origin origin, drwav_uint64* pCursor)
+{
+    DRWAV_ASSERT(onSeek != NULL);
+    DRWAV_ASSERT(pCursor != NULL);
+
+    if (!onSeek(pUserData, offset, origin)) {
+        return DRWAV_FALSE;
+    }
+
+    if (origin == DRWAV_SEEK_SET) {
+        *pCursor = offset;
+    } else {
+        *pCursor += offset;
+    }
+
+    return DRWAV_TRUE;
+}
+#endif
+
+
+/*
+Sizes, in bytes, of the fixed-layout sections that the metadata readers below pull from the
+stream in a single read (chunk headers, per-entry records and fixed-width text fields).
+*/
+#define DRWAV_SMPL_BYTES                    36
+#define DRWAV_SMPL_LOOP_BYTES               24
+#define DRWAV_INST_BYTES                    7
+#define DRWAV_ACID_BYTES                    24
+#define DRWAV_CUE_BYTES                     4
+#define DRWAV_BEXT_BYTES                    602
+#define DRWAV_BEXT_DESCRIPTION_BYTES        256
+#define DRWAV_BEXT_ORIGINATOR_NAME_BYTES    32
+#define DRWAV_BEXT_ORIGINATOR_REF_BYTES     32
+#define DRWAV_BEXT_RESERVED_BYTES           180
+#define DRWAV_BEXT_UMID_BYTES               64
+#define DRWAV_CUE_POINT_BYTES               24
+#define DRWAV_LIST_LABEL_OR_NOTE_BYTES      4
+#define DRWAV_LIST_LABELLED_TEXT_BYTES      20
+
+/* Alignment passed to drwav__metadata_get_memory() for the smpl loop and cue point arrays. */
+#define DRWAV_METADATA_ALIGNMENT            8
+
+/*
+The metadata parser runs over the chunks twice. In the count stage it only tallies the number of
+metadata entries and the extra memory they will need. In the read stage it fills in the entries
+using memory handed out by drwav__metadata_get_memory().
+*/
+typedef enum
+{
+    drwav__metadata_parser_stage_count,
+    drwav__metadata_parser_stage_read
+} drwav__metadata_parser_stage;
+
+/* State shared by the metadata parsing helpers across both parser stages. */
+typedef struct
+{
+    drwav_read_proc onRead;                 /* Callback used to read chunk data. */
+    drwav_seek_proc onSeek;                 /* Callback used to skip over chunk data. */
+    void *pReadSeekUserData;                /* Passed as the user data argument to onRead and onSeek. */
+    drwav__metadata_parser_stage stage;     /* Which pass is currently running. */
+    drwav_metadata *pMetadata;              /* Array of metadata entries, carved from the start of pData. */
+    drwav_uint32 metadataCount;             /* Number of entries tallied during the count stage. */
+    drwav_uint8 *pData;                     /* Base of the single allocation made by drwav__metadata_alloc(). */
+    drwav_uint8 *pDataCursor;               /* Next free byte inside pData; advanced by drwav__metadata_get_memory(). */
+    drwav_uint64 metadataCursor;            /* Index of the next entry in pMetadata to fill during the read stage. */
+    drwav_uint64 extraCapacity;             /* Bytes needed in addition to the pMetadata array (strings, loops, raw data). */
+} drwav__metadata_parser;
+
+/*
+Returns the total number of bytes needed for the metadata array plus all extra memory requested
+so far, or 0 if that total does not fit in a size_t.
+*/
+DRWAV_PRIVATE size_t drwav__metadata_memory_capacity(drwav__metadata_parser* pParser)
+{
+    drwav_uint64 arrayBytes = sizeof(drwav_metadata) * (drwav_uint64)pParser->metadataCount;
+    drwav_uint64 totalBytes = arrayBytes + pParser->extraCapacity;
+
+    /* The cast is only reached when the total is known to fit. */
+    return (totalBytes > DRWAV_SIZE_MAX) ? 0 : (size_t)totalBytes;
+}
+
+/*
+Hands out `size` bytes from the parser's allocation, first rounding the cursor up to a multiple of
+`align` when `align` is non-zero. The cursor is left pointing just past the returned block.
+
+No allocation happens here. The capacity is only checked with an assert, so the caller must have
+requested enough memory during the count stage.
+*/
+DRWAV_PRIVATE drwav_uint8* drwav__metadata_get_memory(drwav__metadata_parser* pParser, size_t size, size_t align)
+{
+    drwav_uint8* pBlock;
+
+    if (align != 0) {
+        drwav_uintptr misalignment = (drwav_uintptr)pParser->pDataCursor % align;
+        if (misalignment != 0) {
+            pParser->pDataCursor += align - misalignment;
+        }
+    }
+
+    pBlock = pParser->pDataCursor;
+
+    /* Running out of space here means the count stage under-estimated what the read stage needs. */
+    DRWAV_ASSERT((pBlock + size) <= (pParser->pData + drwav__metadata_memory_capacity(pParser)));
+
+    pParser->pDataCursor = pBlock + size;
+
+    return pBlock;
+}
+
+/*
+Reserves `bytes` of extra memory for the read stage. When `align` is non-zero, `align - 1`
+additional bytes are reserved so the block can be aligned regardless of where the cursor ends up.
+
+The sum is computed in 64 bits so that a large `bytes` value cannot wrap around a 32-bit size_t
+before being added to the 64-bit running total.
+*/
+DRWAV_PRIVATE void drwav__metadata_request_extra_memory_for_stage_2(drwav__metadata_parser* pParser, size_t bytes, size_t align)
+{
+    drwav_uint64 extra = (drwav_uint64)bytes;
+
+    if (align != 0) {
+        extra += (drwav_uint64)(align - 1);
+    }
+
+    pParser->extraCapacity += extra;
+}
+
+/*
+Allocates the single block that backs all metadata, sized from the totals gathered during the
+count stage, and carves the metadata array from the start of it. Does nothing when the count
+stage found no metadata.
+
+Any block previously held in pParser->pData is freed first.
+
+Returns DRWAV_SUCCESS, or DRWAV_OUT_OF_MEMORY if the required size does not fit in a size_t or
+the allocation fails. On failure pParser->pData is left as NULL.
+*/
+DRWAV_PRIVATE drwav_result drwav__metadata_alloc(drwav__metadata_parser* pParser, drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pParser->extraCapacity != 0 || pParser->metadataCount != 0) {
+        size_t capacity = drwav__metadata_memory_capacity(pParser);
+
+        pAllocationCallbacks->onFree(pParser->pData, pAllocationCallbacks->pUserData);
+        pParser->pData       = NULL;
+        pParser->pDataCursor = NULL;
+
+        /*
+        A capacity of 0 at this point means the required size was too big to represent. Passing it
+        to the allocator could yield a valid but zero-sized block which would then be written to.
+        */
+        if (capacity == 0) {
+            return DRWAV_OUT_OF_MEMORY;
+        }
+
+        pParser->pData = (drwav_uint8*)pAllocationCallbacks->onMalloc(capacity, pAllocationCallbacks->pUserData);
+        pParser->pDataCursor = pParser->pData;
+
+        if (pParser->pData == NULL) {
+            return DRWAV_OUT_OF_MEMORY;
+        }
+
+        /*
+        We don't need to worry about specifying an alignment here because malloc always returns something
+        of suitable alignment. This also means pParser->pMetadata is all that we need to store in order
+        for us to free when we are done.
+        */
+        pParser->pMetadata = (drwav_metadata*)drwav__metadata_get_memory(pParser, sizeof(drwav_metadata) * pParser->metadataCount, 1);
+        pParser->metadataCursor = 0;
+    }
+
+    return DRWAV_SUCCESS;
+}
+
+/*
+Reads through the parser's read callback. When pCursor is non-NULL the number of bytes read is
+also added to *pCursor.
+
+Returns the number of bytes read.
+*/
+DRWAV_PRIVATE size_t drwav__metadata_parser_read(drwav__metadata_parser* pParser, void* pBufferOut, size_t bytesToRead, drwav_uint64* pCursor)
+{
+    if (pCursor == NULL) {
+        /* Caller is not tracking a running total. */
+        return pParser->onRead(pParser->pReadSeekUserData, pBufferOut, bytesToRead);
+    }
+
+    return drwav__on_read(pParser->onRead, pParser->pReadSeekUserData, pBufferOut, bytesToRead, pCursor);
+}
+
+/*
+Read-stage parser for a "smpl" chunk. Reads the fixed header, then the loop records, then any
+sampler-specific data, storing the variable-length parts in memory taken from the parser.
+
+The loops and trailing data are only read when the loop count in the header matches the count
+implied by the chunk size.
+
+Returns the number of bytes consumed from the stream, or 0 if pMetadata is NULL.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_smpl_to_metadata_obj(drwav__metadata_parser* pParser, const drwav_chunk_header* pChunkHeader, drwav_metadata* pMetadata)
+{
+    drwav_uint8 header[DRWAV_SMPL_BYTES];
+    drwav_uint64 totalBytesRead = 0;
+    size_t headerBytesRead;
+    drwav_uint32 iLoop;
+
+    if (pMetadata == NULL) {
+        return 0;
+    }
+
+    headerBytesRead = drwav__metadata_parser_read(pParser, header, sizeof(header), &totalBytesRead);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+    DRWAV_ASSERT(pChunkHeader != NULL);
+
+    if (headerBytesRead != sizeof(header)) {
+        return totalBytesRead;
+    }
+
+    /* The header is nine consecutive 32-bit fields. */
+    pMetadata->type                                     = drwav_metadata_type_smpl;
+    pMetadata->data.smpl.manufacturerId                 = drwav_bytes_to_u32(header + 0);
+    pMetadata->data.smpl.productId                      = drwav_bytes_to_u32(header + 4);
+    pMetadata->data.smpl.samplePeriodNanoseconds        = drwav_bytes_to_u32(header + 8);
+    pMetadata->data.smpl.midiUnityNote                  = drwav_bytes_to_u32(header + 12);
+    pMetadata->data.smpl.midiPitchFraction              = drwav_bytes_to_u32(header + 16);
+    pMetadata->data.smpl.smpteFormat                    = drwav_bytes_to_u32(header + 20);
+    pMetadata->data.smpl.smpteOffset                    = drwav_bytes_to_u32(header + 24);
+    pMetadata->data.smpl.sampleLoopCount                = drwav_bytes_to_u32(header + 28);
+    pMetadata->data.smpl.samplerSpecificDataSizeInBytes = drwav_bytes_to_u32(header + 32);
+
+    /* Refuse to go any further if the declared loop count disagrees with the chunk size. */
+    if (pMetadata->data.smpl.sampleLoopCount != (pChunkHeader->sizeInBytes - DRWAV_SMPL_BYTES) / DRWAV_SMPL_LOOP_BYTES) {
+        return totalBytesRead;
+    }
+
+    pMetadata->data.smpl.pLoops = (drwav_smpl_loop*)drwav__metadata_get_memory(pParser, sizeof(drwav_smpl_loop) * pMetadata->data.smpl.sampleLoopCount, DRWAV_METADATA_ALIGNMENT);
+
+    for (iLoop = 0; iLoop < pMetadata->data.smpl.sampleLoopCount; ++iLoop) {
+        drwav_uint8 loopData[DRWAV_SMPL_LOOP_BYTES];
+        drwav_smpl_loop* pLoop;
+
+        if (drwav__metadata_parser_read(pParser, loopData, sizeof(loopData), &totalBytesRead) != sizeof(loopData)) {
+            break;  /* Ran out of data part way through the loops. */
+        }
+
+        pLoop = &pMetadata->data.smpl.pLoops[iLoop];
+        pLoop->cuePointId        = drwav_bytes_to_u32(loopData + 0);
+        pLoop->type              = drwav_bytes_to_u32(loopData + 4);
+        pLoop->firstSampleOffset = drwav_bytes_to_u32(loopData + 8);
+        pLoop->lastSampleOffset  = drwav_bytes_to_u32(loopData + 12);
+        pLoop->sampleFraction    = drwav_bytes_to_u32(loopData + 16);
+        pLoop->playCount         = drwav_bytes_to_u32(loopData + 20);
+    }
+
+    if (pMetadata->data.smpl.samplerSpecificDataSizeInBytes > 0) {
+        pMetadata->data.smpl.pSamplerSpecificData = drwav__metadata_get_memory(pParser, pMetadata->data.smpl.samplerSpecificDataSizeInBytes, 1);
+        DRWAV_ASSERT(pMetadata->data.smpl.pSamplerSpecificData != NULL);
+
+        drwav__metadata_parser_read(pParser, pMetadata->data.smpl.pSamplerSpecificData, pMetadata->data.smpl.samplerSpecificDataSizeInBytes, &totalBytesRead);
+    }
+
+    return totalBytesRead;
+}
+
+/*
+Read-stage parser for a "cue " chunk. Reads the cue point count and then each cue point record
+into an array taken from the parser's memory.
+
+The cue points are only read when the count in the header matches the count implied by the chunk
+size.
+
+Returns the number of bytes consumed from the stream, or 0 if pMetadata is NULL.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_cue_to_metadata_obj(drwav__metadata_parser* pParser, const drwav_chunk_header* pChunkHeader, drwav_metadata* pMetadata)
+{
+    drwav_uint8 header[DRWAV_CUE_BYTES];
+    drwav_uint64 totalBytesRead = 0;
+    size_t headerBytesRead;
+    drwav_uint32 iCuePoint;
+
+    if (pMetadata == NULL) {
+        return 0;
+    }
+
+    headerBytesRead = drwav__metadata_parser_read(pParser, header, sizeof(header), &totalBytesRead);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+
+    if (headerBytesRead != sizeof(header)) {
+        return totalBytesRead;
+    }
+
+    pMetadata->type                   = drwav_metadata_type_cue;
+    pMetadata->data.cue.cuePointCount = drwav_bytes_to_u32(header);
+
+    /* Stop here if the declared count disagrees with the chunk size so we never read past the chunk. */
+    if (pMetadata->data.cue.cuePointCount != (pChunkHeader->sizeInBytes - DRWAV_CUE_BYTES) / DRWAV_CUE_POINT_BYTES) {
+        return totalBytesRead;
+    }
+
+    pMetadata->data.cue.pCuePoints = (drwav_cue_point*)drwav__metadata_get_memory(pParser, sizeof(drwav_cue_point) * pMetadata->data.cue.cuePointCount, DRWAV_METADATA_ALIGNMENT);
+    DRWAV_ASSERT(pMetadata->data.cue.pCuePoints != NULL);
+
+    for (iCuePoint = 0; iCuePoint < pMetadata->data.cue.cuePointCount; ++iCuePoint) {
+        drwav_uint8 pointData[DRWAV_CUE_POINT_BYTES];
+        drwav_cue_point* pPoint;
+
+        if (drwav__metadata_parser_read(pParser, pointData, sizeof(pointData), &totalBytesRead) != sizeof(pointData)) {
+            break;  /* Ran out of data part way through the cue points. */
+        }
+
+        pPoint = &pMetadata->data.cue.pCuePoints[iCuePoint];
+        pPoint->id                = drwav_bytes_to_u32(pointData + 0);
+        pPoint->playOrderPosition = drwav_bytes_to_u32(pointData + 4);
+        pPoint->dataChunkId[0]    = pointData[8];
+        pPoint->dataChunkId[1]    = pointData[9];
+        pPoint->dataChunkId[2]    = pointData[10];
+        pPoint->dataChunkId[3]    = pointData[11];
+        pPoint->chunkStart        = drwav_bytes_to_u32(pointData + 12);
+        pPoint->blockStart        = drwav_bytes_to_u32(pointData + 16);
+        pPoint->sampleOffset      = drwav_bytes_to_u32(pointData + 20);
+    }
+
+    return totalBytesRead;
+}
+
+/*
+Read-stage parser for an "inst" chunk: seven single-byte values, each reinterpreted as signed.
+
+Returns the number of bytes consumed from the stream, or 0 if pMetadata is NULL. The metadata
+object is only written when all seven bytes were read.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_inst_to_metadata_obj(drwav__metadata_parser* pParser, drwav_metadata* pMetadata)
+{
+    drwav_uint8 raw[DRWAV_INST_BYTES];
+    drwav_uint64 bytesRead;
+
+    if (pMetadata == NULL) {
+        return 0;
+    }
+
+    bytesRead = drwav__metadata_parser_read(pParser, raw, sizeof(raw), NULL);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+
+    if (bytesRead != sizeof(raw)) {
+        return bytesRead;   /* Short read. Leave the metadata object untouched. */
+    }
+
+    pMetadata->type                    = drwav_metadata_type_inst;
+    pMetadata->data.inst.midiUnityNote = (drwav_int8)raw[0];
+    pMetadata->data.inst.fineTuneCents = (drwav_int8)raw[1];
+    pMetadata->data.inst.gainDecibels  = (drwav_int8)raw[2];
+    pMetadata->data.inst.lowNote       = (drwav_int8)raw[3];
+    pMetadata->data.inst.highNote      = (drwav_int8)raw[4];
+    pMetadata->data.inst.lowVelocity   = (drwav_int8)raw[5];
+    pMetadata->data.inst.highVelocity  = (drwav_int8)raw[6];
+
+    return bytesRead;
+}
+
+/*
+Read-stage parser for an "acid" chunk: a fixed 24-byte record of integer and float fields.
+
+Returns the number of bytes consumed from the stream, or 0 if pMetadata is NULL. The metadata
+object is only written when the whole record was read.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_acid_to_metadata_obj(drwav__metadata_parser* pParser, drwav_metadata* pMetadata)
+{
+    drwav_uint8 raw[DRWAV_ACID_BYTES];
+    drwav_uint64 bytesRead;
+
+    if (pMetadata == NULL) {
+        return 0;
+    }
+
+    bytesRead = drwav__metadata_parser_read(pParser, raw, sizeof(raw), NULL);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+
+    if (bytesRead != sizeof(raw)) {
+        return bytesRead;   /* Short read. Leave the metadata object untouched. */
+    }
+
+    pMetadata->type                       = drwav_metadata_type_acid;
+    pMetadata->data.acid.flags            = drwav_bytes_to_u32(raw + 0);
+    pMetadata->data.acid.midiUnityNote    = drwav_bytes_to_u16(raw + 4);
+    pMetadata->data.acid.reserved1        = drwav_bytes_to_u16(raw + 6);
+    pMetadata->data.acid.reserved2        = drwav_bytes_to_f32(raw + 8);
+    pMetadata->data.acid.numBeats         = drwav_bytes_to_u32(raw + 12);
+    pMetadata->data.acid.meterDenominator = drwav_bytes_to_u16(raw + 16);
+    pMetadata->data.acid.meterNumerator   = drwav_bytes_to_u16(raw + 18);
+    pMetadata->data.acid.tempo            = drwav_bytes_to_f32(raw + 20);
+
+    return bytesRead;
+}
+
+/* Returns the number of characters before the first null terminator in str. */
+DRWAV_PRIVATE size_t drwav__strlen(const char* str)
+{
+    const char* pEnd = str;
+
+    while (*pEnd != '\0') {
+        pEnd += 1;
+    }
+
+    return (size_t)(pEnd - str);
+}
+
+/*
+Returns the length of str, looking at no more than the first maxToRead bytes. If no null
+terminator is found within that range the result is maxToRead.
+
+The bound is tested before each byte is read so that str[maxToRead] is never touched. This
+matters for fixed-width fields which are not guaranteed to be null terminated.
+*/
+DRWAV_PRIVATE size_t drwav__strlen_clamped(const char* str, size_t maxToRead)
+{
+    size_t result = 0;
+
+    while (result < maxToRead && str[result] != '\0') {
+        result += 1;
+    }
+
+    return result;
+}
+
+/*
+Copies up to maxToRead characters of str into memory taken from the parser and appends a null
+terminator.
+
+Returns the copy, or NULL when the string is empty (no memory is taken in that case).
+*/
+DRWAV_PRIVATE char* drwav__metadata_copy_string(drwav__metadata_parser* pParser, const char* str, size_t maxToRead)
+{
+    char* pCopy;
+    size_t length = drwav__strlen_clamped(str, maxToRead);
+
+    if (length == 0) {
+        return NULL;
+    }
+
+    pCopy = (char*)drwav__metadata_get_memory(pParser, length + 1, 1);
+    DRWAV_ASSERT(pCopy != NULL);
+
+    DRWAV_COPY_MEMORY(pCopy, str, length);
+    pCopy[length] = '\0';
+
+    return pCopy;
+}
+
+/* Sequential reader over a caller-owned, read-only block of memory. */
+typedef struct
+{
+    const void* pBuffer;    /* Start of the memory being read. Not owned by the reader. */
+    size_t sizeInBytes;     /* Total size of pBuffer. */
+    size_t cursor;          /* Offset of the next byte to read, in the range [0, sizeInBytes]. */
+} drwav_buffer_reader;
+
+/*
+Initializes pReader to read from the start of pBuffer. The buffer is referenced, not copied.
+
+Always returns DRWAV_SUCCESS.
+*/
+DRWAV_PRIVATE drwav_result drwav_buffer_reader_init(const void* pBuffer, size_t sizeInBytes, drwav_buffer_reader* pReader)
+{
+    DRWAV_ASSERT(pBuffer != NULL);
+    DRWAV_ASSERT(pReader != NULL);
+
+    /* Start from a clean slate, then fill in each member. */
+    DRWAV_ZERO_OBJECT(pReader);
+
+    pReader->cursor      = 0;
+    pReader->sizeInBytes = sizeInBytes;
+    pReader->pBuffer     = pBuffer;
+
+    return DRWAV_SUCCESS;
+}
+
+/* Returns a pointer to the byte at the reader's current cursor position. */
+DRWAV_PRIVATE const void* drwav_buffer_reader_ptr(const drwav_buffer_reader* pReader)
+{
+    const void* pBase;
+    size_t offset;
+
+    DRWAV_ASSERT(pReader != NULL);
+
+    pBase  = pReader->pBuffer;
+    offset = pReader->cursor;
+
+    return drwav_offset_ptr(pBase, offset);
+}
+
+/*
+Moves the cursor forward by bytesToSeek.
+
+Returns DRWAV_SUCCESS, or DRWAV_BAD_SEEK if the move would take the cursor past the end of the
+buffer, in which case the cursor is left where it was.
+*/
+DRWAV_PRIVATE drwav_result drwav_buffer_reader_seek(drwav_buffer_reader* pReader, size_t bytesToSeek)
+{
+    DRWAV_ASSERT(pReader != NULL);
+
+    /*
+    Compare against the space remaining rather than computing (cursor + bytesToSeek), which can
+    wrap around for very large seek amounts and slip past the check.
+    */
+    if (pReader->cursor > pReader->sizeInBytes || bytesToSeek > (pReader->sizeInBytes - pReader->cursor)) {
+        return DRWAV_BAD_SEEK;  /* Seeking too far forward. */
+    }
+
+    pReader->cursor += bytesToSeek;
+
+    return DRWAV_SUCCESS;
+}
+
+/*
+Reads up to bytesToRead bytes from the reader into pDst and advances the cursor. The request is
+clamped to the bytes remaining, so fewer bytes than requested may be delivered. When pDst is NULL
+the bytes are skipped instead of copied.
+
+The number of bytes actually consumed is written to *pBytesRead when it is non-NULL. This function
+always returns DRWAV_SUCCESS; callers must compare *pBytesRead to detect a short read.
+*/
+DRWAV_PRIVATE drwav_result drwav_buffer_reader_read(drwav_buffer_reader* pReader, void* pDst, size_t bytesToRead, size_t* pBytesRead)
+{
+    drwav_result status = DRWAV_SUCCESS;
+    size_t bytesAvailable;
+
+    DRWAV_ASSERT(pReader != NULL);
+
+    if (pBytesRead != NULL) {
+        *pBytesRead = 0;
+    }
+
+    /* Never hand out more than what is left in the buffer. */
+    bytesAvailable = pReader->sizeInBytes - pReader->cursor;
+    if (bytesAvailable < bytesToRead) {
+        bytesToRead = bytesAvailable;
+    }
+
+    if (pDst != NULL) {
+        /* Read. */
+        DRWAV_COPY_MEMORY(pDst, drwav_buffer_reader_ptr(pReader), bytesToRead);
+        pReader->cursor += bytesToRead;
+    } else {
+        /* Seek. */
+        status = drwav_buffer_reader_seek(pReader, bytesToRead);
+    }
+
+    DRWAV_ASSERT(pReader->cursor <= pReader->sizeInBytes);
+
+    if (status == DRWAV_SUCCESS && pBytesRead != NULL) {
+        *pBytesRead = bytesToRead;
+    }
+
+    return DRWAV_SUCCESS;
+}
+
+/*
+Reads a 16-bit unsigned integer from the reader, decoded with drwav_bytes_to_u16().
+
+*pDst is set to 0 up front so it holds a defined value on every failure path.
+
+Returns DRWAV_SUCCESS when a full value was read, the error from the underlying read if it
+failed, or DRWAV_AT_END when fewer than two bytes were left in the buffer.
+*/
+DRWAV_PRIVATE drwav_result drwav_buffer_reader_read_u16(drwav_buffer_reader* pReader, drwav_uint16* pDst)
+{
+    drwav_result result;
+    size_t bytesRead;
+    drwav_uint8 data[2];
+
+    DRWAV_ASSERT(pReader != NULL);
+    DRWAV_ASSERT(pDst != NULL);
+
+    *pDst = 0;  /* Safety. */
+
+    result = drwav_buffer_reader_read(pReader, data, sizeof(*pDst), &bytesRead);
+    if (result != DRWAV_SUCCESS) {
+        return result;
+    }
+
+    if (bytesRead != sizeof(*pDst)) {
+        return DRWAV_AT_END;    /* Short read: the buffer ran out before a full value was available. */
+    }
+
+    *pDst = drwav_bytes_to_u16(data);
+
+    return DRWAV_SUCCESS;
+}
+
+/*
+Reads a 32-bit unsigned integer from the reader, decoded with drwav_bytes_to_u32().
+
+*pDst is set to 0 up front so it holds a defined value on every failure path.
+
+Returns DRWAV_SUCCESS when a full value was read, the error from the underlying read if it
+failed, or DRWAV_AT_END when fewer than four bytes were left in the buffer.
+*/
+DRWAV_PRIVATE drwav_result drwav_buffer_reader_read_u32(drwav_buffer_reader* pReader, drwav_uint32* pDst)
+{
+    drwav_result result;
+    size_t bytesRead;
+    drwav_uint8 data[4];
+
+    DRWAV_ASSERT(pReader != NULL);
+    DRWAV_ASSERT(pDst != NULL);
+
+    *pDst = 0;  /* Safety. */
+
+    result = drwav_buffer_reader_read(pReader, data, sizeof(*pDst), &bytesRead);
+    if (result != DRWAV_SUCCESS) {
+        return result;
+    }
+
+    if (bytesRead != sizeof(*pDst)) {
+        return DRWAV_AT_END;    /* Short read: the buffer ran out before a full value was available. */
+    }
+
+    *pDst = drwav_bytes_to_u32(data);
+
+    return DRWAV_SUCCESS;
+}
+
+
+
+/*
+Read-stage parser for a "bext" chunk. Reads the fixed 602-byte section into a local buffer and
+decodes it field by field, copying the variable-length strings and the UMID into memory taken from
+the parser. Any bytes in the chunk beyond the fixed section are read as the coding history.
+
+chunkSize is the size of the chunk's data and is assumed to be at least DRWAV_BEXT_BYTES.
+
+Returns the number of bytes consumed from the stream. The metadata object is only written when
+the whole fixed section was read.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_bext_to_metadata_obj(drwav__metadata_parser* pParser, drwav_metadata* pMetadata, drwav_uint64 chunkSize)
+{
+    drwav_uint8 bextData[DRWAV_BEXT_BYTES];
+    size_t bytesRead = drwav__metadata_parser_read(pParser, bextData, sizeof(bextData), NULL);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+
+    if (bytesRead == sizeof(bextData)) {
+        drwav_buffer_reader reader;
+        drwav_uint32 timeReferenceLow;
+        drwav_uint32 timeReferenceHigh;
+        size_t extraBytes;
+
+        pMetadata->type = drwav_metadata_type_bext;
+
+        if (drwav_buffer_reader_init(bextData, bytesRead, &reader) == DRWAV_SUCCESS) {
+            /* The three text fields are fixed width and may or may not be null terminated. */
+            pMetadata->data.bext.pDescription = drwav__metadata_copy_string(pParser, (const char*)drwav_buffer_reader_ptr(&reader), DRWAV_BEXT_DESCRIPTION_BYTES);
+            drwav_buffer_reader_seek(&reader, DRWAV_BEXT_DESCRIPTION_BYTES);
+
+            pMetadata->data.bext.pOriginatorName = drwav__metadata_copy_string(pParser, (const char*)drwav_buffer_reader_ptr(&reader), DRWAV_BEXT_ORIGINATOR_NAME_BYTES);
+            drwav_buffer_reader_seek(&reader, DRWAV_BEXT_ORIGINATOR_NAME_BYTES);
+
+            pMetadata->data.bext.pOriginatorReference = drwav__metadata_copy_string(pParser, (const char*)drwav_buffer_reader_ptr(&reader), DRWAV_BEXT_ORIGINATOR_REF_BYTES);
+            drwav_buffer_reader_seek(&reader, DRWAV_BEXT_ORIGINATOR_REF_BYTES);
+
+            drwav_buffer_reader_read(&reader, pMetadata->data.bext.pOriginationDate, sizeof(pMetadata->data.bext.pOriginationDate), NULL);
+            drwav_buffer_reader_read(&reader, pMetadata->data.bext.pOriginationTime, sizeof(pMetadata->data.bext.pOriginationTime), NULL);
+
+            /* The 64-bit time reference is stored as two 32-bit halves, low half first. */
+            drwav_buffer_reader_read_u32(&reader, &timeReferenceLow);
+            drwav_buffer_reader_read_u32(&reader, &timeReferenceHigh);
+            pMetadata->data.bext.timeReference = ((drwav_uint64)timeReferenceHigh << 32) + timeReferenceLow;
+
+            drwav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.version);
+
+            pMetadata->data.bext.pUMID = drwav__metadata_get_memory(pParser, DRWAV_BEXT_UMID_BYTES, 1);
+            drwav_buffer_reader_read(&reader, pMetadata->data.bext.pUMID, DRWAV_BEXT_UMID_BYTES, NULL);
+
+            drwav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.loudnessValue);
+            drwav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.loudnessRange);
+            drwav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.maxTruePeakLevel);
+            drwav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.maxMomentaryLoudness);
+            drwav_buffer_reader_read_u16(&reader, &pMetadata->data.bext.maxShortTermLoudness);
+
+            /* Only the reserved bytes should be left in the fixed section at this point. */
+            DRWAV_ASSERT((drwav_offset_ptr(drwav_buffer_reader_ptr(&reader), DRWAV_BEXT_RESERVED_BYTES)) == (bextData + DRWAV_BEXT_BYTES));
+
+            extraBytes = (size_t)(chunkSize - DRWAV_BEXT_BYTES);
+            if (extraBytes > 0) {
+                size_t historyBytesRead;
+
+                pMetadata->data.bext.pCodingHistory = (char*)drwav__metadata_get_memory(pParser, extraBytes + 1, 1);
+                DRWAV_ASSERT(pMetadata->data.bext.pCodingHistory != NULL);
+
+                historyBytesRead = drwav__metadata_parser_read(pParser, pMetadata->data.bext.pCodingHistory, extraBytes, NULL);
+                bytesRead += historyBytesRead;
+
+                /*
+                The parser's memory is not zero initialized and the file is not required to null
+                terminate the coding history, so terminate it explicitly right after the bytes that
+                were actually read. Without this the length calculation below could run off the end
+                of the buffer.
+                */
+                if (historyBytesRead > extraBytes) {
+                    historyBytesRead = extraBytes;  /* Defensive. Keeps the terminator inside the block. */
+                }
+                pMetadata->data.bext.pCodingHistory[historyBytesRead] = '\0';
+
+                pMetadata->data.bext.codingHistorySize = (drwav_uint32)drwav__strlen(pMetadata->data.bext.pCodingHistory);
+            } else {
+                pMetadata->data.bext.pCodingHistory    = NULL;
+                pMetadata->data.bext.codingHistorySize = 0;
+            }
+        }
+    }
+
+    return bytesRead;
+}
+
+/*
+Read-stage parser for a label or note sub-chunk: a 32-bit cue point ID followed by a string that
+fills the rest of the chunk. `type` is stored as the metadata type.
+
+Returns the number of bytes consumed from the stream. The metadata object is only written when
+the cue point ID was read in full.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_list_label_or_note_to_metadata_obj(drwav__metadata_parser* pParser, drwav_metadata* pMetadata, drwav_uint64 chunkSize, drwav_metadata_type type)
+{
+    drwav_uint8 cuePointIdData[DRWAV_LIST_LABEL_OR_NOTE_BYTES];
+    drwav_uint64 totalBytesRead = 0;
+    drwav_uint32 stringBytes;
+    size_t idBytesRead;
+
+    idBytesRead = drwav__metadata_parser_read(pParser, cuePointIdData, sizeof(cuePointIdData), &totalBytesRead);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+
+    if (idBytesRead != sizeof(cuePointIdData)) {
+        return totalBytesRead;
+    }
+
+    pMetadata->type = type;
+    pMetadata->data.labelOrNote.cuePointId = drwav_bytes_to_u32(cuePointIdData);
+
+    /* Whatever follows the cue point ID is the string, including its null terminator. */
+    stringBytes = (drwav_uint32)chunkSize - DRWAV_LIST_LABEL_OR_NOTE_BYTES;
+    if (stringBytes == 0) {
+        pMetadata->data.labelOrNote.stringLength = 0;
+        pMetadata->data.labelOrNote.pString      = NULL;
+        return totalBytesRead;
+    }
+
+    pMetadata->data.labelOrNote.stringLength = stringBytes - 1;
+    pMetadata->data.labelOrNote.pString      = (char*)drwav__metadata_get_memory(pParser, stringBytes, 1);
+    DRWAV_ASSERT(pMetadata->data.labelOrNote.pString != NULL);
+
+    drwav__metadata_parser_read(pParser, pMetadata->data.labelOrNote.pString, stringBytes, &totalBytesRead);
+
+    return totalBytesRead;
+}
+
+/*
+Read-stage parser for a labelled cue region sub-chunk: a fixed 20-byte record followed by a string
+that fills the rest of the chunk.
+
+Returns the number of bytes consumed from the stream. The metadata object is only written when
+the fixed record was read in full.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__read_list_labelled_cue_region_to_metadata_obj(drwav__metadata_parser* pParser, drwav_metadata* pMetadata, drwav_uint64 chunkSize)
+{
+    drwav_uint8 record[DRWAV_LIST_LABELLED_TEXT_BYTES];
+    drwav_uint64 totalBytesRead = 0;
+    drwav_uint32 stringBytes;
+    size_t recordBytesRead;
+
+    recordBytesRead = drwav__metadata_parser_read(pParser, record, sizeof(record), &totalBytesRead);
+
+    DRWAV_ASSERT(pParser->stage == drwav__metadata_parser_stage_read);
+
+    if (recordBytesRead != sizeof(record)) {
+        return totalBytesRead;
+    }
+
+    pMetadata->type                                = drwav_metadata_type_list_labelled_cue_region;
+    pMetadata->data.labelledCueRegion.cuePointId   = drwav_bytes_to_u32(record + 0);
+    pMetadata->data.labelledCueRegion.sampleLength = drwav_bytes_to_u32(record + 4);
+    pMetadata->data.labelledCueRegion.purposeId[0] = record[8];
+    pMetadata->data.labelledCueRegion.purposeId[1] = record[9];
+    pMetadata->data.labelledCueRegion.purposeId[2] = record[10];
+    pMetadata->data.labelledCueRegion.purposeId[3] = record[11];
+    pMetadata->data.labelledCueRegion.country      = drwav_bytes_to_u16(record + 12);
+    pMetadata->data.labelledCueRegion.language     = drwav_bytes_to_u16(record + 14);
+    pMetadata->data.labelledCueRegion.dialect      = drwav_bytes_to_u16(record + 16);
+    pMetadata->data.labelledCueRegion.codePage     = drwav_bytes_to_u16(record + 18);
+
+    /* Whatever follows the fixed record is the string, including its null terminator. */
+    stringBytes = (drwav_uint32)chunkSize - DRWAV_LIST_LABELLED_TEXT_BYTES;
+    if (stringBytes == 0) {
+        pMetadata->data.labelledCueRegion.stringLength = 0;
+        pMetadata->data.labelledCueRegion.pString      = NULL;
+        return totalBytesRead;
+    }
+
+    pMetadata->data.labelledCueRegion.stringLength = stringBytes - 1;
+    pMetadata->data.labelledCueRegion.pString      = (char*)drwav__metadata_get_memory(pParser, stringBytes, 1);
+    DRWAV_ASSERT(pMetadata->data.labelledCueRegion.pString != NULL);
+
+    drwav__metadata_parser_read(pParser, pMetadata->data.labelledCueRegion.pString, stringBytes, &totalBytesRead);
+
+    return totalBytesRead;
+}
+
+/*
+Handles a chunk whose entire payload is a single string.
+
+Count stage: tallies one metadata entry and reserves memory for the string. Nothing is read.
+Read stage: fills in the next metadata entry with the given type and reads the string into memory
+taken from the parser. The entry is only committed (metadataCursor advanced) when the string was
+read in full or the chunk is empty.
+
+Returns the number of bytes consumed from the stream.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__metadata_process_info_text_chunk(drwav__metadata_parser* pParser, drwav_uint64 chunkSize, drwav_metadata_type type)
+{
+    drwav_uint32 stringSizeWithNullTerminator = (drwav_uint32)chunkSize;
+    drwav_uint64 bytesRead = 0;
+    drwav_metadata* pMetadata;
+
+    if (pParser->stage == drwav__metadata_parser_stage_count) {
+        pParser->metadataCount += 1;
+        drwav__metadata_request_extra_memory_for_stage_2(pParser, stringSizeWithNullTerminator, 1);
+        return bytesRead;
+    }
+
+    pMetadata = &pParser->pMetadata[pParser->metadataCursor];
+    pMetadata->type = type;
+
+    if (stringSizeWithNullTerminator == 0) {
+        /* Empty chunk. Still counts as a valid entry. */
+        pMetadata->data.infoText.stringLength = 0;
+        pMetadata->data.infoText.pString      = NULL;
+        pParser->metadataCursor += 1;
+        return bytesRead;
+    }
+
+    pMetadata->data.infoText.stringLength = stringSizeWithNullTerminator - 1;
+    pMetadata->data.infoText.pString = (char*)drwav__metadata_get_memory(pParser, stringSizeWithNullTerminator, 1);
+    DRWAV_ASSERT(pMetadata->data.infoText.pString != NULL);
+
+    bytesRead = drwav__metadata_parser_read(pParser, pMetadata->data.infoText.pString, (size_t)stringSizeWithNullTerminator, NULL);
+    if (bytesRead == chunkSize) {
+        pParser->metadataCursor += 1;
+    }
+    /* Otherwise parsing failed and the entry is left uncommitted. */
+
+    return bytesRead;
+}
+
+/*
+Handles a chunk that has no dedicated parser by storing its ID, location and raw bytes.
+
+Chunks with an invalid location, and the "data", "fmt " and "fact" chunks, are ignored.
+
+Count stage: tallies one metadata entry and reserves memory for the raw bytes. Nothing is read.
+Read stage: fills in the next metadata entry and reads the raw bytes into memory taken from the
+parser. The entry is only committed (metadataCursor advanced) when all bytes were read.
+
+Returns the number of bytes consumed from the stream.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__metadata_process_unknown_chunk(drwav__metadata_parser* pParser, const drwav_uint8* pChunkId, drwav_uint64 chunkSize, drwav_metadata_location location)
+{
+    drwav_uint64 bytesRead = 0;
+    drwav_metadata* pMetadata;
+    int iChar;
+
+    if (location == drwav_metadata_location_invalid) {
+        return 0;
+    }
+
+    if (drwav_fourcc_equal(pChunkId, "data") || drwav_fourcc_equal(pChunkId, "fmt ") || drwav_fourcc_equal(pChunkId, "fact")) {
+        return 0;
+    }
+
+    if (pParser->stage == drwav__metadata_parser_stage_count) {
+        pParser->metadataCount += 1;
+        drwav__metadata_request_extra_memory_for_stage_2(pParser, (size_t)chunkSize, 1);
+        return bytesRead;
+    }
+
+    pMetadata = &pParser->pMetadata[pParser->metadataCursor];
+    pMetadata->type                       = drwav_metadata_type_unknown;
+    pMetadata->data.unknown.chunkLocation = location;
+
+    for (iChar = 0; iChar < 4; iChar += 1) {
+        pMetadata->data.unknown.id[iChar] = pChunkId[iChar];
+    }
+
+    pMetadata->data.unknown.dataSizeInBytes = (drwav_uint32)chunkSize;
+    pMetadata->data.unknown.pData           = (drwav_uint8 *)drwav__metadata_get_memory(pParser, (size_t)chunkSize, 1);
+    DRWAV_ASSERT(pMetadata->data.unknown.pData != NULL);
+
+    bytesRead = drwav__metadata_parser_read(pParser, pMetadata->data.unknown.pData, pMetadata->data.unknown.dataSizeInBytes, NULL);
+    if (bytesRead == pMetadata->data.unknown.dataSizeInBytes) {
+        pParser->metadataCursor += 1;
+    }
+    /* Otherwise the read failed and the entry is left uncommitted. */
+
+    return bytesRead;
+}
+
+/*
+Returns non-zero when `type` is enabled in allowedMetadataTypes and the chunk's four character ID
+equals pID. The ID comparison is skipped when the type is not enabled.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__chunk_matches(drwav_metadata_type allowedMetadataTypes, const drwav_uint8* pChunkID, drwav_metadata_type type, const char* pID)
+{
+    if ((allowedMetadataTypes & type) == 0) {
+        return 0;
+    }
+
+    return drwav_fourcc_equal(pChunkID, pID) != 0;
+}
+
+/*
+Processes one chunk for the metadata parser.
+
+The behaviour depends on pParser->stage:
+
+  - In the counting stage (drwav__metadata_parser_stage_count) nothing is stored. The metadata count is
+    incremented and any extra memory the item will need later is requested.
+  - Otherwise the chunk is parsed into pParser->pMetadata[pParser->metadataCursor] and the cursor is
+    advanced only if the whole chunk was consumed.
+
+Only chunk types enabled in allowedMetadataTypes are handled. "LIST"/"list" chunks are always walked so
+that their sub-chunks can be filtered individually. Anything else goes to the unknown-chunk handler when
+drwav_metadata_type_unknown is enabled.
+
+The return value is the number of bytes of chunk data that were read or seeked past. It can be less than
+pChunkHeader->sizeInBytes; presumably the caller skips whatever remains (not visible from here).
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__metadata_process_chunk(drwav__metadata_parser* pParser, const drwav_chunk_header* pChunkHeader, drwav_metadata_type allowedMetadataTypes)
+{
+    const drwav_uint8 *pChunkID = pChunkHeader->id.fourcc;
+    drwav_uint64 bytesRead = 0;
+
+    if (drwav__chunk_matches(allowedMetadataTypes, pChunkID, drwav_metadata_type_smpl, "smpl")) {
+        if (pChunkHeader->sizeInBytes >= DRWAV_SMPL_BYTES) {
+            if (pParser->stage == drwav__metadata_parser_stage_count) {
+                drwav_uint8 buffer[4];
+                size_t bytesJustRead;
+
+                /* Skip the first 28 bytes of the chunk. The 4 bytes that follow are the loop count. */
+                if (!pParser->onSeek(pParser->pReadSeekUserData, 28, DRWAV_SEEK_CUR)) {
+                    return bytesRead;
+                }
+                bytesRead += 28;
+
+                bytesJustRead = drwav__metadata_parser_read(pParser, buffer, sizeof(buffer), &bytesRead);
+                if (bytesJustRead == sizeof(buffer)) {
+                    drwav_uint32 loopCount = drwav_bytes_to_u32(buffer);
+                    drwav_uint64 calculatedLoopCount;
+
+                    /* The loop count must be validated against the size of the chunk. */
+                    calculatedLoopCount = (pChunkHeader->sizeInBytes - DRWAV_SMPL_BYTES) / DRWAV_SMPL_LOOP_BYTES;
+                    if (calculatedLoopCount == loopCount) {
+                        /* The next 4 bytes are the size of the sampler specific data. */
+                        bytesJustRead = drwav__metadata_parser_read(pParser, buffer, sizeof(buffer), &bytesRead);
+                        if (bytesJustRead == sizeof(buffer)) {
+                            drwav_uint32 samplerSpecificDataSizeInBytes = drwav_bytes_to_u32(buffer);
+
+                            /* Only count the chunk once everything needed to size it has been read. */
+                            pParser->metadataCount += 1;
+                            drwav__metadata_request_extra_memory_for_stage_2(pParser, sizeof(drwav_smpl_loop) * loopCount, DRWAV_METADATA_ALIGNMENT);
+                            drwav__metadata_request_extra_memory_for_stage_2(pParser, samplerSpecificDataSizeInBytes, 1);
+                        }
+                    } else {
+                        /* Loop count in header does not match the size of the chunk. */
+                    }
+                }
+            } else {
+                bytesRead = drwav__read_smpl_to_metadata_obj(pParser, pChunkHeader, &pParser->pMetadata[pParser->metadataCursor]);
+                if (bytesRead == pChunkHeader->sizeInBytes) {
+                    pParser->metadataCursor += 1;
+                } else {
+                    /* Failed to parse. The slot is not consumed. */
+                }
+            }
+        } else {
+            /* Incorrectly formed chunk. */
+        }
+    } else if (drwav__chunk_matches(allowedMetadataTypes, pChunkID, drwav_metadata_type_inst, "inst")) {
+        /* Fixed size chunk, so anything other than an exact size match is rejected. */
+        if (pChunkHeader->sizeInBytes == DRWAV_INST_BYTES) {
+            if (pParser->stage == drwav__metadata_parser_stage_count) {
+                pParser->metadataCount += 1;
+            } else {
+                bytesRead = drwav__read_inst_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor]);
+                if (bytesRead == pChunkHeader->sizeInBytes) {
+                    pParser->metadataCursor += 1;
+                } else {
+                    /* Failed to parse. */
+                }
+            }
+        } else {
+            /* Incorrectly formed chunk. */
+        }
+    } else if (drwav__chunk_matches(allowedMetadataTypes, pChunkID, drwav_metadata_type_acid, "acid")) {
+        /* Fixed size chunk, so anything other than an exact size match is rejected. */
+        if (pChunkHeader->sizeInBytes == DRWAV_ACID_BYTES) {
+            if (pParser->stage == drwav__metadata_parser_stage_count) {
+                pParser->metadataCount += 1;
+            } else {
+                bytesRead = drwav__read_acid_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor]);
+                if (bytesRead == pChunkHeader->sizeInBytes) {
+                    pParser->metadataCursor += 1;
+                } else {
+                    /* Failed to parse. */
+                }
+            }
+        } else {
+            /* Incorrectly formed chunk. */
+        }
+    } else if (drwav__chunk_matches(allowedMetadataTypes, pChunkID, drwav_metadata_type_cue, "cue ")) {
+        if (pChunkHeader->sizeInBytes >= DRWAV_CUE_BYTES) {
+            if (pParser->stage == drwav__metadata_parser_stage_count) {
+                size_t cueCount;
+
+                /* The number of cue points is derived from the chunk size rather than read from the chunk. */
+                pParser->metadataCount += 1;
+                cueCount = (size_t)(pChunkHeader->sizeInBytes - DRWAV_CUE_BYTES) / DRWAV_CUE_POINT_BYTES;
+                drwav__metadata_request_extra_memory_for_stage_2(pParser, sizeof(drwav_cue_point) * cueCount, DRWAV_METADATA_ALIGNMENT);
+            } else {
+                bytesRead = drwav__read_cue_to_metadata_obj(pParser, pChunkHeader, &pParser->pMetadata[pParser->metadataCursor]);
+                if (bytesRead == pChunkHeader->sizeInBytes) {
+                    pParser->metadataCursor += 1;
+                } else {
+                    /* Failed to parse. */
+                }
+            }
+        } else {
+            /* Incorrectly formed chunk. */
+        }
+    } else if (drwav__chunk_matches(allowedMetadataTypes, pChunkID, drwav_metadata_type_bext, "bext")) {
+        if (pChunkHeader->sizeInBytes >= DRWAV_BEXT_BYTES) {
+            if (pParser->stage == drwav__metadata_parser_stage_count) {
+                /* The description field is the largest one in a bext chunk, so that is the max size of this temporary buffer. */
+                char buffer[DRWAV_BEXT_DESCRIPTION_BYTES + 1];
+                size_t allocSizeNeeded = DRWAV_BEXT_UMID_BYTES; /* We know we will need SMPTE umid size. */
+                size_t bytesJustRead;
+
+                /*
+                Each of the three string fields below is read into the same buffer. A null terminator is written
+                just past the end of the field first so the length can be measured even when the field in the
+                file has no terminator of its own.
+                */
+                buffer[DRWAV_BEXT_DESCRIPTION_BYTES] = '\0';
+                bytesJustRead = drwav__metadata_parser_read(pParser, buffer, DRWAV_BEXT_DESCRIPTION_BYTES, &bytesRead);
+                if (bytesJustRead != DRWAV_BEXT_DESCRIPTION_BYTES) {
+                    return bytesRead;
+                }
+                allocSizeNeeded += drwav__strlen(buffer) + 1;
+
+                buffer[DRWAV_BEXT_ORIGINATOR_NAME_BYTES] = '\0';
+                bytesJustRead = drwav__metadata_parser_read(pParser, buffer, DRWAV_BEXT_ORIGINATOR_NAME_BYTES, &bytesRead);
+                if (bytesJustRead != DRWAV_BEXT_ORIGINATOR_NAME_BYTES) {
+                    return bytesRead;
+                }
+                allocSizeNeeded += drwav__strlen(buffer) + 1;
+
+                buffer[DRWAV_BEXT_ORIGINATOR_REF_BYTES] = '\0';
+                bytesJustRead = drwav__metadata_parser_read(pParser, buffer, DRWAV_BEXT_ORIGINATOR_REF_BYTES, &bytesRead);
+                if (bytesJustRead != DRWAV_BEXT_ORIGINATOR_REF_BYTES) {
+                    return bytesRead;
+                }
+                allocSizeNeeded += drwav__strlen(buffer) + 1;
+                allocSizeNeeded += (size_t)pChunkHeader->sizeInBytes - DRWAV_BEXT_BYTES + 1; /* Coding history. */
+
+                drwav__metadata_request_extra_memory_for_stage_2(pParser, allocSizeNeeded, 1);
+
+                pParser->metadataCount += 1;
+            } else {
+                bytesRead = drwav__read_bext_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor], pChunkHeader->sizeInBytes);
+                if (bytesRead == pChunkHeader->sizeInBytes) {
+                    pParser->metadataCursor += 1;
+                } else {
+                    /* Failed to parse. */
+                }
+            }
+        } else {
+            /* Incorrectly formed chunk. */
+        }
+    } else if (drwav_fourcc_equal(pChunkID, "LIST") || drwav_fourcc_equal(pChunkID, "list")) {
+        /* The list type is remembered across iterations and is only used to tag unknown sub-chunks with their location. */
+        drwav_metadata_location listType = drwav_metadata_location_invalid;
+        while (bytesRead < pChunkHeader->sizeInBytes) {
+            drwav_uint8 subchunkId[4];
+            drwav_uint8 subchunkSizeBuffer[4];
+            drwav_uint64 subchunkDataSize;
+            drwav_uint64 subchunkBytesRead = 0;
+            drwav_uint64 bytesJustRead = drwav__metadata_parser_read(pParser, subchunkId, sizeof(subchunkId), &bytesRead);
+            if (bytesJustRead != sizeof(subchunkId)) {
+                break;
+            }
+
+            /*
+            The first thing in a list chunk should be "adtl" or "INFO".
+
+              - adtl means this list is a Associated Data List Chunk and will contain labels, notes
+                or labelled cue regions.
+              - INFO means this list is an Info List Chunk containing info text chunks such as IPRD
+                which would specifies the album of this wav file.
+
+            No data follows the adtl or INFO id so we just make note of what type this list is and
+            continue.
+            */
+            if (drwav_fourcc_equal(subchunkId, "adtl")) {
+                listType = drwav_metadata_location_inside_adtl_list;
+                continue;
+            } else if (drwav_fourcc_equal(subchunkId, "INFO")) {
+                listType = drwav_metadata_location_inside_info_list;
+                continue;
+            }
+
+            /* Anything else is a regular sub-chunk: a 4 byte ID (already read) followed by a 4 byte size. */
+            bytesJustRead = drwav__metadata_parser_read(pParser, subchunkSizeBuffer, sizeof(subchunkSizeBuffer), &bytesRead);
+            if (bytesJustRead != sizeof(subchunkSizeBuffer)) {
+                break;
+            }
+            subchunkDataSize = drwav_bytes_to_u32(subchunkSizeBuffer);
+
+            if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_label, "labl") || drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_note, "note")) {
+                if (subchunkDataSize >= DRWAV_LIST_LABEL_OR_NOTE_BYTES) {
+                    drwav_uint64 stringSizeWithNullTerm = subchunkDataSize - DRWAV_LIST_LABEL_OR_NOTE_BYTES;
+                    if (pParser->stage == drwav__metadata_parser_stage_count) {
+                        pParser->metadataCount += 1;
+                        drwav__metadata_request_extra_memory_for_stage_2(pParser, (size_t)stringSizeWithNullTerm, 1);
+                    } else {
+                        subchunkBytesRead = drwav__read_list_label_or_note_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor], subchunkDataSize, drwav_fourcc_equal(subchunkId, "labl") ? drwav_metadata_type_list_label : drwav_metadata_type_list_note);
+                        if (subchunkBytesRead == subchunkDataSize) {
+                            pParser->metadataCursor += 1;
+                        } else {
+                            /* Failed to parse. */
+                        }
+                    }
+                } else {
+                    /* Incorrectly formed chunk. */
+                }
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_labelled_cue_region, "ltxt")) {
+                if (subchunkDataSize >= DRWAV_LIST_LABELLED_TEXT_BYTES) {
+                    drwav_uint64 stringSizeWithNullTerminator = subchunkDataSize - DRWAV_LIST_LABELLED_TEXT_BYTES;
+                    if (pParser->stage == drwav__metadata_parser_stage_count) {
+                        pParser->metadataCount += 1;
+                        drwav__metadata_request_extra_memory_for_stage_2(pParser, (size_t)stringSizeWithNullTerminator, 1);
+                    } else {
+                        subchunkBytesRead = drwav__read_list_labelled_cue_region_to_metadata_obj(pParser, &pParser->pMetadata[pParser->metadataCursor], subchunkDataSize);
+                        if (subchunkBytesRead == subchunkDataSize) {
+                            pParser->metadataCursor += 1;
+                        } else {
+                            /* Failed to parse. */
+                        }
+                    }
+                } else {
+                    /* Incorrectly formed chunk. */
+                }
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_software, "ISFT")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_software);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_copyright, "ICOP")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_copyright);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_title, "INAM")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_title);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_artist, "IART")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_artist);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_comment, "ICMT")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_comment);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_date, "ICRD")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_date);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_genre, "IGNR")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_genre);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_album, "IPRD")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_album);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_tracknumber, "ITRK")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_tracknumber);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_location, "IARL")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_location);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_organization, "ICMS")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_organization);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_keywords, "IKEY")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_keywords);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_medium, "IMED")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_medium);
+            } else if (drwav__chunk_matches(allowedMetadataTypes, subchunkId, drwav_metadata_type_list_info_description, "ISBJ")) {
+                subchunkBytesRead = drwav__metadata_process_info_text_chunk(pParser, subchunkDataSize,  drwav_metadata_type_list_info_description);
+            } else if ((allowedMetadataTypes & drwav_metadata_type_unknown) != 0) {
+                subchunkBytesRead = drwav__metadata_process_unknown_chunk(pParser, subchunkId, subchunkDataSize, listType);
+            }
+
+            bytesRead += subchunkBytesRead;
+            DRWAV_ASSERT(subchunkBytesRead <= subchunkDataSize);
+
+            if (subchunkBytesRead < subchunkDataSize) {
+                drwav_uint64 bytesToSeek = subchunkDataSize - subchunkBytesRead;
+
+                /*
+                Skip whatever is left of the sub-chunk. The size comes from a 32-bit field in the file so the
+                remainder can be larger than INT_MAX. Casting it to int for onSeek would turn it into a negative
+                offset, so the skip goes through drwav__seek_forward() which takes a 64-bit offset.
+                */
+                if (!drwav__seek_forward(pParser->onSeek, bytesToSeek, pParser->pReadSeekUserData)) {
+                    break;
+                }
+                bytesRead += bytesToSeek;
+            }
+
+            /* Sub-chunks with an odd size are followed by one byte of padding which is not part of the size. */
+            if ((subchunkDataSize % 2) == 1) {
+                if (!pParser->onSeek(pParser->pReadSeekUserData, 1, DRWAV_SEEK_CUR)) {
+                    break;
+                }
+                bytesRead += 1;
+            }
+        }
+    } else if ((allowedMetadataTypes & drwav_metadata_type_unknown) != 0) {
+        bytesRead = drwav__metadata_process_unknown_chunk(pParser, pChunkID, pChunkHeader->sizeInBytes, drwav_metadata_location_top_level);
+    }
+
+    return bytesRead;
+}
+
+
+/*
+Returns the number of bytes occupied by one PCM frame of the given file, or 0 when the format is known
+to be undecodable (a-law or mu-law that is not exactly 1 byte per channel).
+*/
+DRWAV_PRIVATE drwav_uint32 drwav_get_bytes_per_pcm_frame(drwav* pWav)
+{
+    drwav_uint32 bytesPerFrame;
+
+    /*
+    The bytes per frame is a bit ambiguous. It can be either be based on the bits per sample, or the block align. The way I'm doing it here
+    is that if the bits per sample is a multiple of 8, use floor(bitsPerSample*channels/8), otherwise fall back to the block align.
+    */
+    if ((pWav->bitsPerSample & 0x7) == 0) {
+        /*
+        Bits per sample is a multiple of 8. The multiplication is done explicitly in 32-bit unsigned arithmetic.
+        Both values presumably come from 16-bit fields of the file, and multiplying them as-is would promote them
+        to signed int, which overflows for large values in a malformed file. The product of two 16-bit values
+        always fits in 32 unsigned bits.
+        */
+        bytesPerFrame = ((drwav_uint32)pWav->bitsPerSample * (drwav_uint32)pWav->fmt.channels) >> 3;
+    } else {
+        bytesPerFrame = pWav->fmt.blockAlign;
+    }
+
+    /* Validation for known formats. a-law and mu-law should be 1 byte per channel. If it's not, it's not decodable. */
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW || pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
+        if (bytesPerFrame != pWav->fmt.channels) {
+            return 0;   /* Invalid file. */
+        }
+    }
+
+    return bytesPerFrame;
+}
+
+/*
+Retrieves the effective format tag described by a drwav_fmt structure. For WAVE_FORMAT_EXTENSIBLE the
+tag is taken from the sub-format GUID instead of the formatTag field. Returns 0 when pFMT is NULL.
+*/
+DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT)
+{
+    drwav_uint16 effectiveTag = 0;
+
+    if (pFMT != NULL) {
+        if (pFMT->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
+            /* Only the first two bytes of the sub-format are required. */
+            effectiveTag = drwav_bytes_to_u16(pFMT->subFormat);
+        } else {
+            effectiveTag = pFMT->formatTag;
+        }
+    }
+
+    return effectiveTag;
+}
+
+/*
+First step of initialization. Clears the drwav object, stores the I/O callbacks and user data, and
+resolves the allocation callbacks.
+
+Returns DRWAV_FALSE if pWav, onRead or onSeek is NULL, or if the resolved allocation callbacks are
+unusable (no onFree, or neither onMalloc nor onRealloc). Returns DRWAV_TRUE otherwise.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_preinit(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pReadSeekTellUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_allocation_callbacks* pResolvedCallbacks;
+
+    /* Nothing can be done without an object to initialize. */
+    if (pWav == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    /* Reading and seeking are mandatory. onTell is optional so it is not checked. */
+    if (onRead == NULL || onSeek == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    /* Start from a clean slate before filling anything in. */
+    DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav));
+
+    pWav->pUserData = pReadSeekTellUserData;
+    pWav->onTell    = onTell;
+    pWav->onSeek    = onSeek;
+    pWav->onRead    = onRead;
+    pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
+
+    /* Memory must be freeable, and there must be at least one way of allocating it. */
+    pResolvedCallbacks = &pWav->allocationCallbacks;
+    if (pResolvedCallbacks->onFree == NULL) {
+        return DRWAV_FALSE;    /* Invalid allocation callbacks. */
+    }
+    if (pResolvedCallbacks->onMalloc == NULL && pResolvedCallbacks->onRealloc == NULL) {
+        return DRWAV_FALSE;    /* Invalid allocation callbacks. */
+    }
+
+    return DRWAV_TRUE;
+}
+
+DRWAV_PRIVATE drwav_bool32 drwav_init__internal(drwav* pWav, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags)
+{
+    /* This function assumes drwav_preinit() has been called beforehand. */
+    drwav_result result;
+    drwav_uint64 cursor;    /* <-- Keeps track of the byte position so we can seek to specific locations. */
+    drwav_bool32 sequential;
+    drwav_uint8 riff[4];
+    drwav_fmt fmt;
+    unsigned short translatedFormatTag;
+    drwav_uint64 dataChunkSize = 0;             /* <-- Important! Don't explicitly set this to 0 anywhere else. Calculation of the size of the data chunk is performed in different paths depending on the container. */
+    drwav_uint64 sampleCountFromFactChunk = 0;  /* Same as dataChunkSize - make sure this is the only place this is initialized to 0. */
+    drwav_uint64 metadataStartPos;
+    drwav__metadata_parser metadataParser;
+    drwav_bool8 isProcessingMetadata = DRWAV_FALSE;
+    drwav_bool8 foundChunk_fmt  = DRWAV_FALSE;
+    drwav_bool8 foundChunk_data = DRWAV_FALSE;
+    drwav_bool8 isAIFCFormType = DRWAV_FALSE;   /* Only used with AIFF. */
+    drwav_uint64 aiffFrameCount = 0;
+
+    cursor = 0;
+    sequential = (flags & DRWAV_SEQUENTIAL) != 0;
+    DRWAV_ZERO_OBJECT(&fmt);
+
+    /* The first 4 bytes should be the RIFF identifier. */
+    if (drwav__on_read(pWav->onRead, pWav->pUserData, riff, sizeof(riff), &cursor) != sizeof(riff)) {
+        return DRWAV_FALSE;
+    }
+
+    /*
+    The first 4 bytes can be used to identify the container. For RIFF files it will start with "RIFF" and for
+    w64 it will start with "riff".
+    */
+    if (drwav_fourcc_equal(riff, "RIFF")) {
+        pWav->container = drwav_container_riff;
+    } else if (drwav_fourcc_equal(riff, "RIFX")) {
+        pWav->container = drwav_container_rifx;
+    } else if (drwav_fourcc_equal(riff, "riff")) {
+        int i;
+        drwav_uint8 riff2[12];
+
+        pWav->container = drwav_container_w64;
+
+        /* Check the rest of the GUID for validity. */
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, riff2, sizeof(riff2), &cursor) != sizeof(riff2)) {
+            return DRWAV_FALSE;
+        }
+
+        for (i = 0; i < 12; ++i) {
+            if (riff2[i] != drwavGUID_W64_RIFF[i+4]) {
+                return DRWAV_FALSE;
+            }
+        }
+    } else if (drwav_fourcc_equal(riff, "RF64")) {
+        pWav->container = drwav_container_rf64;
+    } else if (drwav_fourcc_equal(riff, "FORM")) {
+        pWav->container = drwav_container_aiff;
+    } else {
+        return DRWAV_FALSE;   /* Unknown or unsupported container. */
+    }
+
+
+    if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx || pWav->container == drwav_container_rf64) {
+        drwav_uint8 chunkSizeBytes[4];
+        drwav_uint8 wave[4];
+
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
+            return DRWAV_FALSE;
+        }
+
+        if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx) {
+            if (drwav_bytes_to_u32_ex(chunkSizeBytes, pWav->container) < 36) {
+                /*
+                I've had a report of a WAV file failing to load when the size of the WAVE chunk is not encoded
+                and is instead just set to 0. I'm going to relax the validation here to allow these files to
+                load. Considering the chunk size isn't actually used this should be safe. With this change my
+                test suite still passes.
+                */
+                /*return DRWAV_FALSE;*/    /* Chunk size should always be at least 36 bytes. */
+            }
+        } else if (pWav->container == drwav_container_rf64) {
+            if (drwav_bytes_to_u32_le(chunkSizeBytes) != 0xFFFFFFFF) {
+                return DRWAV_FALSE;    /* Chunk size should always be set to -1/0xFFFFFFFF for RF64. The actual size is retrieved later. */
+            }
+        } else {
+            return DRWAV_FALSE; /* Should never hit this. */
+        }
+
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
+            return DRWAV_FALSE;
+        }
+
+        if (!drwav_fourcc_equal(wave, "WAVE")) {
+            return DRWAV_FALSE;    /* Expecting "WAVE". */
+        }
+    } else if (pWav->container == drwav_container_w64) {
+        drwav_uint8 chunkSizeBytes[8];
+        drwav_uint8 wave[16];
+
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
+            return DRWAV_FALSE;
+        }
+
+        if (drwav_bytes_to_u64(chunkSizeBytes) < 80) {
+            return DRWAV_FALSE;
+        }
+
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
+            return DRWAV_FALSE;
+        }
+
+        if (!drwav_guid_equal(wave, drwavGUID_W64_WAVE)) {
+            return DRWAV_FALSE;
+        }
+    } else if (pWav->container == drwav_container_aiff) {
+        drwav_uint8 chunkSizeBytes[4];
+        drwav_uint8 aiff[4];
+
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
+            return DRWAV_FALSE;
+        }
+
+        if (drwav_bytes_to_u32_be(chunkSizeBytes) < 18) {
+            return DRWAV_FALSE;
+        }
+
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, aiff, sizeof(aiff), &cursor) != sizeof(aiff)) {
+            return DRWAV_FALSE;
+        }
+
+        if (drwav_fourcc_equal(aiff, "AIFF")) {
+            isAIFCFormType = DRWAV_FALSE;
+        } else if (drwav_fourcc_equal(aiff, "AIFC")) {
+            isAIFCFormType = DRWAV_TRUE;
+        } else {
+            return DRWAV_FALSE; /* Expecting "AIFF" or "AIFC". */
+        }
+    } else {
+        return DRWAV_FALSE;
+    }
+
+
+    /* For RF64, the "ds64" chunk must come next, before the "fmt " chunk. */
+    if (pWav->container == drwav_container_rf64) {
+        drwav_uint8 sizeBytes[8];
+        drwav_uint64 bytesRemainingInChunk;
+        drwav_chunk_header header;
+        result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
+        if (result != DRWAV_SUCCESS) {
+            return DRWAV_FALSE;
+        }
+
+        if (!drwav_fourcc_equal(header.id.fourcc, "ds64")) {
+            return DRWAV_FALSE; /* Expecting "ds64". */
+        }
+
+        bytesRemainingInChunk = header.sizeInBytes + header.paddingSize;
+
+        /* We don't care about the size of the RIFF chunk - skip it. */
+        if (!drwav__seek_forward(pWav->onSeek, 8, pWav->pUserData)) {
+            return DRWAV_FALSE;
+        }
+        bytesRemainingInChunk -= 8;
+        cursor += 8;
+
+
+        /* Next 8 bytes is the size of the "data" chunk. */
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
+            return DRWAV_FALSE;
+        }
+        bytesRemainingInChunk -= 8;
+        dataChunkSize = drwav_bytes_to_u64(sizeBytes);
+
+
+        /* Next 8 bytes is the sample count which we would usually derive from the FACT chunk if it was available. */
+        if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
+            return DRWAV_FALSE;
+        }
+        bytesRemainingInChunk -= 8;
+        sampleCountFromFactChunk = drwav_bytes_to_u64(sizeBytes);
+
+
+        /* Skip over everything else. */
+        if (!drwav__seek_forward(pWav->onSeek, bytesRemainingInChunk, pWav->pUserData)) {
+            return DRWAV_FALSE;
+        }
+        cursor += bytesRemainingInChunk;
+    }
+
+
+    metadataStartPos = cursor;
+
+    /*
+    Whether or not we are processing metadata controls how we load. We can load more efficiently when
+    metadata is not being processed, but we also cannot process metadata for Wave64 because I have not
+    been able to test it. If someone is able to test this and provide a patch I'm happy to enable it.
+
+    Sequential mode cannot support metadata because it involves seeking backwards.
+    */
+    isProcessingMetadata = !sequential && ((flags & DRWAV_WITH_METADATA) != 0);
+
+    /* Don't allow processing of metadata with untested containers. */
+    if (pWav->container != drwav_container_riff && pWav->container != drwav_container_rf64) {
+        isProcessingMetadata = DRWAV_FALSE;
+    }
+
+    DRWAV_ZERO_MEMORY(&metadataParser, sizeof(metadataParser));
+    if (isProcessingMetadata) {
+        metadataParser.onRead = pWav->onRead;
+        metadataParser.onSeek = pWav->onSeek;
+        metadataParser.pReadSeekUserData = pWav->pUserData;
+        metadataParser.stage  = drwav__metadata_parser_stage_count;
+    }
+
+
+    /*
+    From here on out, chunks might be in any order. In order to robustly handle metadata we'll need
+    to loop through every chunk and handle them as we find them. In sequential mode we need to get
+    out of the loop as soon as we find the data chunk because we won't be able to seek back.
+    */
+    for (;;) {  /* For each chunk... */
+        drwav_chunk_header header;
+        drwav_uint64 chunkSize;
+
+        result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
+        if (result != DRWAV_SUCCESS) {
+            break;
+        }
+
+        chunkSize = header.sizeInBytes;
+
+
+        /*
+        Always tell the caller about this chunk. We cannot do this in sequential mode because the
+        callback is allowed to read from the file, in which case we'll need to rewind.
+        */
+        if (!sequential && onChunk != NULL) {
+            drwav_uint64 callbackBytesRead = onChunk(pChunkUserData, pWav->onRead, pWav->onSeek, pWav->pUserData, &header, pWav->container, &fmt);
+
+            /*
+            dr_wav may need to read the contents of the chunk, so we now need to seek back to the position before
+            we called the callback.
+            */
+            if (callbackBytesRead > 0) {
+                if (drwav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData) == DRWAV_FALSE) {
+                    return DRWAV_FALSE;
+                }
+            }
+        }
+
+
+        /* Explicitly handle known chunks first. */
+
+        /* "fmt " */
+        if (((pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx || pWav->container == drwav_container_rf64) && drwav_fourcc_equal(header.id.fourcc, "fmt ")) ||
+            ((pWav->container == drwav_container_w64) && drwav_guid_equal(header.id.guid, drwavGUID_W64_FMT))) {
+            drwav_uint8 fmtData[16];
+
+            foundChunk_fmt = DRWAV_TRUE;
+
+            if (pWav->onRead(pWav->pUserData, fmtData, sizeof(fmtData)) != sizeof(fmtData)) {
+                return DRWAV_FALSE;
+            }
+            cursor += sizeof(fmtData);
+
+            fmt.formatTag      = drwav_bytes_to_u16_ex(fmtData + 0,  pWav->container);
+            fmt.channels       = drwav_bytes_to_u16_ex(fmtData + 2,  pWav->container);
+            fmt.sampleRate     = drwav_bytes_to_u32_ex(fmtData + 4,  pWav->container);
+            fmt.avgBytesPerSec = drwav_bytes_to_u32_ex(fmtData + 8,  pWav->container);
+            fmt.blockAlign     = drwav_bytes_to_u16_ex(fmtData + 12, pWav->container);
+            fmt.bitsPerSample  = drwav_bytes_to_u16_ex(fmtData + 14, pWav->container);
+
+            fmt.extendedSize       = 0;
+            fmt.validBitsPerSample = 0;
+            fmt.channelMask        = 0;
+            DRWAV_ZERO_MEMORY(fmt.subFormat, sizeof(fmt.subFormat));
+
+            if (header.sizeInBytes > 16) {
+                drwav_uint8 fmt_cbSize[2];
+                int bytesReadSoFar = 0;
+
+                if (pWav->onRead(pWav->pUserData, fmt_cbSize, sizeof(fmt_cbSize)) != sizeof(fmt_cbSize)) {
+                    return DRWAV_FALSE;    /* Expecting more data. */
+                }
+                cursor += sizeof(fmt_cbSize);
+
+                bytesReadSoFar = 18;
+
+                fmt.extendedSize = drwav_bytes_to_u16_ex(fmt_cbSize, pWav->container);
+                if (fmt.extendedSize > 0) {
+                    /* Simple validation. */
+                    if (fmt.formatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
+                        if (fmt.extendedSize != 22) {
+                            return DRWAV_FALSE;
+                        }
+                    }
+
+                    if (fmt.formatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
+                        drwav_uint8 fmtext[22];
+
+                        if (pWav->onRead(pWav->pUserData, fmtext, fmt.extendedSize) != fmt.extendedSize) {
+                            return DRWAV_FALSE;    /* Expecting more data. */
+                        }
+
+                        fmt.validBitsPerSample = drwav_bytes_to_u16_ex(fmtext + 0, pWav->container);
+                        fmt.channelMask        = drwav_bytes_to_u32_ex(fmtext + 2, pWav->container);
+                        drwav_bytes_to_guid(fmtext + 6, fmt.subFormat);
+                    } else {
+                        if (pWav->onSeek(pWav->pUserData, fmt.extendedSize, DRWAV_SEEK_CUR) == DRWAV_FALSE) {
+                            return DRWAV_FALSE;
+                        }
+                    }
+                    cursor += fmt.extendedSize;
+
+                    bytesReadSoFar += fmt.extendedSize;
+                }
+
+                /* Seek past any leftover bytes. For w64 the leftover will be defined based on the chunk size. */
+                if (pWav->onSeek(pWav->pUserData, (int)(header.sizeInBytes - bytesReadSoFar), DRWAV_SEEK_CUR) == DRWAV_FALSE) {
+                    return DRWAV_FALSE;
+                }
+                cursor += (header.sizeInBytes - bytesReadSoFar);
+            }
+
+            if (header.paddingSize > 0) {
+                if (drwav__seek_forward(pWav->onSeek, header.paddingSize, pWav->pUserData) == DRWAV_FALSE) {
+                    break;
+                }
+                cursor += header.paddingSize;
+            }
+
+            /* Go to the next chunk. Don't include this chunk in metadata. */
+            continue;
+        }
+
+        /* "data" */
+        if (((pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx || pWav->container == drwav_container_rf64) && drwav_fourcc_equal(header.id.fourcc, "data")) ||
+            ((pWav->container == drwav_container_w64) && drwav_guid_equal(header.id.guid, drwavGUID_W64_DATA))) {
+            foundChunk_data = DRWAV_TRUE;
+
+            pWav->dataChunkDataPos  = cursor;
+
+            if (pWav->container != drwav_container_rf64) {  /* The data chunk size for RF64 will always be set to 0xFFFFFFFF here. It was set to it's true value earlier. */
+                dataChunkSize = chunkSize;
+            }
+
+            /* If we're running in sequential mode, or we're not reading metadata, we have enough now that we can get out of the loop. */
+            if (sequential || !isProcessingMetadata) {
+                break;      /* No need to keep reading beyond the data chunk. */
+            } else {
+                chunkSize += header.paddingSize;    /* <-- Make sure we seek past the padding. */
+                if (drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == DRWAV_FALSE) {
+                    break;
+                }
+                cursor += chunkSize;
+
+                continue;   /* There may be some more metadata to read. */
+            }
+        }
+
+        /* "fact". This is optional. Can use this to get the sample count which is useful for compressed formats. For RF64 we retrieved the sample count from the ds64 chunk earlier. */
+        if (((pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx || pWav->container == drwav_container_rf64) && drwav_fourcc_equal(header.id.fourcc, "fact")) ||
+            ((pWav->container == drwav_container_w64) && drwav_guid_equal(header.id.guid, drwavGUID_W64_FACT))) {
+            if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx) {
+                drwav_uint8 sampleCount[4];
+                if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCount, 4, &cursor) != 4) {
+                    return DRWAV_FALSE;
+                }
+
+                chunkSize -= 4;
+
+                /*
+                The sample count in the "fact" chunk is either unreliable, or I'm not understanding it properly. For now I am only enabling this
+                for Microsoft ADPCM formats.
+                */
+                if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+                    sampleCountFromFactChunk = drwav_bytes_to_u32_ex(sampleCount, pWav->container);
+                } else {
+                    sampleCountFromFactChunk = 0;
+                }
+            } else if (pWav->container == drwav_container_w64) {
+                if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCountFromFactChunk, 8, &cursor) != 8) {
+                    return DRWAV_FALSE;
+                }
+
+                chunkSize -= 8;
+            } else if (pWav->container == drwav_container_rf64) {
+                /* We retrieved the sample count from the ds64 chunk earlier so no need to do that here. */
+            }
+
+            /* Seek to the next chunk in preparation for the next iteration. */
+            chunkSize += header.paddingSize;    /* <-- Make sure we seek past the padding. */
+            if (drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == DRWAV_FALSE) {
+                break;
+            }
+            cursor += chunkSize;
+
+            continue;
+        }
+
+
+        /* "COMM". AIFF/AIFC only. */
+        if (pWav->container == drwav_container_aiff && drwav_fourcc_equal(header.id.fourcc, "COMM")) {
+            drwav_uint8 commData[24];
+            drwav_uint32 commDataBytesToRead;
+            drwav_uint16 channels;
+            drwav_uint32 frameCount;
+            drwav_uint16 sampleSizeInBits;
+            drwav_int64  sampleRate;
+            drwav_uint16 compressionFormat;
+
+            foundChunk_fmt = DRWAV_TRUE;
+
+            if (isAIFCFormType) {
+                commDataBytesToRead = 24;
+                if (header.sizeInBytes < commDataBytesToRead) {
+                    return DRWAV_FALSE; /* Invalid COMM chunk. */
+                }
+            } else {
+                commDataBytesToRead = 18;
+                if (header.sizeInBytes != commDataBytesToRead) {
+                    return DRWAV_FALSE; /* INVALID COMM chunk. */
+                }
+            }
+
+            if (drwav__on_read(pWav->onRead, pWav->pUserData, commData, commDataBytesToRead, &cursor) != commDataBytesToRead) {
+                return DRWAV_FALSE;
+            }
+
+
+            channels         = drwav_bytes_to_u16_ex     (commData + 0, pWav->container);
+            frameCount       = drwav_bytes_to_u32_ex     (commData + 2, pWav->container);
+            sampleSizeInBits = drwav_bytes_to_u16_ex     (commData + 6, pWav->container);
+            sampleRate       = drwav_aiff_extented_to_s64(commData + 8);
+
+            if (sampleRate < 0 || sampleRate > 0xFFFFFFFF) {
+                return DRWAV_FALSE; /* Invalid sample rate. */
+            }
+
+            if (isAIFCFormType) {
+                const drwav_uint8* type = commData + 18;
+
+                if (drwav_fourcc_equal(type, "NONE")) {
+                    compressionFormat = DR_WAVE_FORMAT_PCM; /* PCM, big-endian. */
+                } else if (drwav_fourcc_equal(type, "raw ")) {
+                    compressionFormat = DR_WAVE_FORMAT_PCM;
+
+                    /* In my testing, it looks like when the "raw " compression type is used, 8-bit samples should be considered unsigned. */
+                    if (sampleSizeInBits == 8) {
+                        pWav->aiff.isUnsigned = DRWAV_TRUE;
+                    }
+                } else if (drwav_fourcc_equal(type, "sowt")) {
+                    compressionFormat = DR_WAVE_FORMAT_PCM; /* PCM, little-endian. */
+                    pWav->aiff.isLE = DRWAV_TRUE;
+                } else if (drwav_fourcc_equal(type, "fl32") || drwav_fourcc_equal(type, "fl64") || drwav_fourcc_equal(type, "FL32") || drwav_fourcc_equal(type, "FL64")) {
+                    compressionFormat = DR_WAVE_FORMAT_IEEE_FLOAT;
+                } else if (drwav_fourcc_equal(type, "alaw") || drwav_fourcc_equal(type, "ALAW")) {
+                    compressionFormat = DR_WAVE_FORMAT_ALAW;
+                } else if (drwav_fourcc_equal(type, "ulaw") || drwav_fourcc_equal(type, "ULAW")) {
+                    compressionFormat = DR_WAVE_FORMAT_MULAW;
+                } else if (drwav_fourcc_equal(type, "ima4")) {
+                    compressionFormat = DR_WAVE_FORMAT_DVI_ADPCM;
+                    sampleSizeInBits  = 4;
+
+                    /*
+                    I haven't been able to figure out how to get correct decoding for IMA ADPCM. Until this is figured out
+                    we'll need to abort when we encounter such an encoding. Advice welcome!
+                    */
+                    (void)compressionFormat;
+                    (void)sampleSizeInBits;
+
+                    return DRWAV_FALSE;
+                } else {
+                    return DRWAV_FALSE; /* Unknown or unsupported compression format. Need to abort. */
+                }
+            } else {
+                compressionFormat = DR_WAVE_FORMAT_PCM; /* It's a standard AIFF form which is always compressed. */
+            }
+
+            /* With AIFF we want to use the explicitly defined frame count rather than deriving it from the size of the chunk. */
+            aiffFrameCount = frameCount;
+
+            /* We should now have enough information to fill out our fmt structure. */
+            fmt.formatTag      = compressionFormat;
+            fmt.channels       = channels;
+            fmt.sampleRate     = (drwav_uint32)sampleRate;
+            fmt.bitsPerSample  = sampleSizeInBits;
+            fmt.blockAlign     = (drwav_uint16)(fmt.channels * fmt.bitsPerSample / 8);
+            fmt.avgBytesPerSec = fmt.blockAlign * fmt.sampleRate;
+
+            if (fmt.blockAlign == 0 && compressionFormat == DR_WAVE_FORMAT_DVI_ADPCM) {
+                fmt.blockAlign = 34 * fmt.channels;
+            }
+
+            /*
+            Weird one. I've seen some alaw and ulaw encoded files that for some reason set the bits per sample to 16 when
+            it should be 8. To get this working I need to explicitly check for this and change it.
+            */
+            if (compressionFormat == DR_WAVE_FORMAT_ALAW || compressionFormat == DR_WAVE_FORMAT_MULAW) {
+                if (fmt.bitsPerSample > 8) {
+                    fmt.bitsPerSample = 8;
+                    fmt.blockAlign = fmt.channels;
+                }
+            }
+
+            /* In AIFF, samples are padded to 8 byte boundaries. We need to round up our bits per sample here. */
+            fmt.bitsPerSample += (fmt.bitsPerSample & 7);
+
+
+            /* If the form type is AIFC there will be some additional data in the chunk. We need to seek past it. */
+            if (isAIFCFormType) {
+                if (drwav__seek_forward(pWav->onSeek, (chunkSize - commDataBytesToRead), pWav->pUserData) == DRWAV_FALSE) {
+                    return DRWAV_FALSE;
+                }
+                cursor += (chunkSize - commDataBytesToRead);
+            }
+
+            /* Don't fall through or else we'll end up treating this chunk as metadata which is incorrect. */
+            continue;
+        }
+
+
+        /* "SSND". AIFF/AIFC only. This is the AIFF equivalent of the "data" chunk. */
+        if (pWav->container == drwav_container_aiff && drwav_fourcc_equal(header.id.fourcc, "SSND")) {
+            drwav_uint8 offsetAndBlockSizeData[8];
+            drwav_uint32 offset;
+
+            foundChunk_data = DRWAV_TRUE;
+
+            if (drwav__on_read(pWav->onRead, pWav->pUserData, offsetAndBlockSizeData, sizeof(offsetAndBlockSizeData), &cursor) != sizeof(offsetAndBlockSizeData)) {
+                return DRWAV_FALSE;
+            }
+
+            /* The position of the audio data starts at an offset. */
+            offset = drwav_bytes_to_u32_ex(offsetAndBlockSizeData + 0, pWav->container);
+            pWav->dataChunkDataPos = cursor + offset;
+
+            /* The data chunk size needs to be reduced by the offset or else seeking will break. */
+            dataChunkSize = chunkSize;
+            if (dataChunkSize  > offset) {
+                dataChunkSize -= offset;
+            } else {
+                dataChunkSize = 0;
+            }
+
+            if (sequential) {
+                if (foundChunk_fmt) {   /* <-- Name is misleading, but will be set to true if the COMM chunk has been parsed. */
+                    /*
+                    Getting here means we're opening in sequential mode and we've found the SSND (data) and COMM (fmt) chunks. We need
+                    to get out of the loop here or else we'll end up going past the data chunk and will have no way of getting back to
+                    it since we're not allowed to seek backwards.
+
+                    One subtle detail here is that there is an offset with the SSND chunk. We need to make sure we seek past this offset
+                    so we're left sitting on the first byte of actual audio data.
+                    */
+                    if (drwav__seek_forward(pWav->onSeek, offset, pWav->pUserData) == DRWAV_FALSE) {
+                        return DRWAV_FALSE;
+                    }
+                    cursor += offset;
+
+                    break;
+                } else {
+                    /*
+                    Getting here means the COMM chunk was not found. In sequential mode, if we haven't yet found the COMM chunk
+                    we'll need to abort because we can't be doing a backwards seek back to the SSND chunk in order to read the
+                    data. For this reason, this configuration of AIFF files are not supported with sequential mode.
+                    */
+                    return DRWAV_FALSE;
+                }
+            } else {
+                chunkSize += header.paddingSize;                /* <-- Make sure we seek past the padding. */
+                chunkSize -= sizeof(offsetAndBlockSizeData);    /* <-- This was read earlier. */
+
+                if (drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == DRWAV_FALSE) {
+                    break;
+                }
+                cursor += chunkSize;
+
+                continue;   /* There may be some more metadata to read. */
+            }
+        }
+
+
+        /* Getting here means it's not a chunk that we care about internally, but might need to be handled as metadata by the caller. */
+        if (isProcessingMetadata) {
+            drwav__metadata_process_chunk(&metadataParser, &header, drwav_metadata_type_all_including_unknown);
+
+            /* Go back to the start of the chunk so we can normalize the position of the cursor. */
+            if (drwav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData) == DRWAV_FALSE) {
+                break;  /* Failed to seek. Can't reliable read the remaining chunks. Get out. */
+            }
+        }
+
+
+        /* Make sure we skip past the content of this chunk before we go to the next one. */
+        chunkSize += header.paddingSize;    /* <-- Make sure we seek past the padding. */
+        if (drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData) == DRWAV_FALSE) {
+            break;
+        }
+        cursor += chunkSize;
+    }
+
+    /* There's some mandatory chunks that must exist. If they were not found in the iteration above we must abort. */
+    if (!foundChunk_fmt || !foundChunk_data) {
+        return DRWAV_FALSE;
+    }
+
+    /* Basic validation. */
+    if ((fmt.sampleRate    == 0 || fmt.sampleRate    > DRWAV_MAX_SAMPLE_RATE    ) ||
+        (fmt.channels      == 0 || fmt.channels      > DRWAV_MAX_CHANNELS       ) ||
+        (fmt.bitsPerSample == 0 || fmt.bitsPerSample > DRWAV_MAX_BITS_PER_SAMPLE) ||
+        fmt.blockAlign == 0) {
+        return DRWAV_FALSE; /* Probably an invalid WAV file. */
+    }
+
+    /* Translate the internal format. */
+    translatedFormatTag = fmt.formatTag;
+    if (translatedFormatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
+        translatedFormatTag = drwav_bytes_to_u16_ex(fmt.subFormat + 0, pWav->container);
+    }
+
+    /* We may have moved past the data chunk. If so we need to move back. If running in sequential mode we can assume we are already sitting on the data chunk. */
+    if (!sequential) {
+        if (!drwav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData)) {
+            return DRWAV_FALSE;
+        }
+        cursor = pWav->dataChunkDataPos;
+    }
+
+
+    /*
+    At this point we should have done the initial parsing of each of our chunks, but we now need to
+    do a second pass to extract the actual contents of the metadata (the first pass just calculated
+    the length of the memory allocation).
+
+    We only do this if we've actually got metadata to parse.
+    */
+    if (isProcessingMetadata && metadataParser.metadataCount > 0) {
+        if (drwav__seek_from_start(pWav->onSeek, metadataStartPos, pWav->pUserData) == DRWAV_FALSE) {
+            return DRWAV_FALSE;
+        }
+
+        result = drwav__metadata_alloc(&metadataParser, &pWav->allocationCallbacks);
+        if (result != DRWAV_SUCCESS) {
+            return DRWAV_FALSE;
+        }
+
+        metadataParser.stage = drwav__metadata_parser_stage_read;
+
+        for (;;) {
+            drwav_chunk_header header;
+            drwav_uint64 metadataBytesRead;
+
+            result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
+            if (result != DRWAV_SUCCESS) {
+                break;
+            }
+
+            metadataBytesRead = drwav__metadata_process_chunk(&metadataParser, &header, drwav_metadata_type_all_including_unknown);
+
+            /* Move to the end of the chunk so we can keep iterating. */
+            if (drwav__seek_forward(pWav->onSeek, (header.sizeInBytes + header.paddingSize) - metadataBytesRead, pWav->pUserData) == DRWAV_FALSE) {
+                drwav_free(metadataParser.pMetadata, &pWav->allocationCallbacks);
+                return DRWAV_FALSE;
+            }
+        }
+
+        /* Getting here means we're finished parsing the metadata. */
+        pWav->pMetadata     = metadataParser.pMetadata;
+        pWav->metadataCount = metadataParser.metadataCount;
+    }
+
+    /*
+    It's possible for the size reported in the data chunk to be greater than that of the file. We
+    need to do a validation check here to make sure we don't exceed the file size. To skip this
+    check, set the onTell callback to NULL.
+    */
+    if (pWav->onTell != NULL && pWav->onSeek != NULL) {
+        if (pWav->onSeek(pWav->pUserData, 0, DRWAV_SEEK_END) == DRWAV_TRUE) {
+            drwav_int64 fileSize;
+            if (pWav->onTell(pWav->pUserData, &fileSize)) {
+                if (dataChunkSize + pWav->dataChunkDataPos > (drwav_uint64)fileSize) {
+                    dataChunkSize = (drwav_uint64)fileSize - pWav->dataChunkDataPos;
+                }
+            }
+        } else {
+            /*
+            Failed to seek to the end of the file. It might not be supported by the backend so in
+            this case we cannot perform the validation check.
+            */
+        }
+    }
+
+    /*
+    I've seen a WAV file in the wild where a RIFF-encapsulated file has the size of its "RIFF" and
+    "data" chunks set to 0xFFFFFFFF when the file is definitely not that big. In this case we're
+    going to have to calculate the size by reading and discarding bytes, and then seeking back. We
+    cannot do this in sequential mode. We just assume that the rest of the file is audio data.
+    */
+    if (dataChunkSize == 0xFFFFFFFF && (pWav->container == drwav_container_riff || pWav->container == drwav_container_rifx) && pWav->isSequentialWrite == DRWAV_FALSE) {
+        dataChunkSize = 0;
+
+        for (;;) {
+            drwav_uint8 temp[4096];
+            size_t bytesRead = pWav->onRead(pWav->pUserData, temp, sizeof(temp));
+            dataChunkSize += bytesRead;
+
+            if (bytesRead < sizeof(temp)) {
+                break;
+            }
+        }
+    }
+
+    /* At this point we want to be sitting on the first byte of the raw audio data. */
+    if (drwav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData) == DRWAV_FALSE) {
+        drwav_free(pWav->pMetadata, &pWav->allocationCallbacks);
+        return DRWAV_FALSE;
+    }
+
+
+    pWav->fmt                 = fmt;
+    pWav->sampleRate          = fmt.sampleRate;
+    pWav->channels            = fmt.channels;
+    pWav->bitsPerSample       = fmt.bitsPerSample;
+    pWav->translatedFormatTag = translatedFormatTag;
+
+    /*
+    I've had a report where files would start glitching after seeking. The reason for this is the data
+    chunk is not a clean multiple of the PCM frame size in bytes. Where this becomes a problem is when
+    seeking, because the number of bytes remaining in the data chunk is used to calculate the current
+    byte position. If this byte position is not aligned to the number of bytes in a PCM frame, it will
+    result in the seek not being cleanly positioned at the start of the PCM frame thereby resulting in
+    all decoded frames after that being corrupted.
+
+    To address this, we need to round the data chunk size down to the nearest multiple of the frame size.
+    */
+    if (!drwav__is_compressed_format_tag(translatedFormatTag)) {
+        drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
+        if (bytesPerFrame > 0) {
+            dataChunkSize -= (dataChunkSize % bytesPerFrame);
+        }
+    }
+
+    pWav->bytesRemaining      = dataChunkSize;
+    pWav->dataChunkDataSize   = dataChunkSize;
+
+    if (sampleCountFromFactChunk != 0) {
+        pWav->totalPCMFrameCount = sampleCountFromFactChunk;
+    } else if (aiffFrameCount != 0) {
+        pWav->totalPCMFrameCount = aiffFrameCount;
+    } else {
+        drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
+        if (bytesPerFrame == 0) {
+            drwav_free(pWav->pMetadata, &pWav->allocationCallbacks);
+            return DRWAV_FALSE; /* Invalid file. */
+        }
+
+        pWav->totalPCMFrameCount = dataChunkSize / bytesPerFrame;
+
+        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+            drwav_uint64 totalBlockHeaderSizeInBytes;
+            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
+
+            /* Make sure any trailing partial block is accounted for. */
+            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
+                blockCount += 1;
+            }
+
+            /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */
+            totalBlockHeaderSizeInBytes = blockCount * (6*fmt.channels);
+            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
+        }
+        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+            drwav_uint64 totalBlockHeaderSizeInBytes;
+            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
+
+            /* Make sure any trailing partial block is accounted for. */
+            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
+                blockCount += 1;
+            }
+
+            /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */
+            totalBlockHeaderSizeInBytes = blockCount * (4*fmt.channels);
+            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
+
+            /* The header includes a decoded sample for each channel which acts as the initial predictor sample. */
+            pWav->totalPCMFrameCount += blockCount;
+        }
+    }
+
+    /* Some formats only support a certain number of channels. */
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+        if (pWav->channels > 2) {
+            drwav_free(pWav->pMetadata, &pWav->allocationCallbacks);
+            return DRWAV_FALSE;
+        }
+    }
+
+    /* The number of bytes per frame must be known. If not, it's an invalid file and not decodable. */
+    if (drwav_get_bytes_per_pcm_frame(pWav) == 0) {
+        drwav_free(pWav->pMetadata, &pWav->allocationCallbacks);
+        return DRWAV_FALSE;
+    }
+
+#ifdef DR_WAV_LIBSNDFILE_COMPAT
+    /*
+    I use libsndfile as a benchmark for testing, however in the version I'm using (from the Windows installer on the libsndfile website),
+    it appears the total sample count libsndfile uses for MS-ADPCM is incorrect. It would seem they are computing the total sample count
+    from the number of blocks, however this results in the inclusion of extra silent samples at the end of the last block. The correct
+    way to know the total sample count is to inspect the "fact" chunk, which should always be present for compressed formats, and should
+    always include the sample count. This little block of code below is only used to emulate the libsndfile logic so I can properly run my
+    correctness tests against libsndfile, and is disabled by default.
+    */
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+        drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
+        pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2)) / fmt.channels;  /* x2 because two samples per byte. */
+    }
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+        drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
+        pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels)) / fmt.channels;
+    }
+#endif
+
+    return DRWAV_TRUE;
+}
+
+/*
+Initializes a drwav object using the supplied read/seek/tell callbacks.
+
+This is a thin wrapper around drwav_init_ex() which passes no chunk callback (NULL), no chunk
+user data (NULL) and no flags (0). The result of drwav_init_ex() is returned unchanged.
+*/
+DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_bool32 initResult;
+
+    initResult = drwav_init_ex(pWav, onRead, onSeek, onTell, NULL, pUserData, NULL, 0, pAllocationCallbacks);
+    return initResult;
+}
+
+/*
+Initializes a drwav object, additionally accepting a chunk callback, its user data and flags.
+
+drwav_preinit() is run first with the read/seek/tell callbacks, their user data and the
+allocation callbacks. If that step reports failure, DRWAV_FALSE is returned. Otherwise the
+result of drwav_init__internal(), which receives onChunk, pChunkUserData and flags, is returned.
+*/
+DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, drwav_chunk_proc onChunk, void* pReadSeekTellUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (drwav_preinit(pWav, onRead, onSeek, onTell, pReadSeekTellUserData, pAllocationCallbacks)) {
+        /* Pre-initialization succeeded, so the actual parsing can go ahead. */
+        return drwav_init__internal(pWav, onChunk, pChunkUserData, flags);
+    }
+
+    return DRWAV_FALSE;
+}
+
+/*
+Initializes a drwav object with metadata processing requested.
+
+drwav_preinit() is run first; if it reports failure, DRWAV_FALSE is returned. Otherwise
+drwav_init__internal() is called with no chunk callback and with DRWAV_WITH_METADATA OR'd into
+the caller's flags, and its result is returned.
+*/
+DRWAV_API drwav_bool32 drwav_init_with_metadata(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (drwav_preinit(pWav, onRead, onSeek, onTell, pUserData, pAllocationCallbacks)) {
+        /* Force the metadata flag on, regardless of what the caller passed in. */
+        return drwav_init__internal(pWav, NULL, NULL, flags | DRWAV_WITH_METADATA);
+    }
+
+    return DRWAV_FALSE;
+}
+
+/*
+Detaches the metadata array from the drwav object and hands it to the caller.
+
+Returns the current value of pWav->pMetadata (which may be NULL), then resets pWav->pMetadata to
+NULL and pWav->metadataCount to 0. Because the count is cleared as well, read
+pWav->metadataCount before calling this if the number of entries is needed.
+
+Returns NULL if pWav is NULL.
+
+NOTE(review): the initialization code releases this array with drwav_free() and the object's
+allocation callbacks, so the caller presumably has to do the same once finished - confirm.
+*/
+DRWAV_API drwav_metadata* drwav_take_ownership_of_metadata(drwav* pWav)
+{
+    drwav_metadata* pTakenMetadata;
+
+    /* Guard against a NULL object instead of dereferencing it. */
+    if (pWav == NULL) {
+        return NULL;
+    }
+
+    pTakenMetadata = pWav->pMetadata;
+
+    pWav->pMetadata     = NULL;
+    pWav->metadataCount = 0;
+
+    return pTakenMetadata;
+}
+
+
+/*
+Generic write: passes dataSize bytes from pData straight to the onWrite callback with no byte
+reordering. Returns whatever the callback returns. Both pWav and pWav->onWrite must be non-NULL.
+*/
+DRWAV_PRIVATE size_t drwav__write(drwav* pWav, const void* pData, size_t dataSize)
+{
+    size_t bytesWritten;
+
+    DRWAV_ASSERT(pWav          != NULL);
+    DRWAV_ASSERT(pWav->onWrite != NULL);
+
+    bytesWritten = pWav->onWrite(pWav->pUserData, pData, dataSize);
+    return bytesWritten;
+}
+
+/*
+Writes a single byte through the onWrite callback and returns the callback's result. Both pWav
+and pWav->onWrite must be non-NULL.
+*/
+DRWAV_PRIVATE size_t drwav__write_byte(drwav* pWav, drwav_uint8 byte)
+{
+    size_t bytesWritten;
+
+    DRWAV_ASSERT(pWav          != NULL);
+    DRWAV_ASSERT(pWav->onWrite != NULL);
+
+    bytesWritten = pWav->onWrite(pWav->pUserData, &byte, 1);
+    return bytesWritten;
+}
+
+/*
+Writes a native-endian 16-bit value in little-endian byte order. Returns the result of
+drwav__write() for the 2 bytes.
+*/
+DRWAV_PRIVATE size_t drwav__write_u16ne_to_le(drwav* pWav, drwav_uint16 value)
+{
+    drwav_uint16 valueLE = value;
+
+    DRWAV_ASSERT(pWav          != NULL);
+    DRWAV_ASSERT(pWav->onWrite != NULL);
+
+    /* When the host is not little-endian the bytes are swapped before writing. */
+    if (!drwav__is_little_endian()) {
+        valueLE = drwav__bswap16(value);
+    }
+
+    return drwav__write(pWav, &valueLE, 2);
+}
+
+/*
+Writes a native-endian 32-bit value in little-endian byte order. Returns the result of
+drwav__write() for the 4 bytes.
+*/
+DRWAV_PRIVATE size_t drwav__write_u32ne_to_le(drwav* pWav, drwav_uint32 value)
+{
+    drwav_uint32 valueLE = value;
+
+    DRWAV_ASSERT(pWav          != NULL);
+    DRWAV_ASSERT(pWav->onWrite != NULL);
+
+    /* When the host is not little-endian the bytes are swapped before writing. */
+    if (!drwav__is_little_endian()) {
+        valueLE = drwav__bswap32(value);
+    }
+
+    return drwav__write(pWav, &valueLE, 4);
+}
+
+/*
+Writes a native-endian 64-bit value in little-endian byte order. Returns the result of
+drwav__write() for the 8 bytes.
+*/
+DRWAV_PRIVATE size_t drwav__write_u64ne_to_le(drwav* pWav, drwav_uint64 value)
+{
+    drwav_uint64 valueLE = value;
+
+    DRWAV_ASSERT(pWav          != NULL);
+    DRWAV_ASSERT(pWav->onWrite != NULL);
+
+    /* When the host is not little-endian the bytes are swapped before writing. */
+    if (!drwav__is_little_endian()) {
+        valueLE = drwav__bswap64(value);
+    }
+
+    return drwav__write(pWav, &valueLE, 8);
+}
+
+/*
+Writes a native-endian 32-bit float in little-endian byte order. The float's bit pattern is
+accessed through a union so it can be byte-swapped as a 32-bit integer when the host is not
+little-endian. Returns the result of drwav__write() for the 4 bytes.
+*/
+DRWAV_PRIVATE size_t drwav__write_f32ne_to_le(drwav* pWav, float value)
+{
+    union {
+       drwav_uint32 asU32;
+       float asF32;
+    } bits;
+
+    DRWAV_ASSERT(pWav          != NULL);
+    DRWAV_ASSERT(pWav->onWrite != NULL);
+
+    /* Store as a float, then operate on the same storage as an integer. */
+    bits.asF32 = value;
+    if (!drwav__is_little_endian()) {
+        bits.asU32 = drwav__bswap32(bits.asU32);
+    }
+
+    return drwav__write(pWav, &bits.asU32, 4);
+}
+
+/*
+Writes dataSize bytes via drwav__write(), or, when pWav is NULL, writes nothing and simply
+reports dataSize as the number of bytes that would have been written.
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count(drwav* pWav, const void* pData, size_t dataSize)
+{
+    return (pWav != NULL) ? drwav__write(pWav, pData, dataSize) : dataSize;
+}
+
+/*
+Writes one byte via drwav__write_byte(), or, when pWav is NULL, writes nothing and returns 1
+(the size that would have been written).
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count_byte(drwav* pWav, drwav_uint8 byte)
+{
+    return (pWav != NULL) ? drwav__write_byte(pWav, byte) : 1;
+}
+
+/*
+Writes a 16-bit value via drwav__write_u16ne_to_le(), or, when pWav is NULL, writes nothing and
+returns 2 (the size that would have been written).
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count_u16ne_to_le(drwav* pWav, drwav_uint16 value)
+{
+    return (pWav != NULL) ? drwav__write_u16ne_to_le(pWav, value) : 2;
+}
+
+/*
+Writes a 32-bit value via drwav__write_u32ne_to_le(), or, when pWav is NULL, writes nothing and
+returns 4 (the size that would have been written).
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count_u32ne_to_le(drwav* pWav, drwav_uint32 value)
+{
+    return (pWav != NULL) ? drwav__write_u32ne_to_le(pWav, value) : 4;
+}
+
+#if 0   /* Unused for now. */
+/*
+Writes a 64-bit value via drwav__write_u64ne_to_le(), or, when pWav is NULL, writes nothing and
+returns 8 (the size that would have been written). Currently compiled out.
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count_u64ne_to_le(drwav* pWav, drwav_uint64 value)
+{
+    return (pWav != NULL) ? drwav__write_u64ne_to_le(pWav, value) : 8;
+}
+#endif
+
+/*
+Writes a 32-bit float via drwav__write_f32ne_to_le(), or, when pWav is NULL, writes nothing and
+returns 4 (the size that would have been written).
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count_f32ne_to_le(drwav* pWav, float value)
+{
+    return (pWav != NULL) ? drwav__write_f32ne_to_le(pWav, value) : 4;
+}
+
+/*
+Writes str into a fixed-size field of bufFixedSize bytes, or just counts the bytes when pWav is
+NULL.
+
+The number of string bytes written comes from drwav__strlen_clamped(str, bufFixedSize)
+(presumably the string length capped at bufFixedSize - confirm against that helper). Any bytes
+of the field not covered by the string are written as zeros.
+
+Always returns bufFixedSize; the results of the individual writes are not checked.
+*/
+DRWAV_PRIVATE size_t drwav__write_or_count_string_to_fixed_size_buf(drwav* pWav, char* str, size_t bufFixedSize)
+{
+    size_t strLen;
+    size_t iByte;
+
+    /* Counting mode: nothing is written, the field always occupies its full size. */
+    if (pWav == NULL) {
+        return bufFixedSize;
+    }
+
+    strLen = drwav__strlen_clamped(str, bufFixedSize);
+    drwav__write_or_count(pWav, str, strLen);
+
+    /* Zero-fill whatever is left of the field after the string bytes. */
+    for (iByte = strLen; iByte < bufFixedSize; ++iByte) {
+        drwav__write_byte(pWav, 0);
+    }
+
+    return bufFixedSize;
+}
+
+
+/* pWav can be NULL meaning just count the bytes that would be written. */
+DRWAV_PRIVATE size_t drwav__write_or_count_metadata(drwav* pWav, drwav_metadata* pMetadatas, drwav_uint32 metadataCount)
+{
+    size_t bytesWritten = 0;
+    drwav_bool32 hasListAdtl = DRWAV_FALSE;
+    drwav_bool32 hasListInfo = DRWAV_FALSE;
+    drwav_uint32 iMetadata;
+
+    if (pMetadatas == NULL || metadataCount == 0) {
+        return 0;
+    }
+
+    for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
+        drwav_metadata* pMetadata = &pMetadatas[iMetadata];
+        drwav_uint32 chunkSize = 0;
+
+        if ((pMetadata->type & drwav_metadata_type_list_all_info_strings) || (pMetadata->type == drwav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == drwav_metadata_location_inside_info_list)) {
+            hasListInfo = DRWAV_TRUE;
+        }
+
+        if ((pMetadata->type & drwav_metadata_type_list_all_adtl) || (pMetadata->type == drwav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == drwav_metadata_location_inside_adtl_list)) {
+            hasListAdtl = DRWAV_TRUE;
+        }
+
+        switch (pMetadata->type) {
+            case drwav_metadata_type_smpl:
+            {
+                drwav_uint32 iLoop;
+
+                chunkSize = DRWAV_SMPL_BYTES + DRWAV_SMPL_LOOP_BYTES * pMetadata->data.smpl.sampleLoopCount + pMetadata->data.smpl.samplerSpecificDataSizeInBytes;
+
+                bytesWritten += drwav__write_or_count(pWav, "smpl", 4);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.manufacturerId);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.productId);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.samplePeriodNanoseconds);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.midiUnityNote);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.midiPitchFraction);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.smpteFormat);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.smpteOffset);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.sampleLoopCount);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.samplerSpecificDataSizeInBytes);
+
+                for (iLoop = 0; iLoop < pMetadata->data.smpl.sampleLoopCount; ++iLoop) {
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].cuePointId);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].type);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].firstSampleOffset);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].lastSampleOffset);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].sampleFraction);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.smpl.pLoops[iLoop].playCount);
+                }
+
+                if (pMetadata->data.smpl.samplerSpecificDataSizeInBytes > 0) {
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.smpl.pSamplerSpecificData, pMetadata->data.smpl.samplerSpecificDataSizeInBytes);
+                }
+            } break;
+
+            case drwav_metadata_type_inst:
+            {
+                chunkSize = DRWAV_INST_BYTES;
+
+                bytesWritten += drwav__write_or_count(pWav, "inst", 4);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.midiUnityNote, 1);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.fineTuneCents, 1);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.gainDecibels, 1);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.lowNote, 1);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.highNote, 1);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.lowVelocity, 1);
+                bytesWritten += drwav__write_or_count(pWav, &pMetadata->data.inst.highVelocity, 1);
+            } break;
+
+            case drwav_metadata_type_cue:
+            {
+                drwav_uint32 iCuePoint;
+
+                chunkSize = DRWAV_CUE_BYTES + DRWAV_CUE_POINT_BYTES * pMetadata->data.cue.cuePointCount;
+
+                bytesWritten += drwav__write_or_count(pWav, "cue ", 4);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.cuePointCount);
+                for (iCuePoint = 0; iCuePoint < pMetadata->data.cue.cuePointCount; ++iCuePoint) {
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].id);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].playOrderPosition);
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].dataChunkId, 4);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].chunkStart);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].blockStart);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.cue.pCuePoints[iCuePoint].sampleOffset);
+                }
+            } break;
+
+            case drwav_metadata_type_acid:
+            {
+                chunkSize = DRWAV_ACID_BYTES;
+
+                bytesWritten += drwav__write_or_count(pWav, "acid", 4);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.acid.flags);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.midiUnityNote);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.reserved1);
+                bytesWritten += drwav__write_or_count_f32ne_to_le(pWav, pMetadata->data.acid.reserved2);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.acid.numBeats);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.meterDenominator);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.acid.meterNumerator);
+                bytesWritten += drwav__write_or_count_f32ne_to_le(pWav, pMetadata->data.acid.tempo);
+            } break;
+
+            case drwav_metadata_type_bext:
+            {
+                char reservedBuf[DRWAV_BEXT_RESERVED_BYTES];
+                drwav_uint32 timeReferenceLow;
+                drwav_uint32 timeReferenceHigh;
+
+                chunkSize = DRWAV_BEXT_BYTES + pMetadata->data.bext.codingHistorySize;
+
+                bytesWritten += drwav__write_or_count(pWav, "bext", 4);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+
+                bytesWritten += drwav__write_or_count_string_to_fixed_size_buf(pWav, pMetadata->data.bext.pDescription, DRWAV_BEXT_DESCRIPTION_BYTES);
+                bytesWritten += drwav__write_or_count_string_to_fixed_size_buf(pWav, pMetadata->data.bext.pOriginatorName, DRWAV_BEXT_ORIGINATOR_NAME_BYTES);
+                bytesWritten += drwav__write_or_count_string_to_fixed_size_buf(pWav, pMetadata->data.bext.pOriginatorReference, DRWAV_BEXT_ORIGINATOR_REF_BYTES);
+                bytesWritten += drwav__write_or_count(pWav, pMetadata->data.bext.pOriginationDate, sizeof(pMetadata->data.bext.pOriginationDate));
+                bytesWritten += drwav__write_or_count(pWav, pMetadata->data.bext.pOriginationTime, sizeof(pMetadata->data.bext.pOriginationTime));
+
+                timeReferenceLow  = (drwav_uint32)(pMetadata->data.bext.timeReference & 0xFFFFFFFF);
+                timeReferenceHigh = (drwav_uint32)(pMetadata->data.bext.timeReference >> 32);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, timeReferenceLow);
+                bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, timeReferenceHigh);
+
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.version);
+                bytesWritten += drwav__write_or_count(pWav, pMetadata->data.bext.pUMID, DRWAV_BEXT_UMID_BYTES);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.loudnessValue);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.loudnessRange);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.maxTruePeakLevel);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.maxMomentaryLoudness);
+                bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.bext.maxShortTermLoudness);
+
+                DRWAV_ZERO_MEMORY(reservedBuf, sizeof(reservedBuf));
+                bytesWritten += drwav__write_or_count(pWav, reservedBuf, sizeof(reservedBuf));
+
+                if (pMetadata->data.bext.codingHistorySize > 0) {
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.bext.pCodingHistory, pMetadata->data.bext.codingHistorySize);
+                }
+            } break;
+
+            case drwav_metadata_type_unknown:
+            {
+                if (pMetadata->data.unknown.chunkLocation == drwav_metadata_location_top_level) {
+                    chunkSize = pMetadata->data.unknown.dataSizeInBytes;
+
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.unknown.id, 4);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.unknown.pData, pMetadata->data.unknown.dataSizeInBytes);
+                }
+            } break;
+
+            default: break;
+        }
+        if ((chunkSize % 2) != 0) {
+            bytesWritten += drwav__write_or_count_byte(pWav, 0);
+        }
+    }
+
+    if (hasListInfo) {
+        drwav_uint32 chunkSize = 4; /* Start with 4 bytes for "INFO". */
+        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
+            drwav_metadata* pMetadata = &pMetadatas[iMetadata];
+
+            if ((pMetadata->type & drwav_metadata_type_list_all_info_strings)) {
+                chunkSize += 8; /* For id and string size. */
+                chunkSize += pMetadata->data.infoText.stringLength + 1; /* Include null terminator. */
+            } else if (pMetadata->type == drwav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == drwav_metadata_location_inside_info_list) {
+                chunkSize += 8; /* For id string size. */
+                chunkSize += pMetadata->data.unknown.dataSizeInBytes;
+            }
+
+            if ((chunkSize % 2) != 0) {
+                chunkSize += 1;
+            }
+        }
+
+        bytesWritten += drwav__write_or_count(pWav, "LIST", 4);
+        bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+        bytesWritten += drwav__write_or_count(pWav, "INFO", 4);
+
+        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
+            drwav_metadata* pMetadata = &pMetadatas[iMetadata];
+            drwav_uint32 subchunkSize = 0;
+
+            if (pMetadata->type & drwav_metadata_type_list_all_info_strings) {
+                const char* pID = NULL;
+
+                switch (pMetadata->type) {
+                    case drwav_metadata_type_list_info_software:     pID = "ISFT"; break;
+                    case drwav_metadata_type_list_info_copyright:    pID = "ICOP"; break;
+                    case drwav_metadata_type_list_info_title:        pID = "INAM"; break;
+                    case drwav_metadata_type_list_info_artist:       pID = "IART"; break;
+                    case drwav_metadata_type_list_info_comment:      pID = "ICMT"; break;
+                    case drwav_metadata_type_list_info_date:         pID = "ICRD"; break;
+                    case drwav_metadata_type_list_info_genre:        pID = "IGNR"; break;
+                    case drwav_metadata_type_list_info_album:        pID = "IPRD"; break;
+                    case drwav_metadata_type_list_info_tracknumber:  pID = "ITRK"; break;
+                    case drwav_metadata_type_list_info_location:     pID = "IARL"; break;
+                    case drwav_metadata_type_list_info_organization: pID = "ICMS"; break;
+                    case drwav_metadata_type_list_info_keywords:     pID = "IKEY"; break;
+                    case drwav_metadata_type_list_info_medium:       pID = "IMED"; break;
+                    case drwav_metadata_type_list_info_description:  pID = "ISBJ"; break;
+                    default: break;
+                }
+
+                DRWAV_ASSERT(pID != NULL);
+
+                if (pMetadata->data.infoText.stringLength) {
+                    subchunkSize = pMetadata->data.infoText.stringLength + 1;
+                    bytesWritten += drwav__write_or_count(pWav, pID, 4);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, subchunkSize);
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.infoText.pString, pMetadata->data.infoText.stringLength);
+                    bytesWritten += drwav__write_or_count_byte(pWav, '\0');
+                }
+            } else if (pMetadata->type == drwav_metadata_type_unknown && pMetadata->data.unknown.chunkLocation == drwav_metadata_location_inside_info_list) {
+                if (pMetadata->data.unknown.dataSizeInBytes) {
+                    subchunkSize = pMetadata->data.unknown.dataSizeInBytes;
+
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.unknown.id, 4);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.unknown.dataSizeInBytes);
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.unknown.pData, subchunkSize);
+                }
+            }
+
+            if ((subchunkSize % 2) != 0) {
+                bytesWritten += drwav__write_or_count_byte(pWav, 0);
+            }
+        }
+    }
+
+    if (hasListAdtl) {
+        drwav_uint32 chunkSize = 4; /* start with 4 bytes for "adtl" */
+
+        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
+            drwav_metadata* pMetadata = &pMetadatas[iMetadata];
+
+            switch (pMetadata->type)
+            {
+                case drwav_metadata_type_list_label:
+                case drwav_metadata_type_list_note:
+                {
+                    chunkSize += 8; /* for id and chunk size */
+                    chunkSize += DRWAV_LIST_LABEL_OR_NOTE_BYTES;
+
+                    if (pMetadata->data.labelOrNote.stringLength > 0) {
+                        chunkSize += pMetadata->data.labelOrNote.stringLength + 1;
+                    }
+                } break;
+
+                case drwav_metadata_type_list_labelled_cue_region:
+                {
+                    chunkSize += 8; /* for id and chunk size */
+                    chunkSize += DRWAV_LIST_LABELLED_TEXT_BYTES;
+
+                    if (pMetadata->data.labelledCueRegion.stringLength > 0) {
+                        chunkSize += pMetadata->data.labelledCueRegion.stringLength + 1;
+                    }
+                } break;
+
+                case drwav_metadata_type_unknown:
+                {
+                    if (pMetadata->data.unknown.chunkLocation == drwav_metadata_location_inside_adtl_list) {
+                        chunkSize += 8; /* for id and chunk size */
+                        chunkSize += pMetadata->data.unknown.dataSizeInBytes;
+                    }
+                } break;
+
+                default: break;
+            }
+
+            if ((chunkSize % 2) != 0) {
+                chunkSize += 1;
+            }
+        }
+
+        bytesWritten += drwav__write_or_count(pWav, "LIST", 4);
+        bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, chunkSize);
+        bytesWritten += drwav__write_or_count(pWav, "adtl", 4);
+
+        for (iMetadata = 0; iMetadata < metadataCount; ++iMetadata) {
+            drwav_metadata* pMetadata = &pMetadatas[iMetadata];
+            drwav_uint32 subchunkSize = 0;
+
+            switch (pMetadata->type)
+            {
+                case drwav_metadata_type_list_label:
+                case drwav_metadata_type_list_note:
+                {
+                    if (pMetadata->data.labelOrNote.stringLength > 0) {
+                        const char *pID = NULL;
+
+                        if (pMetadata->type == drwav_metadata_type_list_label) {
+                            pID = "labl";
+                        }
+                        else if (pMetadata->type == drwav_metadata_type_list_note) {
+                            pID = "note";
+                        }
+
+                        DRWAV_ASSERT(pID != NULL);
+                        DRWAV_ASSERT(pMetadata->data.labelOrNote.pString != NULL);
+
+                        subchunkSize = DRWAV_LIST_LABEL_OR_NOTE_BYTES;
+
+                        bytesWritten += drwav__write_or_count(pWav, pID, 4);
+                        subchunkSize += pMetadata->data.labelOrNote.stringLength + 1;
+                        bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, subchunkSize);
+
+                        bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.labelOrNote.cuePointId);
+                        bytesWritten += drwav__write_or_count(pWav, pMetadata->data.labelOrNote.pString, pMetadata->data.labelOrNote.stringLength);
+                        bytesWritten += drwav__write_or_count_byte(pWav, '\0');
+                    }
+                } break;
+
+                case drwav_metadata_type_list_labelled_cue_region:
+                {
+                    subchunkSize = DRWAV_LIST_LABELLED_TEXT_BYTES;
+
+                    bytesWritten += drwav__write_or_count(pWav, "ltxt", 4);
+                    if (pMetadata->data.labelledCueRegion.stringLength > 0) {
+                        subchunkSize += pMetadata->data.labelledCueRegion.stringLength + 1;
+                    }
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, subchunkSize);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.labelledCueRegion.cuePointId);
+                    bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, pMetadata->data.labelledCueRegion.sampleLength);
+                    bytesWritten += drwav__write_or_count(pWav, pMetadata->data.labelledCueRegion.purposeId, 4);
+                    bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.country);
+                    bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.language);
+                    bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.dialect);
+                    bytesWritten += drwav__write_or_count_u16ne_to_le(pWav, pMetadata->data.labelledCueRegion.codePage);
+
+                    if (pMetadata->data.labelledCueRegion.stringLength > 0) {
+                        DRWAV_ASSERT(pMetadata->data.labelledCueRegion.pString != NULL);
+
+                        bytesWritten += drwav__write_or_count(pWav, pMetadata->data.labelledCueRegion.pString, pMetadata->data.labelledCueRegion.stringLength);
+                        bytesWritten += drwav__write_or_count_byte(pWav, '\0');
+                    }
+                } break;
+
+                case drwav_metadata_type_unknown:
+                {
+                    if (pMetadata->data.unknown.chunkLocation == drwav_metadata_location_inside_adtl_list) {
+                        subchunkSize = pMetadata->data.unknown.dataSizeInBytes;
+
+                        DRWAV_ASSERT(pMetadata->data.unknown.pData != NULL);
+                        bytesWritten += drwav__write_or_count(pWav, pMetadata->data.unknown.id, 4);
+                        bytesWritten += drwav__write_or_count_u32ne_to_le(pWav, subchunkSize);
+                        bytesWritten += drwav__write_or_count(pWav, pMetadata->data.unknown.pData, subchunkSize);
+                    }
+                } break;
+
+                default: break;
+            }
+
+            if ((subchunkSize % 2) != 0) {
+                bytesWritten += drwav__write_or_count_byte(pWav, 0);
+            }
+        }
+    }
+
+    DRWAV_ASSERT((bytesWritten % 2) == 0);
+
+    return bytesWritten;
+}
+
+/*
+Computes the value for the 32-bit size field of the "RIFF" chunk of a standard WAV file. The metadata size is obtained by calling
+drwav__write_or_count_metadata() with NULL as the drwav object. A total that does not fit in 32 bits is clamped to 0xFFFFFFFF.
+*/
+DRWAV_PRIVATE drwav_uint32 drwav__riff_chunk_size_riff(drwav_uint64 dataChunkSize, drwav_metadata* pMetadata, drwav_uint32 metadataCount)
+{
+    drwav_uint64 riffSize;
+
+    riffSize  = 4;                                                                                      /* "WAVE". */
+    riffSize += 24;                                                                                     /* "fmt " chunk. */
+    riffSize += (drwav_uint64)drwav__write_or_count_metadata(NULL, pMetadata, metadataCount);           /* Metadata chunks. */
+    riffSize += 8;                                                                                      /* "data" + u32 data size. */
+    riffSize += dataChunkSize;
+    riffSize += drwav__chunk_padding_size_riff(dataChunkSize);
+
+    /* The cast is safe on the second branch because anything larger has already been replaced by the clamp value. */
+    return (riffSize > 0xFFFFFFFFUL) ? (drwav_uint32)0xFFFFFFFFUL : (drwav_uint32)riffSize;
+}
+
+/*
+Converts a 64-bit data size into the value for the 32-bit size field of a RIFF "data" chunk. Sizes that do not fit are reported
+as 0xFFFFFFFF.
+*/
+DRWAV_PRIVATE drwav_uint32 drwav__data_chunk_size_riff(drwav_uint64 dataChunkSize)
+{
+    /* Too large for a 32-bit field: saturate. */
+    if (dataChunkSize > 0xFFFFFFFFUL) {
+        return 0xFFFFFFFFUL;
+    }
+
+    return (drwav_uint32)dataChunkSize;
+}
+
+/*
+Computes the size of the top-level W64 chunk for the given amount of sample data, including the padding reported by
+drwav__chunk_padding_size_w64().
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__riff_chunk_size_w64(drwav_uint64 dataChunkSize)
+{
+    drwav_uint64 riffSize = 80 + 24;    /* +24 because W64 includes the size of the GUID and size fields. */
+
+    riffSize += dataChunkSize;
+    riffSize += drwav__chunk_padding_size_w64(dataChunkSize);
+
+    return riffSize;
+}
+
+/* Computes the size value of a W64 data chunk: the sample data plus the 24 bytes taken by the GUID and size fields. */
+DRWAV_PRIVATE drwav_uint64 drwav__data_chunk_size_w64(drwav_uint64 dataChunkSize)
+{
+    drwav_uint64 sizeWithHeader = dataChunkSize;
+
+    sizeWithHeader += 24;   /* W64 sizes include the GUID and size fields themselves. */
+
+    return sizeWithHeader;
+}
+
+/*
+Computes the 64-bit RIFF chunk size of an RF64 file for the given amount of sample data and metadata. The metadata size is
+obtained by calling drwav__write_or_count_metadata() with NULL as the drwav object.
+
+The result is deliberately not clamped to 0xFFFFFFFF. RF64 always writes 0xFFFFFFFF into the 32-bit header fields and keeps the
+real sizes in 64-bit fields of the "ds64" chunk (see drwav_init_write__internal()), so the value returned here must stay exact
+for files larger than 4 GiB. Clamping it would make the reported RIFF size smaller than the data size returned by
+drwav__data_chunk_size_rf64().
+*/
+DRWAV_PRIVATE drwav_uint64 drwav__riff_chunk_size_rf64(drwav_uint64 dataChunkSize, drwav_metadata *metadata, drwav_uint32 numMetadata)
+{
+    drwav_uint64 chunkSize = 4 + 36 + 24 + (drwav_uint64)drwav__write_or_count_metadata(NULL, metadata, numMetadata) + 8 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 36 = "ds64" chunk. 24 = "fmt " chunk. 8 = "data" + u32 data size. */
+
+    return chunkSize;
+}
+
+/* Returns the data chunk size for RF64. The size is 64-bit, so the input is passed through without any adjustment. */
+DRWAV_PRIVATE drwav_uint64 drwav__data_chunk_size_rf64(drwav_uint64 dataChunkSize)
+{
+    drwav_uint64 rf64DataSize = dataChunkSize;
+
+    return rf64DataSize;
+}
+
+
+
+/*
+Validates the arguments of a write initialization and resets pWav to a known state.
+
+Returns DRWAV_FALSE when pWav, pFormat or onWrite is NULL, when onSeek is NULL in non-sequential mode, when the requested format
+is WAVE_FORMAT_EXTENSIBLE, ADPCM or DVI ADPCM, or when the resolved allocation callbacks are unusable. On success pWav is zeroed,
+the callbacks and user data are stored, the "fmt" fields are derived from pFormat and DRWAV_TRUE is returned.
+
+Because pWav is zeroed here, anything the caller wants to attach to it (such as metadata) must be set after this call.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_preinit_write(drwav* pWav, const drwav_data_format* pFormat, drwav_bool32 isSequential, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    /* pFormat is dereferenced below and not every public entry point validates it, so it is checked here as well. */
+    if (pWav == NULL || pFormat == NULL || onWrite == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    if (!isSequential && onSeek == NULL) {
+        return DRWAV_FALSE; /* <-- onSeek is required when in non-sequential mode. */
+    }
+
+    /* Writing the extensible format is not supported. */
+    if (pFormat->format == DR_WAVE_FORMAT_EXTENSIBLE) {
+        return DRWAV_FALSE;
+    }
+
+    /* Not currently supporting compressed formats. Will need to add support for the "fact" chunk before we enable this. */
+    if (pFormat->format == DR_WAVE_FORMAT_ADPCM || pFormat->format == DR_WAVE_FORMAT_DVI_ADPCM) {
+        return DRWAV_FALSE;
+    }
+
+    DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav));
+    pWav->onWrite   = onWrite;
+    pWav->onSeek    = onSeek;
+    pWav->pUserData = pUserData;
+    pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
+
+    /* A free callback is mandatory, together with at least one of malloc or realloc. */
+    if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) {
+        return DRWAV_FALSE;    /* Invalid allocation callbacks. */
+    }
+
+    pWav->fmt.formatTag = (drwav_uint16)pFormat->format;
+    pWav->fmt.channels = (drwav_uint16)pFormat->channels;
+    pWav->fmt.sampleRate = pFormat->sampleRate;
+    pWav->fmt.avgBytesPerSec = (drwav_uint32)((pFormat->bitsPerSample * pFormat->sampleRate * pFormat->channels) / 8);
+    pWav->fmt.blockAlign = (drwav_uint16)((pFormat->channels * pFormat->bitsPerSample) / 8);
+    pWav->fmt.bitsPerSample = (drwav_uint16)pFormat->bitsPerSample;
+    pWav->fmt.extendedSize = 0;
+    pWav->isSequentialWrite = isSequential;
+
+    return DRWAV_TRUE;
+}
+
+
+DRWAV_PRIVATE drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount)
+{
+    /*
+    Emits the file header for a new write session: the container header, the "ds64" chunk (RF64 only), the "fmt " chunk, the
+    metadata chunks (non-sequential RIFF/RF64 only) and the "data" chunk header, in that order. Afterwards the format properties
+    and the accumulated write position are stored on pWav.
+
+    Returns DRWAV_FALSE, before anything has been written, when a sequential RIFF write would need more than (0xFFFFFFFF - 36)
+    bytes of sample data or when the container is not RIFF, W64 or RF64. Returns DRWAV_TRUE otherwise. The values returned by the
+    write helpers are only accumulated into runningPos; they are not checked against the expected sizes.
+
+    The function assumes drwav_preinit_write() was called beforehand.
+    */
+
+    size_t runningPos = 0;                      /* Sum of the values returned by the write helpers so far. */
+    drwav_uint64 initialDataChunkSize = 0;      /* Stays 0 unless writing sequentially. */
+    drwav_uint64 chunkSizeFMT;                  /* Assigned in the "fmt " section: 16 for RIFF/RF64, 40 for W64. */
+
+    /*
+    The initial values for the "RIFF" and "data" chunks depends on whether or not we are initializing in sequential mode or not. In
+    sequential mode we set this to its final values straight away since they can be calculated from the total sample count. In non-
+    sequential mode we initialize it all to zero and fill it out in drwav_uninit() using a backwards seek.
+    */
+    if (pWav->isSequentialWrite) {
+        initialDataChunkSize = (totalSampleCount * pWav->fmt.bitsPerSample) / 8;    /* Bits to bytes. */
+
+        /*
+        The RIFF container has a limit on the number of samples. drwav is not allowing this. There's no practical limits for Wave64
+        so for the sake of simplicity I'm not doing any validation for that.
+        */
+        if (pFormat->container == drwav_container_riff) {
+            if (initialDataChunkSize > (0xFFFFFFFFUL - 36)) {   /* 36 is the same header allowance added to chunkSizeRIFF below. */
+                return DRWAV_FALSE; /* Not enough room to store every sample. */
+            }
+        }
+    }
+
+    pWav->dataChunkDataSizeTargetWrite = initialDataChunkSize;
+
+
+    /* "RIFF" chunk. */
+    if (pFormat->container == drwav_container_riff) {
+        drwav_uint32 chunkSizeRIFF = 36 + (drwav_uint32)initialDataChunkSize;   /* +36 = "WAVE" + [sizeof "fmt " chunk] + [data chunk header] */
+        runningPos += drwav__write(pWav, "RIFF", 4);
+        runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeRIFF);
+        runningPos += drwav__write(pWav, "WAVE", 4);
+    } else if (pFormat->container == drwav_container_w64) {
+        drwav_uint64 chunkSizeRIFF = 80 + 24 + initialDataChunkSize;            /* +24 because W64 includes the size of the GUID and size fields. */
+        runningPos += drwav__write(pWav, drwavGUID_W64_RIFF, 16);
+        runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeRIFF);
+        runningPos += drwav__write(pWav, drwavGUID_W64_WAVE, 16);
+    } else if (pFormat->container == drwav_container_rf64) {
+        runningPos += drwav__write(pWav, "RF64", 4);
+        runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF);               /* Always 0xFFFFFFFF for RF64. Set to a proper value in the "ds64" chunk. */
+        runningPos += drwav__write(pWav, "WAVE", 4);
+    } else {
+        return DRWAV_FALSE; /* Container not supported for writing. */
+    }
+
+
+    /* "ds64" chunk (RF64 only). */
+    if (pFormat->container == drwav_container_rf64) {
+        drwav_uint32 initialds64ChunkSize = 28;                                 /* 28 = [Size of RIFF (8 bytes)] + [Size of DATA (8 bytes)] + [Sample Count (8 bytes)] + [Table Length (4 bytes)]. Table length always set to 0. */
+        drwav_uint64 initialRiffChunkSize = 8 + initialds64ChunkSize + initialDataChunkSize;    /* +8 for the ds64 header. */
+
+        runningPos += drwav__write(pWav, "ds64", 4);
+        runningPos += drwav__write_u32ne_to_le(pWav, initialds64ChunkSize);     /* Size of ds64. */
+        runningPos += drwav__write_u64ne_to_le(pWav, initialRiffChunkSize);     /* Size of RIFF. Set to true value at the end. */
+        runningPos += drwav__write_u64ne_to_le(pWav, initialDataChunkSize);     /* Size of DATA. Set to true value at the end. */
+        runningPos += drwav__write_u64ne_to_le(pWav, totalSampleCount);         /* Sample count. */
+        runningPos += drwav__write_u32ne_to_le(pWav, 0);                        /* Table length. Always set to zero in our case since we're not doing any other chunks than "DATA". */
+    }
+
+
+    /* "fmt " chunk. */
+    if (pFormat->container == drwav_container_riff || pFormat->container == drwav_container_rf64) {
+        chunkSizeFMT = 16;
+        runningPos += drwav__write(pWav, "fmt ", 4);
+        runningPos += drwav__write_u32ne_to_le(pWav, (drwav_uint32)chunkSizeFMT);
+    } else if (pFormat->container == drwav_container_w64) {
+        chunkSizeFMT = 40;
+        runningPos += drwav__write(pWav, drwavGUID_W64_FMT, 16);
+        runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeFMT);
+    }
+
+    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.formatTag);          /* The "fmt " body is the same for every container. */
+    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.channels);
+    runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.sampleRate);
+    runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.avgBytesPerSec);
+    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.blockAlign);
+    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.bitsPerSample);
+
+    /* TODO: is a 'fact' chunk required for DR_WAVE_FORMAT_IEEE_FLOAT? */
+
+    if (!pWav->isSequentialWrite && pWav->pMetadata != NULL && pWav->metadataCount > 0 && (pFormat->container == drwav_container_riff || pFormat->container == drwav_container_rf64)) {    /* Metadata is skipped for sequential writes and for W64. */
+        runningPos += drwav__write_or_count_metadata(pWav, pWav->pMetadata, pWav->metadataCount);
+    }
+
+    pWav->dataChunkDataPos = runningPos;    /* Position before the "data" chunk header. Overwritten at the end of this function. */
+
+    /* "data" chunk. */
+    if (pFormat->container == drwav_container_riff) {
+        drwav_uint32 chunkSizeDATA = (drwav_uint32)initialDataChunkSize;
+        runningPos += drwav__write(pWav, "data", 4);
+        runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeDATA);
+    } else if (pFormat->container == drwav_container_w64) {
+        drwav_uint64 chunkSizeDATA = 24 + initialDataChunkSize;     /* +24 because W64 includes the size of the GUID and size fields. */
+        runningPos += drwav__write(pWav, drwavGUID_W64_DATA, 16);
+        runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeDATA);
+    } else if (pFormat->container == drwav_container_rf64) {
+        runningPos += drwav__write(pWav, "data", 4);
+        runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF);   /* Always set to 0xFFFFFFFF for RF64. The true size of the data chunk is specified in the ds64 chunk. */
+    }
+
+    /* Set some properties for the client's convenience. */
+    pWav->container = pFormat->container;
+    pWav->channels = (drwav_uint16)pFormat->channels;
+    pWav->sampleRate = pFormat->sampleRate;
+    pWav->bitsPerSample = (drwav_uint16)pFormat->bitsPerSample;
+    pWav->translatedFormatTag = (drwav_uint16)pFormat->format;
+    pWav->dataChunkDataPos = runningPos;    /* Final value: the running total after the "data" chunk header writes. */
+
+    return DRWAV_TRUE;
+}
+
+
+/*
+Initializes pWav for non-sequential writing through the given callbacks. Returns DRWAV_FALSE when drwav_preinit_write() rejects
+the arguments, otherwise the result of drwav_init_write__internal().
+*/
+DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    /* DRWAV_FALSE = Not Sequential. */
+    if (drwav_preinit_write(pWav, pFormat, DRWAV_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) {
+        /* The total sample count is not known up front in this mode, so 0 is passed. */
+        return drwav_init_write__internal(pWav, pFormat, 0);
+    }
+
+    return DRWAV_FALSE;
+}
+
+/*
+Initializes pWav for sequential writing of totalSampleCount samples. No seek callback is taken; NULL is passed on in its place.
+Returns DRWAV_FALSE when drwav_preinit_write() rejects the arguments, otherwise the result of drwav_init_write__internal().
+*/
+DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    /* DRWAV_TRUE = Sequential. */
+    if (drwav_preinit_write(pWav, pFormat, DRWAV_TRUE, onWrite, NULL, pUserData, pAllocationCallbacks)) {
+        return drwav_init_write__internal(pWav, pFormat, totalSampleCount);
+    }
+
+    return DRWAV_FALSE;
+}
+
+/*
+Same as drwav_init_write_sequential(), except that the length is given in PCM frames. The frame count is multiplied by the
+channel count of pFormat to obtain the sample count. Returns DRWAV_FALSE when pFormat is NULL.
+*/
+DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_uint64 totalSampleCount;
+
+    /* pFormat is read below, so it has to be validated before anything else. */
+    if (pFormat == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    totalSampleCount = totalPCMFrameCount*pFormat->channels;
+
+    return drwav_init_write_sequential(pWav, pFormat, totalSampleCount, onWrite, pUserData, pAllocationCallbacks);
+}
+
+/*
+Initializes pWav for non-sequential writing and attaches the given metadata array so that drwav_init_write__internal() can see
+it. Returns DRWAV_FALSE when drwav_preinit_write() rejects the arguments.
+*/
+DRWAV_API drwav_bool32 drwav_init_write_with_metadata(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks, drwav_metadata* pMetadata, drwav_uint32 metadataCount)
+{
+    if (drwav_preinit_write(pWav, pFormat, DRWAV_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) {
+        /* These must be assigned after the pre-init step because that step zeroes the whole drwav object. */
+        pWav->metadataCount = metadataCount;
+        pWav->pMetadata     = pMetadata;
+
+        return drwav_init_write__internal(pWav, pFormat, 0);
+    }
+
+    return DRWAV_FALSE;
+}
+
+
+/*
+Computes the total size in bytes of the file that would be produced for totalFrameCount PCM frames in the given format, including
+the metadata chunks for the RIFF and RF64 containers.
+
+Returns 0 when pFormat is NULL or when the container is not RIFF, W64 or RF64.
+*/
+DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalFrameCount, drwav_metadata* pMetadata, drwav_uint32 metadataCount)
+{
+    drwav_uint64 targetDataSizeBytes;
+    drwav_uint64 riffChunkSizeBytes;
+    drwav_uint64 fileSizeBytes = 0;
+
+    if (pFormat == NULL) {
+        return 0;   /* Nothing can be computed without a format. Same result as for an unsupported container. */
+    }
+
+    /*
+    Pure integer arithmetic, the same way drwav_init_write__internal() sizes the data chunk. Going through a double would lose
+    precision once the bit count exceeds 2^53, and the integer division truncates exactly like the float-to-integer cast did.
+    */
+    targetDataSizeBytes = (totalFrameCount * pFormat->channels * pFormat->bitsPerSample) / 8;
+
+    if (pFormat->container == drwav_container_riff) {
+        riffChunkSizeBytes = drwav__riff_chunk_size_riff(targetDataSizeBytes, pMetadata, metadataCount);
+        fileSizeBytes = (8 + riffChunkSizeBytes);   /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */
+    } else if (pFormat->container == drwav_container_w64) {
+        riffChunkSizeBytes = drwav__riff_chunk_size_w64(targetDataSizeBytes);
+        fileSizeBytes = riffChunkSizeBytes;
+    } else if (pFormat->container == drwav_container_rf64) {
+        riffChunkSizeBytes = drwav__riff_chunk_size_rf64(targetDataSizeBytes, pMetadata, metadataCount);
+        fileSizeBytes = (8 + riffChunkSizeBytes);   /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */
+    }
+
+    return fileSizeBytes;
+}
+
+
+#ifndef DR_WAV_NO_STDIO
+
+/* Errno */
+/* drwav_result_from_errno() is only used for fopen() and wfopen(), so it lives inside the "#ifndef DR_WAV_NO_STDIO" section for now. If something else needs this later we can move it out. */
+#include <errno.h>
+/*
+Translates a C errno value into the corresponding drwav_result code.
+
+e - The error number to translate. A value of 0 maps to DRWAV_SUCCESS.
+
+Every case other than 0 is wrapped in a preprocessor check so that only the error constants defined by the
+target platform's <errno.h> are compiled in. Any value without a matching case yields the generic DRWAV_ERROR.
+*/
+DRWAV_PRIVATE drwav_result drwav_result_from_errno(int e)
+{
+    switch (e)
+    {
+        case 0: return DRWAV_SUCCESS;
+    #ifdef EPERM
+        case EPERM: return DRWAV_INVALID_OPERATION;
+    #endif
+    #ifdef ENOENT
+        case ENOENT: return DRWAV_DOES_NOT_EXIST;
+    #endif
+    #ifdef ESRCH
+        case ESRCH: return DRWAV_DOES_NOT_EXIST;
+    #endif
+    #ifdef EINTR
+        case EINTR: return DRWAV_INTERRUPT;
+    #endif
+    #ifdef EIO
+        case EIO: return DRWAV_IO_ERROR;
+    #endif
+    #ifdef ENXIO
+        case ENXIO: return DRWAV_DOES_NOT_EXIST;
+    #endif
+    #ifdef E2BIG
+        case E2BIG: return DRWAV_INVALID_ARGS;
+    #endif
+    #ifdef ENOEXEC
+        case ENOEXEC: return DRWAV_INVALID_FILE;
+    #endif
+    #ifdef EBADF
+        case EBADF: return DRWAV_INVALID_FILE;
+    #endif
+    #ifdef ECHILD
+        case ECHILD: return DRWAV_ERROR;
+    #endif
+    #ifdef EAGAIN
+        case EAGAIN: return DRWAV_UNAVAILABLE;
+    #endif
+    #ifdef ENOMEM
+        case ENOMEM: return DRWAV_OUT_OF_MEMORY;
+    #endif
+    #ifdef EACCES
+        case EACCES: return DRWAV_ACCESS_DENIED;
+    #endif
+    #ifdef EFAULT
+        case EFAULT: return DRWAV_BAD_ADDRESS;
+    #endif
+    #ifdef ENOTBLK
+        case ENOTBLK: return DRWAV_ERROR;
+    #endif
+    #ifdef EBUSY
+        case EBUSY: return DRWAV_BUSY;
+    #endif
+    #ifdef EEXIST
+        case EEXIST: return DRWAV_ALREADY_EXISTS;
+    #endif
+    #ifdef EXDEV
+        case EXDEV: return DRWAV_ERROR;
+    #endif
+    #ifdef ENODEV
+        case ENODEV: return DRWAV_DOES_NOT_EXIST;
+    #endif
+    #ifdef ENOTDIR
+        case ENOTDIR: return DRWAV_NOT_DIRECTORY;
+    #endif
+    #ifdef EISDIR
+        case EISDIR: return DRWAV_IS_DIRECTORY;
+    #endif
+    #ifdef EINVAL
+        case EINVAL: return DRWAV_INVALID_ARGS;
+    #endif
+    #ifdef ENFILE
+        case ENFILE: return DRWAV_TOO_MANY_OPEN_FILES;
+    #endif
+    #ifdef EMFILE
+        case EMFILE: return DRWAV_TOO_MANY_OPEN_FILES;
+    #endif
+    #ifdef ENOTTY
+        case ENOTTY: return DRWAV_INVALID_OPERATION;
+    #endif
+    #ifdef ETXTBSY
+        case ETXTBSY: return DRWAV_BUSY;
+    #endif
+    #ifdef EFBIG
+        case EFBIG: return DRWAV_TOO_BIG;
+    #endif
+    #ifdef ENOSPC
+        case ENOSPC: return DRWAV_NO_SPACE;
+    #endif
+    #ifdef ESPIPE
+        case ESPIPE: return DRWAV_BAD_SEEK;
+    #endif
+    #ifdef EROFS
+        case EROFS: return DRWAV_ACCESS_DENIED;
+    #endif
+    #ifdef EMLINK
+        case EMLINK: return DRWAV_TOO_MANY_LINKS;
+    #endif
+    #ifdef EPIPE
+        case EPIPE: return DRWAV_BAD_PIPE;
+    #endif
+    #ifdef EDOM
+        case EDOM: return DRWAV_OUT_OF_RANGE;
+    #endif
+    #ifdef ERANGE
+        case ERANGE: return DRWAV_OUT_OF_RANGE;
+    #endif
+    #ifdef EDEADLK
+        case EDEADLK: return DRWAV_DEADLOCK;
+    #endif
+    #ifdef ENAMETOOLONG
+        case ENAMETOOLONG: return DRWAV_PATH_TOO_LONG;
+    #endif
+    #ifdef ENOLCK
+        case ENOLCK: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOSYS
+        case ENOSYS: return DRWAV_NOT_IMPLEMENTED;
+    #endif
+    #if defined(ENOTEMPTY) && ENOTEMPTY != EEXIST   /* In AIX, ENOTEMPTY and EEXIST use the same value. */
+        case ENOTEMPTY: return DRWAV_DIRECTORY_NOT_EMPTY;
+    #endif
+    #ifdef ELOOP
+        case ELOOP: return DRWAV_TOO_MANY_LINKS;
+    #endif
+    #ifdef ENOMSG
+        case ENOMSG: return DRWAV_NO_MESSAGE;
+    #endif
+    #ifdef EIDRM
+        case EIDRM: return DRWAV_ERROR;
+    #endif
+    #ifdef ECHRNG
+        case ECHRNG: return DRWAV_ERROR;
+    #endif
+    #ifdef EL2NSYNC
+        case EL2NSYNC: return DRWAV_ERROR;
+    #endif
+    #ifdef EL3HLT
+        case EL3HLT: return DRWAV_ERROR;
+    #endif
+    #ifdef EL3RST
+        case EL3RST: return DRWAV_ERROR;
+    #endif
+    #ifdef ELNRNG
+        case ELNRNG: return DRWAV_OUT_OF_RANGE;
+    #endif
+    #ifdef EUNATCH
+        case EUNATCH: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOCSI
+        case ENOCSI: return DRWAV_ERROR;
+    #endif
+    #ifdef EL2HLT
+        case EL2HLT: return DRWAV_ERROR;
+    #endif
+    #ifdef EBADE
+        case EBADE: return DRWAV_ERROR;
+    #endif
+    #ifdef EBADR
+        case EBADR: return DRWAV_ERROR;
+    #endif
+    #ifdef EXFULL
+        case EXFULL: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOANO
+        case ENOANO: return DRWAV_ERROR;
+    #endif
+    #ifdef EBADRQC
+        case EBADRQC: return DRWAV_ERROR;
+    #endif
+    #ifdef EBADSLT
+        case EBADSLT: return DRWAV_ERROR;
+    #endif
+    #ifdef EBFONT
+        case EBFONT: return DRWAV_INVALID_FILE;
+    #endif
+    #ifdef ENOSTR
+        case ENOSTR: return DRWAV_ERROR;
+    #endif
+    #ifdef ENODATA
+        case ENODATA: return DRWAV_NO_DATA_AVAILABLE;
+    #endif
+    #ifdef ETIME
+        case ETIME: return DRWAV_TIMEOUT;
+    #endif
+    #ifdef ENOSR
+        case ENOSR: return DRWAV_NO_DATA_AVAILABLE;
+    #endif
+    #ifdef ENONET
+        case ENONET: return DRWAV_NO_NETWORK;
+    #endif
+    #ifdef ENOPKG
+        case ENOPKG: return DRWAV_ERROR;
+    #endif
+    #ifdef EREMOTE
+        case EREMOTE: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOLINK
+        case ENOLINK: return DRWAV_ERROR;
+    #endif
+    #ifdef EADV
+        case EADV: return DRWAV_ERROR;
+    #endif
+    #ifdef ESRMNT
+        case ESRMNT: return DRWAV_ERROR;
+    #endif
+    #ifdef ECOMM
+        case ECOMM: return DRWAV_ERROR;
+    #endif
+    #ifdef EPROTO
+        case EPROTO: return DRWAV_ERROR;
+    #endif
+    #ifdef EMULTIHOP
+        case EMULTIHOP: return DRWAV_ERROR;
+    #endif
+    #ifdef EDOTDOT
+        case EDOTDOT: return DRWAV_ERROR;
+    #endif
+    #ifdef EBADMSG
+        case EBADMSG: return DRWAV_BAD_MESSAGE;
+    #endif
+    #ifdef EOVERFLOW
+        case EOVERFLOW: return DRWAV_TOO_BIG;
+    #endif
+    #ifdef ENOTUNIQ
+        case ENOTUNIQ: return DRWAV_NOT_UNIQUE;
+    #endif
+    #ifdef EBADFD
+        case EBADFD: return DRWAV_ERROR;
+    #endif
+    #ifdef EREMCHG
+        case EREMCHG: return DRWAV_ERROR;
+    #endif
+    #ifdef ELIBACC
+        case ELIBACC: return DRWAV_ACCESS_DENIED;
+    #endif
+    #ifdef ELIBBAD
+        case ELIBBAD: return DRWAV_INVALID_FILE;
+    #endif
+    #ifdef ELIBSCN
+        case ELIBSCN: return DRWAV_INVALID_FILE;
+    #endif
+    #ifdef ELIBMAX
+        case ELIBMAX: return DRWAV_ERROR;
+    #endif
+    #ifdef ELIBEXEC
+        case ELIBEXEC: return DRWAV_ERROR;
+    #endif
+    #ifdef EILSEQ
+        case EILSEQ: return DRWAV_INVALID_DATA;
+    #endif
+    #ifdef ERESTART
+        case ERESTART: return DRWAV_ERROR;
+    #endif
+    #ifdef ESTRPIPE
+        case ESTRPIPE: return DRWAV_ERROR;
+    #endif
+    #ifdef EUSERS
+        case EUSERS: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOTSOCK
+        case ENOTSOCK: return DRWAV_NOT_SOCKET;
+    #endif
+    #ifdef EDESTADDRREQ
+        case EDESTADDRREQ: return DRWAV_NO_ADDRESS;
+    #endif
+    #ifdef EMSGSIZE
+        case EMSGSIZE: return DRWAV_TOO_BIG;
+    #endif
+    #ifdef EPROTOTYPE
+        case EPROTOTYPE: return DRWAV_BAD_PROTOCOL;
+    #endif
+    #ifdef ENOPROTOOPT
+        case ENOPROTOOPT: return DRWAV_PROTOCOL_UNAVAILABLE;
+    #endif
+    #ifdef EPROTONOSUPPORT
+        case EPROTONOSUPPORT: return DRWAV_PROTOCOL_NOT_SUPPORTED;
+    #endif
+    #ifdef ESOCKTNOSUPPORT
+        case ESOCKTNOSUPPORT: return DRWAV_SOCKET_NOT_SUPPORTED;
+    #endif
+    #ifdef EOPNOTSUPP
+        case EOPNOTSUPP: return DRWAV_INVALID_OPERATION;
+    #endif
+    #ifdef EPFNOSUPPORT
+        case EPFNOSUPPORT: return DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED;
+    #endif
+    #ifdef EAFNOSUPPORT
+        case EAFNOSUPPORT: return DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED;
+    #endif
+    #ifdef EADDRINUSE
+        case EADDRINUSE: return DRWAV_ALREADY_IN_USE;
+    #endif
+    #ifdef EADDRNOTAVAIL
+        case EADDRNOTAVAIL: return DRWAV_ERROR;
+    #endif
+    #ifdef ENETDOWN
+        case ENETDOWN: return DRWAV_NO_NETWORK;
+    #endif
+    #ifdef ENETUNREACH
+        case ENETUNREACH: return DRWAV_NO_NETWORK;
+    #endif
+    #ifdef ENETRESET
+        case ENETRESET: return DRWAV_NO_NETWORK;
+    #endif
+    #ifdef ECONNABORTED
+        case ECONNABORTED: return DRWAV_NO_NETWORK;
+    #endif
+    #ifdef ECONNRESET
+        case ECONNRESET: return DRWAV_CONNECTION_RESET;
+    #endif
+    #ifdef ENOBUFS
+        case ENOBUFS: return DRWAV_NO_SPACE;
+    #endif
+    #ifdef EISCONN
+        case EISCONN: return DRWAV_ALREADY_CONNECTED;
+    #endif
+    #ifdef ENOTCONN
+        case ENOTCONN: return DRWAV_NOT_CONNECTED;
+    #endif
+    #ifdef ESHUTDOWN
+        case ESHUTDOWN: return DRWAV_ERROR;
+    #endif
+    #ifdef ETOOMANYREFS
+        case ETOOMANYREFS: return DRWAV_ERROR;
+    #endif
+    #ifdef ETIMEDOUT
+        case ETIMEDOUT: return DRWAV_TIMEOUT;
+    #endif
+    #ifdef ECONNREFUSED
+        case ECONNREFUSED: return DRWAV_CONNECTION_REFUSED;
+    #endif
+    #ifdef EHOSTDOWN
+        case EHOSTDOWN: return DRWAV_NO_HOST;
+    #endif
+    #ifdef EHOSTUNREACH
+        case EHOSTUNREACH: return DRWAV_NO_HOST;
+    #endif
+    #ifdef EALREADY
+        case EALREADY: return DRWAV_IN_PROGRESS;
+    #endif
+    #ifdef EINPROGRESS
+        case EINPROGRESS: return DRWAV_IN_PROGRESS;
+    #endif
+    #ifdef ESTALE
+        case ESTALE: return DRWAV_INVALID_FILE;
+    #endif
+    #ifdef EUCLEAN
+        case EUCLEAN: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOTNAM
+        case ENOTNAM: return DRWAV_ERROR;
+    #endif
+    #ifdef ENAVAIL
+        case ENAVAIL: return DRWAV_ERROR;
+    #endif
+    #ifdef EISNAM
+        case EISNAM: return DRWAV_ERROR;
+    #endif
+    #ifdef EREMOTEIO
+        case EREMOTEIO: return DRWAV_IO_ERROR;
+    #endif
+    #ifdef EDQUOT
+        case EDQUOT: return DRWAV_NO_SPACE;
+    #endif
+    #ifdef ENOMEDIUM
+        case ENOMEDIUM: return DRWAV_DOES_NOT_EXIST;
+    #endif
+    #ifdef EMEDIUMTYPE
+        case EMEDIUMTYPE: return DRWAV_ERROR;
+    #endif
+    #ifdef ECANCELED
+        case ECANCELED: return DRWAV_CANCELLED;
+    #endif
+    #ifdef ENOKEY
+        case ENOKEY: return DRWAV_ERROR;
+    #endif
+    #ifdef EKEYEXPIRED
+        case EKEYEXPIRED: return DRWAV_ERROR;
+    #endif
+    #ifdef EKEYREVOKED
+        case EKEYREVOKED: return DRWAV_ERROR;
+    #endif
+    #ifdef EKEYREJECTED
+        case EKEYREJECTED: return DRWAV_ERROR;
+    #endif
+    #ifdef EOWNERDEAD
+        case EOWNERDEAD: return DRWAV_ERROR;
+    #endif
+    #ifdef ENOTRECOVERABLE
+        case ENOTRECOVERABLE: return DRWAV_ERROR;
+    #endif
+    #ifdef ERFKILL
+        case ERFKILL: return DRWAV_ERROR;
+    #endif
+    #ifdef EHWPOISON
+        case EHWPOISON: return DRWAV_ERROR;
+    #endif
+        /* Anything not listed above, including constants this platform does not define, is a generic error. */
+        default: return DRWAV_ERROR;
+    }
+}
+/* End Errno */
+
+/* fopen */
+/*
+Opens a file using a narrow-character path and mode, storing the handle in *ppFile.
+
+Returns DRWAV_INVALID_ARGS when any argument is NULL and DRWAV_SUCCESS when the file was opened. On failure
+the error number is translated with drwav_result_from_errno(). Whenever ppFile is non-NULL, *ppFile is reset
+to NULL before anything else happens.
+*/
+DRWAV_PRIVATE drwav_result drwav_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode)
+{
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+    errno_t openError;
+#endif
+
+    if (ppFile == NULL) {
+        return DRWAV_INVALID_ARGS;
+    }
+
+    *ppFile = NULL;  /* Start from a known state so a failed open never leaves a stale handle behind. */
+
+    if (pFilePath == NULL || pOpenMode == NULL) {
+        return DRWAV_INVALID_ARGS;
+    }
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+    /* fopen_s() hands back the error number directly. */
+    openError = fopen_s(ppFile, pFilePath, pOpenMode);
+    if (openError != 0) {
+        return drwav_result_from_errno(openError);
+    }
+#else
+    /* fopen64() is only used outside of Windows and Apple platforms, and only when 64-bit file offsets were requested. */
+    #if !defined(_WIN32) && !defined(__APPLE__) && defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE)
+        *ppFile = fopen64(pFilePath, pOpenMode);
+    #else
+        *ppFile = fopen(pFilePath, pOpenMode);
+    #endif
+
+    if (*ppFile == NULL) {
+        drwav_result failure = drwav_result_from_errno(errno);
+
+        /* Just a safety check to make sure we never ever return success when the file handle is NULL. */
+        return (failure != DRWAV_SUCCESS) ? failure : DRWAV_ERROR;
+    }
+#endif
+
+    return DRWAV_SUCCESS;
+}
+
+/*
+_wfopen() isn't always available in all compilation environments.
+
+    * Windows only.
+    * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back).
+    * MinGW-64 (both 32- and 64-bit) seems to support it.
+    * MinGW wraps it in !defined(__STRICT_ANSI__).
+    * OpenWatcom wraps it in !defined(_NO_EXT_KEYS).
+
+This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs()
+fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support.
+*/
+#if defined(_WIN32)
+    #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS))
+        #define DRWAV_HAS_WFOPEN
+    #endif
+#endif
+
+#ifndef DR_WAV_NO_WCHAR
+/*
+Opens a file using a wide-character path and mode, storing the handle in *ppFile.
+
+ppFile               - Receives the opened handle. Reset to NULL up front whenever it is non-NULL.
+pFilePath            - Wide-character path of the file to open.
+pOpenMode            - Wide-character open mode. In the conversion fallback it must fit, together with its
+                       null terminator, in the 32 byte narrow buffer.
+pAllocationCallbacks - Only used by the conversion fallback, for the temporary multibyte path buffer.
+
+Returns DRWAV_INVALID_ARGS when any of the first three arguments is NULL, or when the open mode is too long
+for the fallback's buffer. Returns DRWAV_SUCCESS when the file was opened, otherwise an error code.
+*/
+DRWAV_PRIVATE drwav_result drwav_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (ppFile != NULL) {
+        *ppFile = NULL;  /* Safety. */
+    }
+
+    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
+        return DRWAV_INVALID_ARGS;
+    }
+
+#if defined(DRWAV_HAS_WFOPEN)
+    {
+        /* Use _wfopen() on Windows. */
+    #if defined(_MSC_VER) && _MSC_VER >= 1400
+        errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode);
+        if (err != 0) {
+            return drwav_result_from_errno(err);
+        }
+    #else
+        *ppFile = _wfopen(pFilePath, pOpenMode);
+        if (*ppFile == NULL) {
+            drwav_result result = drwav_result_from_errno(errno);
+            if (result == DRWAV_SUCCESS) {
+                result = DRWAV_ERROR;   /* Same safety check as drwav_fopen(): never return success when the file handle is NULL. */
+            }
+
+            return result;
+        }
+    #endif
+        (void)pAllocationCallbacks;
+    }
+#else
+    /*
+    Use fopen() on anything other than Windows. Requires a conversion. This is annoying because
+    fopen() is locale specific. The only real way I can think of to do this is with wcsrtombs(). Note
+    that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for
+    maintaining state. I've checked this with -std=c89 and it works, but if somebody gets a compiler
+    error I'll look into improving compatibility.
+    */
+
+    /*
+    Some compilers don't support wchar_t or wcsrtombs() which we're using below. In this case we just
+    need to abort with an error. If you encounter a compiler lacking such support, add it to this list
+    and submit a bug report and it'll be added to the library upstream.
+    */
+    #if defined(__DJGPP__)
+    {
+        /* Nothing to do here. This will fall through to the error check below. */
+    }
+    #else
+    {
+        mbstate_t mbs;
+        size_t lenMB;
+        const wchar_t* pFilePathTemp = pFilePath;
+        char* pFilePathMB = NULL;
+        char pOpenModeMB[32] = {0};
+
+        /* Get the length first. */
+        DRWAV_ZERO_OBJECT(&mbs);
+        lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs);
+        if (lenMB == (size_t)-1) {
+            return drwav_result_from_errno(errno);
+        }
+
+        pFilePathMB = (char*)drwav__malloc_from_callbacks(lenMB + 1, pAllocationCallbacks);
+        if (pFilePathMB == NULL) {
+            return DRWAV_OUT_OF_MEMORY;
+        }
+
+        pFilePathTemp = pFilePath;
+        DRWAV_ZERO_OBJECT(&mbs);
+        wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs);
+
+        /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. */
+        {
+            size_t i = 0;
+            for (;;) {
+                if (pOpenMode[i] == 0) {
+                    pOpenModeMB[i] = '\0';
+                    break;
+                }
+
+                /* Always leave room for the null terminator. A mode this long cannot be valid, so reject it rather than overrun the buffer. */
+                if (i >= sizeof(pOpenModeMB) - 1) {
+                    drwav__free_from_callbacks(pFilePathMB, pAllocationCallbacks);
+                    return DRWAV_INVALID_ARGS;
+                }
+
+                pOpenModeMB[i] = (char)pOpenMode[i];
+                i += 1;
+            }
+        }
+
+        *ppFile = fopen(pFilePathMB, pOpenModeMB);
+
+        drwav__free_from_callbacks(pFilePathMB, pAllocationCallbacks);
+    }
+    #endif
+
+    if (*ppFile == NULL) {
+        return DRWAV_ERROR;
+    }
+#endif
+
+    return DRWAV_SUCCESS;
+}
+#endif
+/* End fopen */
+
+
+/* Read callback backed by stdio. pUserData is the FILE* to read from. Returns the byte count reported by fread(). */
+DRWAV_PRIVATE size_t drwav__on_read_stdio(void* pUserData, void* pBufferOut, size_t bytesToRead)
+{
+    FILE* pFile = (FILE*)pUserData;
+    size_t bytesRead;
+
+    bytesRead = fread(pBufferOut, 1, bytesToRead, pFile);
+
+    return bytesRead;
+}
+
+/* Write callback backed by stdio. pUserData is the FILE* to write to. Returns the byte count reported by fwrite(). */
+DRWAV_PRIVATE size_t drwav__on_write_stdio(void* pUserData, const void* pData, size_t bytesToWrite)
+{
+    FILE* pFile = (FILE*)pUserData;
+    size_t bytesWritten;
+
+    bytesWritten = fwrite(pData, 1, bytesToWrite, pFile);
+
+    return bytesWritten;
+}
+
+/*
+Seek callback backed by stdio. pUserData is the FILE* to reposition.
+
+DRWAV_SEEK_CUR and DRWAV_SEEK_END map to SEEK_CUR and SEEK_END. Every other origin is treated as SEEK_SET.
+The result is non-zero only when fseek() returns 0.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__on_seek_stdio(void* pUserData, int offset, drwav_seek_origin origin)
+{
+    FILE* pFile = (FILE*)pUserData;
+    int whence = (origin == DRWAV_SEEK_CUR) ? SEEK_CUR : ((origin == DRWAV_SEEK_END) ? SEEK_END : SEEK_SET);
+
+    return fseek(pFile, offset, whence) == 0;
+}
+
+/*
+Tell callback backed by stdio. pUserData is the FILE* to query.
+
+pCursor - Receives the position reported by _ftelli64() (MSVC newer than VC6 on Windows, excluding NXDK) or
+          by ftell() everywhere else. The raw value is stored even when the query fails.
+
+Returns DRWAV_TRUE on success, or DRWAV_FALSE when the underlying call reports failure with a negative value.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__on_tell_stdio(void* pUserData, drwav_int64* pCursor)
+{
+    FILE* pFileStdio = (FILE*)pUserData;
+    drwav_int64 result;
+
+    /* These were all validated at a higher level. */
+    DRWAV_ASSERT(pFileStdio != NULL);
+    DRWAV_ASSERT(pCursor    != NULL);
+
+#if defined(_WIN32) && !defined(NXDK)
+    #if defined(_MSC_VER) && _MSC_VER > 1200
+        result = _ftelli64(pFileStdio);
+    #else
+        result = ftell(pFileStdio);
+    #endif
+#else
+    result = ftell(pFileStdio);
+#endif
+
+    *pCursor = result;
+
+    /* Both ftell() and _ftelli64() return -1 on failure. Don't report a bogus cursor as a success. */
+    if (result < 0) {
+        return DRWAV_FALSE;
+    }
+
+    return DRWAV_TRUE;
+}
+
+/* Convenience wrapper over drwav_init_file_ex() that passes no chunk callback, no chunk user data and no flags. */
+DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_uint32 noFlags = 0;
+
+    return drwav_init_file_ex(pWav, filename, NULL, NULL, noFlags, pAllocationCallbacks);
+}
+
+
+/*
+Initializes pWav for reading from an already-open FILE*, wiring up the stdio read, seek and tell callbacks.
+
+If either drwav_preinit() or drwav_init__internal() does not return DRWAV_TRUE, the file is closed with
+fclose() and that result is returned unchanged.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_init_file__internal_FILE(drwav* pWav, FILE* pFile, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_bool32 status;
+
+    status = drwav_preinit(pWav, drwav__on_read_stdio, drwav__on_seek_stdio, drwav__on_tell_stdio, (void*)pFile, pAllocationCallbacks);
+    if (status == DRWAV_TRUE) {
+        status = drwav_init__internal(pWav, onChunk, pChunkUserData, flags);
+    }
+
+    /* Single cleanup point: a failure in either stage means the file has to be closed here. */
+    if (status != DRWAV_TRUE) {
+        fclose(pFile);
+    }
+
+    return status;
+}
+
+/*
+Opens filename in "rb" mode and initializes pWav for reading from it, forwarding the chunk callback, its user
+data and the flags. Returns DRWAV_FALSE when the file cannot be opened.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    FILE* pFile;
+    drwav_result openResult;
+
+    openResult = drwav_fopen(&pFile, filename, "rb");
+    if (openResult != DRWAV_SUCCESS) {
+        return DRWAV_FALSE;
+    }
+
+    /* The callee owns the FILE* from here on and closes it if initialization fails. */
+    return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks);
+}
+
+#ifndef DR_WAV_NO_WCHAR
+/* Wide-character convenience wrapper over drwav_init_file_ex_w() that passes no chunk callback, no chunk user data and no flags. */
+DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_uint32 noFlags = 0;
+
+    return drwav_init_file_ex_w(pWav, filename, NULL, NULL, noFlags, pAllocationCallbacks);
+}
+
+/*
+Wide-character variant of drwav_init_file_ex(). Opens filename in L"rb" mode through drwav_wfopen() and
+initializes pWav for reading from it. Returns DRWAV_FALSE when the file cannot be opened.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    FILE* pFile;
+    drwav_result openResult;
+
+    openResult = drwav_wfopen(&pFile, filename, L"rb", pAllocationCallbacks);
+    if (openResult != DRWAV_SUCCESS) {
+        return DRWAV_FALSE;
+    }
+
+    /* The callee owns the FILE* from here on and closes it if initialization fails. */
+    return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks);
+}
+#endif
+
+/*
+Opens filename in "rb" mode and initializes pWav for reading with DRWAV_WITH_METADATA added to the supplied
+flags. No chunk callback is used. Returns DRWAV_FALSE when the file cannot be opened.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_with_metadata(drwav* pWav, const char* filename, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    FILE* pFile;
+    drwav_result openResult;
+
+    openResult = drwav_fopen(&pFile, filename, "rb");
+    if (openResult != DRWAV_SUCCESS) {
+        return DRWAV_FALSE;
+    }
+
+    /* The callee owns the FILE* from here on and closes it if initialization fails. */
+    return drwav_init_file__internal_FILE(pWav, pFile, NULL, NULL, flags | DRWAV_WITH_METADATA, pAllocationCallbacks);
+}
+
+#ifndef DR_WAV_NO_WCHAR
+/*
+Wide-character variant of drwav_init_file_with_metadata(). Opens filename in L"rb" mode through
+drwav_wfopen() and initializes pWav for reading with DRWAV_WITH_METADATA added to the supplied flags.
+Returns DRWAV_FALSE when the file cannot be opened.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_with_metadata_w(drwav* pWav, const wchar_t* filename, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    FILE* pFile;
+    drwav_result openResult;
+
+    openResult = drwav_wfopen(&pFile, filename, L"rb", pAllocationCallbacks);
+    if (openResult != DRWAV_SUCCESS) {
+        return DRWAV_FALSE;
+    }
+
+    /* The callee owns the FILE* from here on and closes it if initialization fails. */
+    return drwav_init_file__internal_FILE(pWav, pFile, NULL, NULL, flags | DRWAV_WITH_METADATA, pAllocationCallbacks);
+}
+#endif
+
+
+/*
+Initializes pWav for writing to an already-open FILE*, wiring up the stdio write and seek callbacks.
+
+If either drwav_preinit_write() or drwav_init_write__internal() does not return DRWAV_TRUE, the file is
+closed with fclose() and that result is returned unchanged.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_init_file_write__internal_FILE(drwav* pWav, FILE* pFile, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_bool32 status;
+
+    status = drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
+    if (status == DRWAV_TRUE) {
+        status = drwav_init_write__internal(pWav, pFormat, totalSampleCount);
+    }
+
+    /* Single cleanup point: a failure in either stage means the file has to be closed here. */
+    if (status != DRWAV_TRUE) {
+        fclose(pFile);
+    }
+
+    return status;
+}
+
+/*
+Opens filename in "wb" mode and initializes pWav for writing to it. Returns DRWAV_FALSE when the file cannot
+be opened.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_init_file_write__internal(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    FILE* pFile;
+    drwav_result openResult;
+
+    openResult = drwav_fopen(&pFile, filename, "wb");
+    if (openResult != DRWAV_SUCCESS) {
+        return DRWAV_FALSE;
+    }
+
+    /* The callee owns the FILE* from here on and closes it if initialization fails. */
+    return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
+}
+
+#ifndef DR_WAV_NO_WCHAR
+/*
+Wide-character variant of drwav_init_file_write__internal(). Opens filename in L"wb" mode through
+drwav_wfopen() and initializes pWav for writing to it. Returns DRWAV_FALSE when the file cannot be opened.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_init_file_write_w__internal(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    FILE* pFile;
+    drwav_result openResult;
+
+    openResult = drwav_wfopen(&pFile, filename, L"wb", pAllocationCallbacks);
+    if (openResult != DRWAV_SUCCESS) {
+        return DRWAV_FALSE;
+    }
+
+    /* The callee owns the FILE* from here on and closes it if initialization fails. */
+    return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
+}
+#endif
+
+/* Initializes pWav for non-sequential writing to filename. The total sample count passed along is 0. */
+DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_uint64 totalSampleCount = 0;
+
+    return drwav_init_file_write__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_FALSE, pAllocationCallbacks);
+}
+
+/* Initializes pWav for sequential writing to filename, with the total length given as a sample count. */
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_bool32 isSequential = DRWAV_TRUE;
+
+    return drwav_init_file_write__internal(pWav, filename, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
+}
+
+/*
+Initializes pWav for sequential writing to filename, with the total length given in PCM frames. The frame
+count is multiplied by pFormat->channels before being forwarded. Returns DRWAV_FALSE when pFormat is NULL.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_uint64 totalSampleCount;
+
+    /* The channel count is read from the format below, so it must be present. */
+    if (pFormat == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    totalSampleCount = totalPCMFrameCount*pFormat->channels;
+
+    return drwav_init_file_write_sequential(pWav, filename, pFormat, totalSampleCount, pAllocationCallbacks);
+}
+
+#ifndef DR_WAV_NO_WCHAR
+/* Wide-character variant of drwav_init_file_write(). Non-sequential, and the total sample count passed along is 0. */
+DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_uint64 totalSampleCount = 0;
+
+    return drwav_init_file_write_w__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_FALSE, pAllocationCallbacks);
+}
+
+/* Wide-character variant of drwav_init_file_write_sequential(). The total length is given as a sample count. */
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_bool32 isSequential = DRWAV_TRUE;
+
+    return drwav_init_file_write_w__internal(pWav, filename, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
+}
+
+/*
+Wide-character variant of drwav_init_file_write_sequential_pcm_frames(). The frame count is multiplied by
+pFormat->channels before being forwarded. Returns DRWAV_FALSE when pFormat is NULL.
+*/
+DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_uint64 totalSampleCount;
+
+    /* The channel count is read from the format below, so it must be present. */
+    if (pFormat == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    totalSampleCount = totalPCMFrameCount*pFormat->channels;
+
+    return drwav_init_file_write_sequential_w(pWav, filename, pFormat, totalSampleCount, pAllocationCallbacks);
+}
+#endif
+#endif  /* DR_WAV_NO_STDIO */
+
+
+/*
+Read callback for in-memory streams. pUserData is the drwav object whose memoryStream is being read.
+
+Copies at most bytesToRead bytes, clamped to what is left between the current read position and the end of
+the data, into pBufferOut and advances the read position. Returns the number of bytes copied.
+*/
+DRWAV_PRIVATE size_t drwav__on_read_memory(void* pUserData, void* pBufferOut, size_t bytesToRead)
+{
+    drwav* pWav = (drwav*)pUserData;
+    size_t bytesAvailable;
+    size_t bytesToCopy;
+
+    DRWAV_ASSERT(pWav != NULL);
+    DRWAV_ASSERT(pWav->memoryStream.dataSize >= pWav->memoryStream.currentReadPos);
+
+    bytesAvailable = pWav->memoryStream.dataSize - pWav->memoryStream.currentReadPos;
+    bytesToCopy    = (bytesToRead > bytesAvailable) ? bytesAvailable : bytesToRead;
+
+    /* Nothing requested, or nothing left in the stream. */
+    if (bytesToCopy == 0) {
+        return 0;
+    }
+
+    DRWAV_COPY_MEMORY(pBufferOut, pWav->memoryStream.data + pWav->memoryStream.currentReadPos, bytesToCopy);
+    pWav->memoryStream.currentReadPos += bytesToCopy;
+
+    return bytesToCopy;
+}
+
+/*
+Seek callback for in-memory read streams. pUserData is the drwav object whose memoryStream is repositioned.
+
+The offset is applied relative to the start, the current read position or the end of the data depending on
+origin. Returns DRWAV_FALSE, leaving the position untouched, for an unknown origin or when the target falls
+outside the range [0, dataSize]. Returns DRWAV_TRUE otherwise.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__on_seek_memory(void* pUserData, int offset, drwav_seek_origin origin)
+{
+    drwav* pWav = (drwav*)pUserData;
+    drwav_int64 base;
+    drwav_int64 target;
+
+    DRWAV_ASSERT(pWav != NULL);
+
+    /* Work out the position the offset is relative to. */
+    if (origin == DRWAV_SEEK_SET) {
+        base = 0;
+    } else if (origin == DRWAV_SEEK_CUR) {
+        base = (drwav_int64)pWav->memoryStream.currentReadPos;
+    } else if (origin == DRWAV_SEEK_END) {
+        base = (drwav_int64)pWav->memoryStream.dataSize;
+    } else {
+        DRWAV_ASSERT(!"Invalid seek origin");
+        return DRWAV_FALSE;
+    }
+
+    target = base + offset;
+
+    /* Reject seeks to before the start of the buffer or beyond its end. */
+    if (target < 0 || (size_t)target > pWav->memoryStream.dataSize) {
+        return DRWAV_FALSE;
+    }
+
+    pWav->memoryStream.currentReadPos = (size_t)target;
+
+    return DRWAV_TRUE;
+}
+
+/*
+Write callback for in-memory streams. pUserData is the drwav object whose memoryStreamWrite is written to.
+
+When the remaining capacity is too small the buffer is reallocated: the capacity starts at 256 bytes, doubles
+afterwards, and is raised to exactly what this write needs if doubling is not enough. The data is then copied
+at the current write position, the position is advanced, and the size is published through pDataSize.
+
+Returns bytesToWrite on success, or 0 when the reallocation fails.
+*/
+DRWAV_PRIVATE size_t drwav__on_write_memory(void* pUserData, const void* pDataIn, size_t bytesToWrite)
+{
+    drwav* pWav = (drwav*)pUserData;
+    size_t spaceLeft;
+
+    DRWAV_ASSERT(pWav != NULL);
+    DRWAV_ASSERT(pWav->memoryStreamWrite.dataCapacity >= pWav->memoryStreamWrite.currentWritePos);
+
+    spaceLeft = pWav->memoryStreamWrite.dataCapacity - pWav->memoryStreamWrite.currentWritePos;
+    if (spaceLeft < bytesToWrite) {
+        /* Not enough room. Grow the buffer. */
+        void* pGrownData;
+        size_t grownCapacity;
+
+        if (pWav->memoryStreamWrite.dataCapacity == 0) {
+            grownCapacity = 256;
+        } else {
+            grownCapacity = pWav->memoryStreamWrite.dataCapacity * 2;
+        }
+
+        /* Fall back to the exact size required when doubling still leaves too little room. */
+        if ((grownCapacity - pWav->memoryStreamWrite.currentWritePos) < bytesToWrite) {
+            grownCapacity = pWav->memoryStreamWrite.currentWritePos + bytesToWrite;
+        }
+
+        pGrownData = drwav__realloc_from_callbacks(*pWav->memoryStreamWrite.ppData, grownCapacity, pWav->memoryStreamWrite.dataCapacity, &pWav->allocationCallbacks);
+        if (pGrownData == NULL) {
+            return 0;
+        }
+
+        pWav->memoryStreamWrite.dataCapacity = grownCapacity;
+        *pWav->memoryStreamWrite.ppData = pGrownData;
+    }
+
+    DRWAV_COPY_MEMORY(((drwav_uint8*)(*pWav->memoryStreamWrite.ppData)) + pWav->memoryStreamWrite.currentWritePos, pDataIn, bytesToWrite);
+    pWav->memoryStreamWrite.currentWritePos += bytesToWrite;
+
+    /* The logical size only grows. Writing inside existing data leaves it unchanged. */
+    if (pWav->memoryStreamWrite.currentWritePos > pWav->memoryStreamWrite.dataSize) {
+        pWav->memoryStreamWrite.dataSize = pWav->memoryStreamWrite.currentWritePos;
+    }
+
+    *pWav->memoryStreamWrite.pDataSize = pWav->memoryStreamWrite.dataSize;
+
+    return bytesToWrite;
+}
+
+/*
+Seek callback for in-memory write streams. pUserData is the drwav object whose memoryStreamWrite is repositioned.
+
+The offset is applied relative to the start, the current write position or the end of the written data
+depending on origin. Returns DRWAV_FALSE, leaving the position untouched, for an unknown origin or when the
+target falls outside the range [0, dataSize]. Returns DRWAV_TRUE otherwise.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav__on_seek_memory_write(void* pUserData, int offset, drwav_seek_origin origin)
+{
+    drwav* pWav = (drwav*)pUserData;
+    drwav_int64 base;
+    drwav_int64 target;
+
+    DRWAV_ASSERT(pWav != NULL);
+
+    /* Work out the position the offset is relative to. */
+    if (origin == DRWAV_SEEK_SET) {
+        base = 0;
+    } else if (origin == DRWAV_SEEK_CUR) {
+        base = (drwav_int64)pWav->memoryStreamWrite.currentWritePos;
+    } else if (origin == DRWAV_SEEK_END) {
+        base = (drwav_int64)pWav->memoryStreamWrite.dataSize;
+    } else {
+        DRWAV_ASSERT(!"Invalid seek origin");
+        return DRWAV_FALSE;
+    }
+
+    target = base + offset;
+
+    /* Reject seeks to before the start of the buffer or beyond the end of the written data. */
+    if (target < 0 || (size_t)target > pWav->memoryStreamWrite.dataSize) {
+        return DRWAV_FALSE;
+    }
+
+    pWav->memoryStreamWrite.currentWritePos = (size_t)target;
+
+    return DRWAV_TRUE;
+}
+
+/* Tell callback for in-memory read streams. Stores memoryStream.currentReadPos in *pCursor and always returns DRWAV_TRUE. */
+DRWAV_PRIVATE drwav_bool32 drwav__on_tell_memory(void* pUserData, drwav_int64* pCursor)
+{
+    drwav* pWav = (drwav*)pUserData;
+    drwav_int64 readPos;
+
+    DRWAV_ASSERT(pWav != NULL);
+    DRWAV_ASSERT(pCursor != NULL);
+
+    readPos  = (drwav_int64)pWav->memoryStream.currentReadPos;
+    *pCursor = readPos;
+
+    return DRWAV_TRUE;
+}
+
+/* Convenience wrapper over drwav_init_memory_ex() that passes no chunk callback, no chunk user data and no flags. */
+DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_uint32 noFlags = 0;
+
+    return drwav_init_memory_ex(pWav, data, dataSize, NULL, NULL, noFlags, pAllocationCallbacks);
+}
+
+/*
+Initializes pWav for reading from a caller-supplied memory block, forwarding the chunk callback, its user
+data and the flags to drwav_init__internal().
+
+Returns DRWAV_FALSE when data is NULL, dataSize is 0 or drwav_preinit() fails. The pointer is stored as-is on
+pWav->memoryStream, and the read position starts at 0.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    /* An empty or missing buffer cannot be read from. */
+    if (data == NULL) {
+        return DRWAV_FALSE;
+    }
+    if (dataSize == 0) {
+        return DRWAV_FALSE;
+    }
+
+    /* The drwav object itself is the user data for the memory callbacks. */
+    if (!drwav_preinit(pWav, drwav__on_read_memory, drwav__on_seek_memory, drwav__on_tell_memory, pWav, pAllocationCallbacks)) {
+        return DRWAV_FALSE;
+    }
+
+    pWav->memoryStream.currentReadPos = 0;
+    pWav->memoryStream.dataSize = dataSize;
+    pWav->memoryStream.data = (const drwav_uint8*)data;
+
+    return drwav_init__internal(pWav, onChunk, pChunkUserData, flags);
+}
+
+/*
+Initializes pWav for reading from a caller-supplied memory block with DRWAV_WITH_METADATA added to the
+supplied flags. No chunk callback is used.
+
+Returns DRWAV_FALSE when data is NULL, dataSize is 0 or drwav_preinit() fails. The pointer is stored as-is on
+pWav->memoryStream, and the read position starts at 0.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory_with_metadata(drwav* pWav, const void* data, size_t dataSize, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    /* An empty or missing buffer cannot be read from. */
+    if (data == NULL) {
+        return DRWAV_FALSE;
+    }
+    if (dataSize == 0) {
+        return DRWAV_FALSE;
+    }
+
+    /* The drwav object itself is the user data for the memory callbacks. */
+    if (!drwav_preinit(pWav, drwav__on_read_memory, drwav__on_seek_memory, drwav__on_tell_memory, pWav, pAllocationCallbacks)) {
+        return DRWAV_FALSE;
+    }
+
+    pWav->memoryStream.currentReadPos = 0;
+    pWav->memoryStream.dataSize = dataSize;
+    pWav->memoryStream.data = (const drwav_uint8*)data;
+
+    return drwav_init__internal(pWav, NULL, NULL, flags | DRWAV_WITH_METADATA);
+}
+
+
+/*
+Initializes pWav for writing into a growable memory buffer.
+
+ppData    - Receives the buffer pointer. Reset to NULL here because the write callback grows it with realloc.
+pDataSize - Receives the number of bytes written. Reset to 0 here.
+
+Returns DRWAV_FALSE when ppData or pDataSize is NULL or drwav_preinit_write() fails. Otherwise returns the
+result of drwav_init_write__internal().
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_init_memory_write__internal(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    /* Both output locations are required. */
+    if (ppData == NULL) {
+        return DRWAV_FALSE;
+    }
+    if (pDataSize == NULL) {
+        return DRWAV_FALSE;
+    }
+
+    *pDataSize = 0;
+    *ppData = NULL; /* Important because we're using realloc()! */
+
+    /* The drwav object itself is the user data for the memory callbacks. */
+    if (!drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_memory, drwav__on_seek_memory_write, pWav, pAllocationCallbacks)) {
+        return DRWAV_FALSE;
+    }
+
+    /* Start with an empty stream. The first write allocates the buffer. */
+    pWav->memoryStreamWrite.currentWritePos = 0;
+    pWav->memoryStreamWrite.dataCapacity = 0;
+    pWav->memoryStreamWrite.dataSize = 0;
+    pWav->memoryStreamWrite.pDataSize = pDataSize;
+    pWav->memoryStreamWrite.ppData = ppData;
+
+    return drwav_init_write__internal(pWav, pFormat, totalSampleCount);
+}
+
+/*
+Initializes pWav for non-sequential writing into a memory buffer. The buffer pointer and its size are reported through
+ppData and pDataSize. No total sample count is supplied up front.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_uint64 totalSampleCount = 0;            /* Unknown up front in non-sequential mode. */
+    const drwav_bool32 isSequential     = DRWAV_FALSE;
+
+    return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
+}
+
+/*
+Initializes pWav for sequential writing into a memory buffer. totalSampleCount is the number of samples the caller
+intends to write and is forwarded to the shared implementation together with the sequential flag.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    const drwav_bool32 isSequential = DRWAV_TRUE;
+
+    return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
+}
+
+/*
+Same as drwav_init_memory_write_sequential(), except the expected length is given in PCM frames. The frame count is
+multiplied by the channel count from pFormat to get a sample count, which is why pFormat must not be NULL.
+*/
+DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav_uint64 totalSampleCount;
+
+    if (pFormat == NULL) {
+        return DRWAV_FALSE; /* The channel count is needed for the conversion below. */
+    }
+
+    totalSampleCount = totalPCMFrameCount * pFormat->channels;
+
+    return drwav_init_memory_write_sequential(pWav, ppData, pDataSize, pFormat, totalSampleCount, pAllocationCallbacks);
+}
+
+
+
+/*
+Uninitializes a drwav object.
+
+Objects opened for writing (onWrite is set) have their output finalized: the "data" chunk is padded, and in
+non-sequential mode the chunk sizes in the header are patched now that the final data size is known. In sequential
+mode the only check is that the number of bytes written matches the number promised at initialization time; a mismatch
+is reported as DRWAV_INVALID_FILE.
+
+Objects opened for reading have their metadata allocation released.
+
+When stdio support is compiled in, a FILE handle opened through the stdio callbacks is closed.
+
+Returns DRWAV_INVALID_ARGS when pWav is NULL, otherwise DRWAV_SUCCESS or DRWAV_INVALID_FILE as described above.
+*/
+DRWAV_API drwav_result drwav_uninit(drwav* pWav)
+{
+    drwav_result result = DRWAV_SUCCESS;
+
+    if (pWav == NULL) {
+        return DRWAV_INVALID_ARGS;
+    }
+
+    if (pWav->onWrite == NULL) {
+        /* Read mode. The only thing to clean up is the metadata. */
+        drwav_free(pWav->pMetadata, &pWav->allocationCallbacks);
+    } else {
+        /*
+        Write mode. A few things need to be finalized:
+          - Make sure the "data" chunk is aligned to 16-bits for RIFF containers, or 64 bits for W64 containers.
+          - Set the size of the "data" chunk.
+        */
+        drwav_uint32 paddingSize = 0;
+
+        /* Padding. Do not adjust pWav->dataChunkDataSize - this should not include the padding. */
+        if (pWav->container != drwav_container_riff && pWav->container != drwav_container_rf64) {
+            paddingSize = drwav__chunk_padding_size_w64(pWav->dataChunkDataSize);
+        } else {
+            paddingSize = drwav__chunk_padding_size_riff(pWav->dataChunkDataSize);
+        }
+
+        if (paddingSize > 0) {
+            drwav_uint64 zeroes = 0;
+            drwav__write(pWav, &zeroes, paddingSize);   /* Byte order does not matter for this. */
+        }
+
+        /*
+        Chunk sizes. Sequential mode filled these in at initialization time, so they only need patching in
+        non-sequential mode, and only when seeking is possible.
+        */
+        if (pWav->onSeek && !pWav->isSequentialWrite) {
+            switch (pWav->container)
+            {
+                case drwav_container_riff:
+                {
+                    /* The "RIFF" chunk size. */
+                    if (pWav->onSeek(pWav->pUserData, 4, DRWAV_SEEK_SET)) {
+                        drwav_uint32 riffChunkSize = drwav__riff_chunk_size_riff(pWav->dataChunkDataSize, pWav->pMetadata, pWav->metadataCount);
+                        drwav__write_u32ne_to_le(pWav, riffChunkSize);
+                    }
+
+                    /* The "data" chunk size. */
+                    if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos - 4, DRWAV_SEEK_SET)) {
+                        drwav_uint32 dataChunkSize = drwav__data_chunk_size_riff(pWav->dataChunkDataSize);
+                        drwav__write_u32ne_to_le(pWav, dataChunkSize);
+                    }
+                } break;
+
+                case drwav_container_w64:
+                {
+                    /* The "RIFF" chunk size. */
+                    if (pWav->onSeek(pWav->pUserData, 16, DRWAV_SEEK_SET)) {
+                        drwav_uint64 riffChunkSize = drwav__riff_chunk_size_w64(pWav->dataChunkDataSize);
+                        drwav__write_u64ne_to_le(pWav, riffChunkSize);
+                    }
+
+                    /* The "data" chunk size. */
+                    if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos - 8, DRWAV_SEEK_SET)) {
+                        drwav_uint64 dataChunkSize = drwav__data_chunk_size_w64(pWav->dataChunkDataSize);
+                        drwav__write_u64ne_to_le(pWav, dataChunkSize);
+                    }
+                } break;
+
+                case drwav_container_rf64:
+                {
+                    /* We only need to update the ds64 chunk. The "RIFF" and "data" chunks always have their sizes set to 0xFFFFFFFF for RF64. */
+                    int ds64BodyPos = 12 + 8;
+
+                    /* The "RIFF" chunk size. */
+                    if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 0, DRWAV_SEEK_SET)) {
+                        drwav_uint64 riffChunkSize = drwav__riff_chunk_size_rf64(pWav->dataChunkDataSize, pWav->pMetadata, pWav->metadataCount);
+                        drwav__write_u64ne_to_le(pWav, riffChunkSize);
+                    }
+
+                    /* The "data" chunk size. */
+                    if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 8, DRWAV_SEEK_SET)) {
+                        drwav_uint64 dataChunkSize = drwav__data_chunk_size_rf64(pWav->dataChunkDataSize);
+                        drwav__write_u64ne_to_le(pWav, dataChunkSize);
+                    }
+                } break;
+
+                default:
+                {
+                    /* No header patching for any other container. */
+                } break;
+            }
+        }
+
+        /* Validation for sequential mode: exactly the promised amount of data must have been written. */
+        if (pWav->isSequentialWrite && pWav->dataChunkDataSize != pWav->dataChunkDataSizeTargetWrite) {
+            result = DRWAV_INVALID_FILE;
+        }
+    }
+
+#ifndef DR_WAV_NO_STDIO
+    /*
+    If the stdio read or write callback is installed, pUserData is a FILE handle and is closed here.
+    */
+    if (pWav->onRead == drwav__on_read_stdio || pWav->onWrite == drwav__on_write_stdio) {
+        fclose((FILE*)pWav->pUserData);
+    }
+#endif
+
+    return result;
+}
+
+
+
+/*
+Reads raw bytes from the data chunk, or skips over them when pBufferOut is NULL.
+
+The request is clamped to the number of bytes remaining. When skipping, seeking is tried first; if a seek fails the
+remainder is read into a scratch buffer and thrown away so that the returned byte count is accurate.
+
+The read cursor advances by the number of whole frames consumed and bytesRemaining shrinks by the number of bytes
+consumed. Returns the number of bytes consumed, which is 0 on invalid arguments, at the end of the data, or when the
+frame size cannot be determined.
+*/
+DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut)
+{
+    drwav_uint32 frameSizeInBytes;
+    size_t bytesConsumed = 0;
+
+    if (pWav == NULL || bytesToRead == 0) {
+        return 0;   /* Invalid args. */
+    }
+
+    /* Never run past the end of the data chunk. */
+    if (pWav->bytesRemaining < bytesToRead) {
+        bytesToRead = (size_t)pWav->bytesRemaining;
+        if (bytesToRead == 0) {
+            return 0;   /* At end. */
+        }
+    }
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;   /* Could not determine the bytes per frame. */
+    }
+
+    if (pBufferOut == NULL) {
+        /* Skip by seeking, in steps small enough for the int offset the seek callback takes. */
+        while (bytesConsumed < bytesToRead) {
+            size_t seekStep = bytesToRead - bytesConsumed;
+            if (seekStep > 0x7FFFFFFF) {
+                seekStep = 0x7FFFFFFF;
+            }
+
+            if (pWav->onSeek(pWav->pUserData, (int)seekStep, DRWAV_SEEK_CUR) == DRWAV_FALSE) {
+                break;
+            }
+
+            bytesConsumed += seekStep;
+        }
+
+        /* If seeking stopped early, fall back to reading and discarding whatever is left. */
+        while (bytesConsumed < bytesToRead) {
+            drwav_uint8 scratch[4096];
+            size_t bytesDiscarded;
+            size_t discardStep = bytesToRead - bytesConsumed;
+            if (discardStep > sizeof(scratch)) {
+                discardStep = sizeof(scratch);
+            }
+
+            bytesDiscarded = pWav->onRead(pWav->pUserData, scratch, discardStep);
+            bytesConsumed += bytesDiscarded;
+
+            if (bytesDiscarded < discardStep) {
+                break;  /* Reached the end. */
+            }
+        }
+    } else {
+        bytesConsumed = pWav->onRead(pWav->pUserData, pBufferOut, bytesToRead);
+    }
+
+    pWav->readCursorInPCMFrames += bytesConsumed / frameSizeInBytes;
+    pWav->bytesRemaining        -= bytesConsumed;
+
+    return bytesConsumed;
+}
+
+
+
+/*
+Reads up to framesToRead PCM frames straight from the data chunk without any byte swapping.
+
+Compressed formats are rejected. The request is clamped to the frames remaining in the file and to what fits in a
+size_t. The byte-level work is delegated to drwav_read_raw(), which receives pBufferOut as-is.
+
+Returns the number of whole frames read, or 0 on invalid arguments or when there is nothing to read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
+{
+    drwav_uint64 framesAvailable;
+    drwav_uint64 byteCount;     /* Deliberately 64-bit rather than size_t so that oversized requests can be detected on 32-bit builds. */
+    drwav_uint32 frameSizeInBytes;
+
+    /* Invalid arguments, or a compressed format which this function cannot handle. */
+    if (pWav == NULL || framesToRead == 0 || drwav__is_compressed_format_tag(pWav->translatedFormatTag)) {
+        return 0;
+    }
+
+    framesAvailable = pWav->totalPCMFrameCount - pWav->readCursorInPCMFrames;
+    if (framesAvailable < framesToRead) {
+        framesToRead = framesAvailable;
+    }
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;
+    }
+
+    /* Don't try to read more than a size_t can express. If the request is too big, trim it to a clean frame boundary. */
+    byteCount = framesToRead * frameSizeInBytes;
+    if (byteCount > DRWAV_SIZE_MAX) {
+        byteCount = (DRWAV_SIZE_MAX / frameSizeInBytes) * frameSizeInBytes;
+    }
+
+    /*
+    Explicit check so it is obvious nothing is attempted when there are no bytes to read. The count *could* come out
+    as 0 due to overflow.
+    */
+    if (byteCount == 0) {
+        return 0;
+    }
+
+    return drwav_read_raw(pWav, (size_t)byteCount, pBufferOut) / frameSizeInBytes;
+}
+
+/*
+Reads up to framesToRead PCM frames and byte-swaps every sample that was read.
+
+The frames are first read unswapped through drwav_read_pcm_frames_le(). Swapping is only attempted when there is an
+output buffer and at least one frame was actually read. This keeps pWav from being dereferenced when it is NULL (the
+underlying read returns 0 in that case) and avoids pointless work at the end of the stream.
+
+Returns the number of frames read, or 0 if the sample size cannot be determined.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
+
+    /* framesRead > 0 implies pWav is not NULL, so it is safe to dereference inside this branch. */
+    if (pBufferOut != NULL && framesRead > 0) {
+        drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
+        if (bytesPerFrame == 0) {
+            return 0;   /* Could not get the bytes per frame which means bytes per sample cannot be determined and we don't know how to byte swap. */
+        }
+
+        drwav__bswap_samples(pBufferOut, framesRead*pWav->channels, bytesPerFrame/pWav->channels);
+    }
+
+    return framesRead;
+}
+
+/*
+Reads up to framesToRead PCM frames and delivers them in the byte order of the host.
+
+The byte order of the stored data is worked out from the container, and the swapping or non-swapping reader is chosen
+so that a swap only happens when the stored order differs from the host's. For AIFF streams holding signed 8-bit
+samples, the output is then shifted by 128 because dr_wav always treats 8-bit samples as unsigned.
+
+Returns the number of frames read. A NULL pWav yields 0.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
+{
+    drwav_uint64 framesRead = 0;
+    drwav_bool32 isDataBigEndian = DRWAV_FALSE;
+
+    /* The container is inspected below, so a NULL object has to be rejected up front. */
+    if (pWav == NULL) {
+        return 0;
+    }
+
+    if (drwav_is_container_be(pWav->container)) {
+        /*
+        Special case for AIFF. AIFF is a big-endian encoded format, but it supports a format that is
+        PCM in little-endian encoding. In this case the data is treated as little-endian.
+        */
+        if (pWav->container != drwav_container_aiff || pWav->aiff.isLE == DRWAV_FALSE) {
+            isDataBigEndian = DRWAV_TRUE;
+        }
+    }
+
+    /* The "_le" reader returns the bytes untouched, the "_be" reader swaps them. Swap only when the orders differ. */
+    if (drwav__is_little_endian()) {
+        if (isDataBigEndian) {
+            framesRead = drwav_read_pcm_frames_be(pWav, framesToRead, pBufferOut);
+        } else {
+            framesRead = drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
+        }
+    } else {
+        if (isDataBigEndian) {
+            framesRead = drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
+        } else {
+            framesRead = drwav_read_pcm_frames_be(pWav, framesToRead, pBufferOut);
+        }
+    }
+
+    /*
+    Here is where we check if we need to do a signed/unsigned conversion for AIFF. The reason we need to do this
+    is because dr_wav always assumes an 8-bit sample is unsigned, whereas AIFF can have signed 8-bit formats.
+    */
+    if (pWav->container == drwav_container_aiff && pWav->bitsPerSample == 8 && pWav->aiff.isUnsigned == DRWAV_FALSE) {
+        if (pBufferOut != NULL) {
+            drwav_uint64 iSample;
+
+            for (iSample = 0; iSample < framesRead * pWav->channels; iSample += 1) {
+                ((drwav_uint8*)pBufferOut)[iSample] += 128;
+            }
+        }
+    }
+
+    return framesRead;
+}
+
+
+
+/*
+Rewinds a decoder to the first PCM frame of the data chunk.
+
+The stream is seeked to the start of the data chunk. For compressed formats the decoder's cached state is wiped, and
+finally the read cursor and the remaining byte count are reset. Fails in write mode or if the seek callback fails.
+*/
+DRWAV_PRIVATE drwav_bool32 drwav_seek_to_first_pcm_frame(drwav* pWav)
+{
+    drwav_bool32 isWriteMode = (pWav->onWrite != NULL);
+
+    if (isWriteMode) {
+        return DRWAV_FALSE; /* No seeking in write mode. */
+    }
+
+    if (!pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos, DRWAV_SEEK_SET)) {
+        return DRWAV_FALSE;
+    }
+
+    /* Cached data needs to be cleared for compressed formats. */
+    if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) {
+        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+            DRWAV_ZERO_OBJECT(&pWav->msadpcm);
+        } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+            DRWAV_ZERO_OBJECT(&pWav->ima);
+        } else {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Getting here means a new compressed format was added without a matching branch in this function. */
+        }
+    }
+
+    /* Back at the beginning: the whole data chunk is still ahead of us. */
+    pWav->bytesRemaining        = pWav->dataChunkDataSize;
+    pWav->readCursorInPCMFrames = 0;
+
+    return DRWAV_TRUE;
+}
+
+/*
+Moves the read cursor to the given PCM frame.
+
+The target is clamped to the total frame count. Uncompressed data is seeked directly, in steps that fit in the int
+offset the seek callback takes. Compressed data is seeked generically by decoding and discarding frames until the
+target is reached, rewinding to the start first when the target lies behind the cursor.
+
+Returns DRWAV_TRUE on success (including the case of a stream with no frames), DRWAV_FALSE on invalid arguments, in
+write mode, or when a seek or decode step fails.
+*/
+DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex)
+{
+    /* Seeking should be compatible with wave files > 2GB. */
+
+    /* A seek callback is required, and there is no seeking in write mode. */
+    if (pWav == NULL || pWav->onSeek == NULL || pWav->onWrite != NULL) {
+        return DRWAV_FALSE;
+    }
+
+    /* If there are no samples, just return DRWAV_TRUE without doing anything. */
+    if (pWav->totalPCMFrameCount == 0) {
+        return DRWAV_TRUE;
+    }
+
+    /* Make sure the target is clamped. */
+    if (pWav->totalPCMFrameCount < targetFrameIndex) {
+        targetFrameIndex = pWav->totalPCMFrameCount;
+    }
+
+    if (!drwav__is_compressed_format_tag(pWav->translatedFormatTag)) {
+        drwav_uint64 currentBytePos;
+        drwav_uint64 targetBytePos;
+        drwav_uint64 bytesLeftToSkip;
+        drwav_uint32 bytesPerFrame;
+
+        bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
+        if (bytesPerFrame == 0) {
+            return DRWAV_FALSE; /* Not able to calculate offset. */
+        }
+
+        /* Note that bytesRemaining is not asserted to be no larger than the total size in bytes. */
+        currentBytePos = (pWav->totalPCMFrameCount * bytesPerFrame) - pWav->bytesRemaining;
+        targetBytePos  = targetFrameIndex * bytesPerFrame;
+
+        if (targetBytePos > currentBytePos) {
+            /* Offset forwards. */
+            bytesLeftToSkip = targetBytePos - currentBytePos;
+        } else {
+            /* Offset backwards: go back to the start and move forward from there. */
+            if (!drwav_seek_to_first_pcm_frame(pWav)) {
+                return DRWAV_FALSE;
+            }
+            bytesLeftToSkip = targetBytePos;
+        }
+
+        while (bytesLeftToSkip > 0) {
+            int seekStep = ((bytesLeftToSkip > INT_MAX) ? INT_MAX : (int)bytesLeftToSkip);
+            if (!pWav->onSeek(pWav->pUserData, seekStep, DRWAV_SEEK_CUR)) {
+                return DRWAV_FALSE;
+            }
+
+            pWav->readCursorInPCMFrames += seekStep / bytesPerFrame;
+            pWav->bytesRemaining        -= seekStep;
+            bytesLeftToSkip             -= seekStep;
+        }
+    } else {
+        /*
+        For compressed formats we just use a slow generic seek. Seeking forward means reading and discarding frames
+        until the target is hit. Seeking backwards means first going back to the start and then doing the same thing
+        as a forward seek.
+
+        TODO: This can be optimized.
+        */
+        if (pWav->readCursorInPCMFrames > targetFrameIndex) {
+            if (!drwav_seek_to_first_pcm_frame(pWav)) {
+                return DRWAV_FALSE;
+            }
+        }
+
+        if (pWav->readCursorInPCMFrames < targetFrameIndex) {
+            drwav_int16 discarded[2048];
+            drwav_uint64 framesLeftToSkip = targetFrameIndex - pWav->readCursorInPCMFrames;
+
+            while (framesLeftToSkip > 0) {
+                drwav_uint64 framesDecoded = 0;
+                drwav_uint64 framesThisPass = framesLeftToSkip;
+
+                /* No more than the scratch buffer can hold. */
+                if (framesThisPass > drwav_countof(discarded)/pWav->channels) {
+                    framesThisPass = drwav_countof(discarded)/pWav->channels;
+                }
+
+                if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+                    framesDecoded = drwav_read_pcm_frames_s16__msadpcm(pWav, framesThisPass, discarded);
+                } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+                    framesDecoded = drwav_read_pcm_frames_s16__ima(pWav, framesThisPass, discarded);
+                } else {
+                    DRWAV_ASSERT(DRWAV_FALSE);  /* Getting here means a new compressed format was added without a matching branch in this function. */
+                }
+
+                if (framesDecoded != framesThisPass) {
+                    return DRWAV_FALSE;
+                }
+
+                framesLeftToSkip -= framesDecoded;
+            }
+        }
+    }
+
+    return DRWAV_TRUE;
+}
+
+/*
+Retrieves the current read position, in PCM frames.
+
+*pCursor is always written when pCursor is not NULL: it is zeroed first and then, if pWav is valid, set to the read
+cursor. Returns DRWAV_INVALID_ARGS if either pointer is NULL, DRWAV_SUCCESS otherwise.
+*/
+DRWAV_API drwav_result drwav_get_cursor_in_pcm_frames(drwav* pWav, drwav_uint64* pCursor)
+{
+    drwav_result result = DRWAV_INVALID_ARGS;
+
+    if (pCursor != NULL) {
+        *pCursor = 0;   /* Safety. */
+
+        if (pWav != NULL) {
+            *pCursor = pWav->readCursorInPCMFrames;
+            result = DRWAV_SUCCESS;
+        }
+    }
+
+    return result;
+}
+
+/*
+Retrieves the total length of the stream, in PCM frames.
+
+*pLength is always written when pLength is not NULL: it is zeroed first and then, if pWav is valid, set to the total
+frame count. Returns DRWAV_INVALID_ARGS if either pointer is NULL, DRWAV_SUCCESS otherwise.
+*/
+DRWAV_API drwav_result drwav_get_length_in_pcm_frames(drwav* pWav, drwav_uint64* pLength)
+{
+    drwav_result result = DRWAV_INVALID_ARGS;
+
+    if (pLength != NULL) {
+        *pLength = 0;   /* Safety. */
+
+        if (pWav != NULL) {
+            *pLength = pWav->totalPCMFrameCount;
+            result = DRWAV_SUCCESS;
+        }
+    }
+
+    return result;
+}
+
+
+/*
+Writes raw bytes to the data chunk through the write callback.
+
+The size of the data chunk grows by however many bytes the callback reports as written.
+
+Returns the number of bytes written. 0 is returned for invalid arguments, and also when the object has no write
+callback (for example because it was opened for reading) rather than calling through a NULL function pointer.
+*/
+DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData)
+{
+    size_t bytesWritten;
+
+    if (pWav == NULL || bytesToWrite == 0 || pData == NULL) {
+        return 0;
+    }
+
+    /* An object that was not opened for writing has nothing to write with. */
+    if (pWav->onWrite == NULL) {
+        return 0;
+    }
+
+    bytesWritten = pWav->onWrite(pWav->pUserData, pData, bytesToWrite);
+    pWav->dataChunkDataSize += bytesWritten;
+
+    return bytesWritten;
+}
+
+/*
+Writes PCM frames to the data chunk exactly as they are laid out in pData, with no byte swapping.
+
+The byte count is derived from the frame count, the channel count and the bits per sample. Requests larger than a
+size_t can express are rejected. Writing continues until everything is written or the underlying write reports that
+nothing more could be written.
+
+Returns the number of frames written.
+*/
+DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
+{
+    drwav_uint64 bytesLeft;
+    drwav_uint64 totalBytesWritten = 0;
+    const drwav_uint8* pSrc = (const drwav_uint8*)pData;
+
+    if (pWav == NULL || framesToWrite == 0 || pData == NULL) {
+        return 0;
+    }
+
+    bytesLeft = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8);
+    if (bytesLeft > DRWAV_SIZE_MAX) {
+        return 0;
+    }
+
+    while (bytesLeft > 0) {
+        size_t bytesThisPass;
+        drwav_uint64 bytesRequested = bytesLeft;
+
+        DRWAV_ASSERT(bytesRequested <= DRWAV_SIZE_MAX); /* <-- Guaranteed by the check above. */
+
+        bytesThisPass = drwav_write_raw(pWav, (size_t)bytesRequested, pSrc);
+        if (bytesThisPass == 0) {
+            break;  /* The output refuses to take any more data. */
+        }
+
+        pSrc              += bytesThisPass;
+        totalBytesWritten += bytesThisPass;
+        bytesLeft         -= bytesThisPass;
+    }
+
+    return (totalBytesWritten * 8) / pWav->bitsPerSample / pWav->channels;
+}
+
+/*
+Writes PCM frames to the data chunk, byte-swapping every sample on the way out.
+
+Because the input buffer is read-only, the data is copied through a fixed-size intermediary buffer one chunk at a
+time, swapped there, and then written. Only the samples that were actually copied into the intermediary buffer are
+swapped, so no uninitialized bytes are processed. If the byte count is not a whole number of samples, the trailing
+partial sample is written as-is.
+
+Returns the number of frames written, or 0 on invalid arguments, when the request is larger than a size_t can express,
+or when the sample size cannot be determined.
+*/
+DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
+{
+    drwav_uint64 bytesToWrite;
+    drwav_uint64 bytesWritten;
+    drwav_uint32 bytesPerSample;
+    drwav_uint32 maxSamplesPerIteration;
+    const drwav_uint8* pRunningData;
+    drwav_uint8 temp[4096];
+
+    if (pWav == NULL || framesToWrite == 0 || pData == NULL) {
+        return 0;
+    }
+
+    bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8);
+    if (bytesToWrite > DRWAV_SIZE_MAX) {
+        return 0;
+    }
+
+    bytesWritten = 0;
+    pRunningData = (const drwav_uint8*)pData;
+
+    bytesPerSample = drwav_get_bytes_per_pcm_frame(pWav) / pWav->channels;
+    if (bytesPerSample == 0) {
+        return 0;   /* Cannot determine bytes per sample, or bytes per sample is less than one byte. */
+    }
+
+    /* How many whole samples fit in the intermediary buffer. This does not change between iterations. */
+    maxSamplesPerIteration = sizeof(temp)/bytesPerSample;
+
+    while (bytesToWrite > 0) {
+        size_t bytesJustWritten;
+        drwav_uint64 bytesToWriteThisIteration;
+
+        bytesToWriteThisIteration = bytesToWrite;
+        DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX);  /* <-- This is checked above. */
+
+        /*
+        WAV files are always little-endian. We need to byte swap on big-endian architectures. Since our input buffer is read-only we need
+        to use an intermediary buffer for the conversion.
+        */
+        if (bytesToWriteThisIteration > ((drwav_uint64)maxSamplesPerIteration)*bytesPerSample) {
+            bytesToWriteThisIteration = ((drwav_uint64)maxSamplesPerIteration)*bytesPerSample;
+        }
+
+        DRWAV_COPY_MEMORY(temp, pRunningData, (size_t)bytesToWriteThisIteration);
+
+        /* Swap only what was copied in. The final chunk is usually smaller than the buffer. */
+        drwav__bswap_samples(temp, bytesToWriteThisIteration / bytesPerSample, bytesPerSample);
+
+        bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, temp);
+        if (bytesJustWritten == 0) {
+            break;
+        }
+
+        bytesToWrite -= bytesJustWritten;
+        bytesWritten += bytesJustWritten;
+        pRunningData += bytesJustWritten;
+    }
+
+    return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels;
+}
+
+/*
+Writes PCM frames given in the byte order of the host. On a little-endian host the data is written untouched; on any
+other host it goes through the byte-swapping writer. Returns the number of frames written.
+*/
+DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
+{
+    if (!drwav__is_little_endian()) {
+        return drwav_write_pcm_frames_be(pWav, framesToWrite, pData);
+    }
+
+    return drwav_write_pcm_frames_le(pWav, framesToWrite, pData);
+}
+
+
+/*
+Runs one step of the MS-ADPCM predictor for a single channel.
+
+nibble is the sign-extended 4-bit code (-8..7). coeff1 and coeff2 are the coefficients selected by the channel's
+predictor, and adaptation is the adaptation table entry selected by the raw (unsigned) 4-bit code. The channel's delta
+and its two previous samples are updated in place.
+
+Returns the decoded sample, clamped to the signed 16-bit range.
+*/
+DRWAV_PRIVATE drwav_int32 drwav__msadpcm_decode_sample(drwav* pWav, drwav_uint32 iChannel, drwav_int32 nibble, drwav_int32 coeff1, drwav_int32 coeff2, drwav_int32 adaptation)
+{
+    drwav_int32 newSample;
+
+    newSample  = ((pWav->msadpcm.prevFrames[iChannel][1] * coeff1) + (pWav->msadpcm.prevFrames[iChannel][0] * coeff2)) >> 8;
+    newSample += nibble * pWav->msadpcm.delta[iChannel];
+    newSample  = drwav_clamp(newSample, -32768, 32767);
+
+    pWav->msadpcm.delta[iChannel] = (adaptation * pWav->msadpcm.delta[iChannel]) >> 8;
+    if (pWav->msadpcm.delta[iChannel] < 16) {
+        pWav->msadpcm.delta[iChannel] = 16; /* The delta is never allowed to drop below 16. */
+    }
+
+    /* Slide the history along by one sample. */
+    pWav->msadpcm.prevFrames[iChannel][0] = pWav->msadpcm.prevFrames[iChannel][1];
+    pWav->msadpcm.prevFrames[iChannel][1] = newSample;
+
+    return newSample;
+}
+
+/*
+Decodes up to framesToRead PCM frames of MS-ADPCM data into signed 16-bit samples.
+
+pBufferOut may be NULL, in which case the frames are decoded and discarded. Decoding works block by block: a block
+header seeds the predictor state and yields the first two frames, after which every byte of the block carries two
+4-bit codes (two frames in mono, one frame in stereo).
+
+Returns the number of frames decoded. This is less than framesToRead when the end of the stream is reached, when a
+read fails, or when the data is invalid (block size smaller than the block header, or predictor index out of range).
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint64 totalFramesRead = 0;
+
+    static const drwav_int32 adaptationTable[] = {
+        230, 230, 230, 230, 307, 409, 512, 614,
+        768, 614, 512, 409, 307, 230, 230, 230
+    };
+    static const drwav_int32 coeff1Table[] = { 256, 512, 0, 192, 240, 460,  392 };
+    static const drwav_int32 coeff2Table[] = { 0,  -256, 0, 64,  0,  -208, -232 };
+
+    DRWAV_ASSERT(pWav != NULL);
+    DRWAV_ASSERT(framesToRead > 0);
+
+    /* TODO: Lots of room for optimization here. */
+
+    while (pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
+        DRWAV_ASSERT(framesToRead > 0); /* This loop iteration will never get hit with framesToRead == 0 because it's asserted at the top, and we check for 0 inside the loop just below. */
+
+        /* If there are no cached frames we need to load a new block. */
+        if (pWav->msadpcm.cachedFrameCount == 0 && pWav->msadpcm.bytesRemainingInBlock == 0) {
+            if (pWav->channels == 1) {
+                /* Mono. */
+                drwav_uint8 header[7];
+
+                /* A block must at least hold its own header, otherwise the subtraction below would wrap around. */
+                if (pWav->fmt.blockAlign < sizeof(header)) {
+                    return totalFramesRead; /* Invalid file. */
+                }
+
+                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
+                    return totalFramesRead;
+                }
+                pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
+
+                pWav->msadpcm.predictor[0]     = header[0];
+                pWav->msadpcm.delta[0]         = drwav_bytes_to_s16(header + 1);
+                pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav_bytes_to_s16(header + 3);
+                pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav_bytes_to_s16(header + 5);
+                pWav->msadpcm.cachedFrames[2]  = pWav->msadpcm.prevFrames[0][0];
+                pWav->msadpcm.cachedFrames[3]  = pWav->msadpcm.prevFrames[0][1];
+                pWav->msadpcm.cachedFrameCount = 2;
+
+                /* The predictor is used as an index into coeff1Table so we'll need to validate to ensure it never overflows. */
+                if (pWav->msadpcm.predictor[0] >= drwav_countof(coeff1Table) || pWav->msadpcm.predictor[0] >= drwav_countof(coeff2Table)) {
+                    return totalFramesRead; /* Invalid file. */
+                }
+            } else {
+                /* Stereo. */
+                drwav_uint8 header[14];
+
+                /* A block must at least hold its own header, otherwise the subtraction below would wrap around. */
+                if (pWav->fmt.blockAlign < sizeof(header)) {
+                    return totalFramesRead; /* Invalid file. */
+                }
+
+                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
+                    return totalFramesRead;
+                }
+                pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
+
+                pWav->msadpcm.predictor[0] = header[0];
+                pWav->msadpcm.predictor[1] = header[1];
+                pWav->msadpcm.delta[0] = drwav_bytes_to_s16(header + 2);
+                pWav->msadpcm.delta[1] = drwav_bytes_to_s16(header + 4);
+                pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav_bytes_to_s16(header + 6);
+                pWav->msadpcm.prevFrames[1][1] = (drwav_int32)drwav_bytes_to_s16(header + 8);
+                pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav_bytes_to_s16(header + 10);
+                pWav->msadpcm.prevFrames[1][0] = (drwav_int32)drwav_bytes_to_s16(header + 12);
+
+                pWav->msadpcm.cachedFrames[0] = pWav->msadpcm.prevFrames[0][0];
+                pWav->msadpcm.cachedFrames[1] = pWav->msadpcm.prevFrames[1][0];
+                pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][1];
+                pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[1][1];
+                pWav->msadpcm.cachedFrameCount = 2;
+
+                /* The predictor is used as an index into coeff1Table so we'll need to validate to ensure it never overflows. */
+                if (pWav->msadpcm.predictor[0] >= drwav_countof(coeff1Table) || pWav->msadpcm.predictor[0] >= drwav_countof(coeff2Table) ||
+                    pWav->msadpcm.predictor[1] >= drwav_countof(coeff1Table) || pWav->msadpcm.predictor[1] >= drwav_countof(coeff2Table)) {
+                    return totalFramesRead; /* Invalid file. */
+                }
+            }
+        }
+
+        /* Output anything that's cached. Cached frames sit at the tail end of the cache array. */
+        while (framesToRead > 0 && pWav->msadpcm.cachedFrameCount > 0 && pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
+            if (pBufferOut != NULL) {
+                drwav_uint32 iSample = 0;
+                for (iSample = 0; iSample < pWav->channels; iSample += 1) {
+                    pBufferOut[iSample] = (drwav_int16)pWav->msadpcm.cachedFrames[(drwav_countof(pWav->msadpcm.cachedFrames) - (pWav->msadpcm.cachedFrameCount*pWav->channels)) + iSample];
+                }
+
+                pBufferOut += pWav->channels;
+            }
+
+            framesToRead    -= 1;
+            totalFramesRead += 1;
+            pWav->readCursorInPCMFrames += 1;
+            pWav->msadpcm.cachedFrameCount -= 1;
+        }
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+
+        /*
+        If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next
+        loop iteration which will trigger the loading of a new block.
+        */
+        if (pWav->msadpcm.cachedFrameCount == 0) {
+            if (pWav->msadpcm.bytesRemainingInBlock == 0) {
+                continue;
+            } else {
+                drwav_uint8 nibbles;
+                drwav_int32 nibble0;
+                drwav_int32 nibble1;
+                drwav_int32 newSample0;
+                drwav_int32 newSample1;
+                drwav_uint32 iChannel0;
+                drwav_uint32 iChannel1;
+
+                if (pWav->onRead(pWav->pUserData, &nibbles, 1) != 1) {
+                    return totalFramesRead;
+                }
+                pWav->msadpcm.bytesRemainingInBlock -= 1;
+
+                /* Split the byte into its two 4-bit codes (high nibble first) and sign-extend each to the range -8..7. */
+                nibble0 = (drwav_int32)((nibbles & 0xF0) >> 4); if (nibble0 >= 8) { nibble0 -= 16; }
+                nibble1 = (drwav_int32)((nibbles & 0x0F) >> 0); if (nibble1 >= 8) { nibble1 -= 16; }
+
+                /*
+                Mono: both codes belong to channel 0 and produce two consecutive frames.
+                Stereo: the first code is the left channel and the second is the right channel of a single frame.
+                */
+                iChannel0 = 0;
+                iChannel1 = (pWav->channels == 1) ? 0 : 1;
+
+                /* The predictor is read from the file and then indexed into a table. Check that it's in bounds. */
+                if (pWav->msadpcm.predictor[iChannel0] >= drwav_countof(coeff1Table) || pWav->msadpcm.predictor[iChannel0] >= drwav_countof(coeff2Table)) {
+                    return totalFramesRead; /* Out of bounds. Invalid file. */
+                }
+
+                newSample0 = drwav__msadpcm_decode_sample(pWav, iChannel0, nibble0, coeff1Table[pWav->msadpcm.predictor[iChannel0]], coeff2Table[pWav->msadpcm.predictor[iChannel0]], adaptationTable[((nibbles & 0xF0) >> 4)]);
+
+                if (pWav->msadpcm.predictor[iChannel1] >= drwav_countof(coeff1Table) || pWav->msadpcm.predictor[iChannel1] >= drwav_countof(coeff2Table)) {
+                    return totalFramesRead; /* Out of bounds. Invalid file. */
+                }
+
+                newSample1 = drwav__msadpcm_decode_sample(pWav, iChannel1, nibble1, coeff1Table[pWav->msadpcm.predictor[iChannel1]], coeff2Table[pWav->msadpcm.predictor[iChannel1]], adaptationTable[((nibbles & 0x0F) >> 0)]);
+
+                /* Two samples are two frames in mono, but only a single frame in stereo. */
+                pWav->msadpcm.cachedFrames[2] = newSample0;
+                pWav->msadpcm.cachedFrames[3] = newSample1;
+                pWav->msadpcm.cachedFrameCount = (pWav->channels == 1) ? 2 : 1;
+            }
+        }
+    }
+
+    return totalFramesRead;
+}
+
+
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{   /* Decodes IMA ADPCM blocks into signed 16-bit frames. Returns the number of frames produced, which is less than framesToRead when the total frame count is reached, a read comes up short, or a block header is invalid. */
+    drwav_uint64 totalFramesRead = 0;   /* Running count of frames produced; returned on every exit path. */
+    drwav_uint32 iChannel;
+
+    static const drwav_int32 indexTable[16] = {   /* Step-index adjustment, indexed by the raw 4-bit code (entries 8-15 repeat entries 0-7). */
+        -1, -1, -1, -1, 2, 4, 6, 8,
+        -1, -1, -1, -1, 2, 4, 6, 8
+    };
+
+    static const drwav_int32 stepTable[89] = {   /* Quantizer step size, indexed by the per-channel step index (0..88). */
+        7,     8,     9,     10,    11,    12,    13,    14,    16,    17,
+        19,    21,    23,    25,    28,    31,    34,    37,    41,    45,
+        50,    55,    60,    66,    73,    80,    88,    97,    107,   118,
+        130,   143,   157,   173,   190,   209,   230,   253,   279,   307,
+        337,   371,   408,   449,   494,   544,   598,   658,   724,   796,
+        876,   963,   1060,  1166,  1282,  1411,  1552,  1707,  1878,  2066,
+        2272,  2499,  2749,  3024,  3327,  3660,  4026,  4428,  4871,  5358,
+        5894,  6484,  7132,  7845,  8630,  9493,  10442, 11487, 12635, 13899,
+        15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767
+    };
+
+    DRWAV_ASSERT(pWav != NULL);
+    DRWAV_ASSERT(framesToRead > 0);
+
+    /* TODO: Lots of room for optimization here. */
+
+    while (pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
+        DRWAV_ASSERT(framesToRead > 0); /* This loop iteration will never get hit with framesToRead == 0 because it's asserted at the top, and we check for 0 inside the loop just below. */
+
+        /* If there are no cached samples we need to load a new block. */
+        if (pWav->ima.cachedFrameCount == 0 && pWav->ima.bytesRemainingInBlock == 0) {
+            if (pWav->channels == 1) {
+                /* Mono. */
+                drwav_uint8 header[4];   /* Bytes 0-1: initial predictor (16-bit). Byte 2: initial step index. Byte 3 is not read by this code. */
+                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
+                    return totalFramesRead;
+                }
+                pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);   /* NOTE(review): nothing here checks blockAlign >= sizeof(header); confirm that is validated when the file is opened. */
+
+                if (header[2] >= drwav_countof(stepTable)) {
+                    pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, DRWAV_SEEK_CUR);   /* Skip the remainder of the bad block. The seek result is ignored. */
+                    pWav->ima.bytesRemainingInBlock = 0;
+                    return totalFramesRead; /* Invalid data. */
+                }
+
+                pWav->ima.predictor[0] = (drwav_int16)drwav_bytes_to_u16(header + 0);
+                pWav->ima.stepIndex[0] = drwav_clamp(header[2], 0, (drwav_int32)drwav_countof(stepTable)-1);    /* Clamp not necessary because we checked above, but adding here to silence a static analysis warning. */
+                pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[0];   /* The header predictor is itself the first output frame of the block. */
+                pWav->ima.cachedFrameCount = 1;
+            } else {   /* Taken for every channel count other than 1; the code below only reads headers for channels 0 and 1. */
+                /* Stereo. */
+                drwav_uint8 header[8];   /* Two 4-byte channel headers back to back: channel 0 at offset 0, channel 1 at offset 4. */
+                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
+                    return totalFramesRead;
+                }
+                pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);   /* NOTE(review): same unchecked subtraction as the mono path. */
+
+                if (header[2] >= drwav_countof(stepTable) || header[6] >= drwav_countof(stepTable)) {
+                    pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, DRWAV_SEEK_CUR);   /* Skip the remainder of the bad block. The seek result is ignored. */
+                    pWav->ima.bytesRemainingInBlock = 0;
+                    return totalFramesRead; /* Invalid data. */
+                }
+
+                pWav->ima.predictor[0] = drwav_bytes_to_s16(header + 0);
+                pWav->ima.stepIndex[0] = drwav_clamp(header[2], 0, (drwav_int32)drwav_countof(stepTable)-1);    /* Clamp not necessary because we checked above, but adding here to silence a static analysis warning. */
+                pWav->ima.predictor[1] = drwav_bytes_to_s16(header + 4);
+                pWav->ima.stepIndex[1] = drwav_clamp(header[6], 0, (drwav_int32)drwav_countof(stepTable)-1);    /* Clamp not necessary because we checked above, but adding here to silence a static analysis warning. */
+
+                pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 2] = pWav->ima.predictor[0];   /* The two header predictors form the first interleaved output frame of the block. */
+                pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[1];
+                pWav->ima.cachedFrameCount = 1;
+            }
+        }
+
+        /* Output anything that's cached. */
+        while (framesToRead > 0 && pWav->ima.cachedFrameCount > 0 && pWav->readCursorInPCMFrames < pWav->totalPCMFrameCount) {
+            if (pBufferOut != NULL) {   /* With a NULL output buffer the frames are still consumed and counted, just not stored. */
+                drwav_uint32 iSample;
+                for (iSample = 0; iSample < pWav->channels; iSample += 1) {
+                    pBufferOut[iSample] = (drwav_int16)pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + iSample];   /* The cache is consumed from its tail: the oldest unread frame sits cachedFrameCount frames before the end of the array. */
+                }
+                pBufferOut += pWav->channels;
+            }
+
+            framesToRead    -= 1;
+            totalFramesRead += 1;
+            pWav->readCursorInPCMFrames += 1;
+            pWav->ima.cachedFrameCount -= 1;
+        }
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+        /*
+        If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next
+        loop iteration which will trigger the loading of a new block.
+        */
+        if (pWav->ima.cachedFrameCount == 0) {
+            if (pWav->ima.bytesRemainingInBlock == 0) {
+                continue;
+            } else {
+                /*
+                From what I can tell with stereo streams, it looks like every 4 bytes (8 samples) is for one channel. So it goes 4 bytes for the
+                left channel, 4 bytes for the right channel.
+                */
+                pWav->ima.cachedFrameCount = 8;   /* Each 4-byte group carries eight 4-bit codes, i.e. 8 samples for one channel. */
+                for (iChannel = 0; iChannel < pWav->channels; ++iChannel) {
+                    drwav_uint32 iByte;
+                    drwav_uint8 nibbles[4];
+                    if (pWav->onRead(pWav->pUserData, &nibbles, 4) != 4) {
+                        pWav->ima.cachedFrameCount = 0;   /* Discard the partially decoded group so nothing stale is output later. */
+                        return totalFramesRead;
+                    }
+                    pWav->ima.bytesRemainingInBlock -= 4;   /* NOTE(review): assumes at least 4 bytes remain; if this counter is unsigned a malformed blockAlign could make it wrap - confirm. */
+
+                    for (iByte = 0; iByte < 4; ++iByte) {
+                        drwav_uint8 nibble0 = ((nibbles[iByte] & 0x0F) >> 0);   /* Low nibble decodes to the earlier of the byte's two samples. */
+                        drwav_uint8 nibble1 = ((nibbles[iByte] & 0xF0) >> 4);   /* High nibble decodes to the later sample. */
+
+                        drwav_int32 step      = stepTable[pWav->ima.stepIndex[iChannel]];
+                        drwav_int32 predictor = pWav->ima.predictor[iChannel];
+
+                        drwav_int32      diff  = step >> 3;   /* diff = step/8, plus step/4, step/2 and step for code bits 0, 1 and 2; bit 3 negates the result. */
+                        if (nibble0 & 1) diff += step >> 2;
+                        if (nibble0 & 2) diff += step >> 1;
+                        if (nibble0 & 4) diff += step;
+                        if (nibble0 & 8) diff  = -diff;
+
+                        predictor = drwav_clamp(predictor + diff, -32768, 32767);
+                        pWav->ima.predictor[iChannel] = predictor;
+                        pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble0], 0, (drwav_int32)drwav_countof(stepTable)-1);
+                        pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+0)*pWav->channels + iChannel] = predictor;   /* Stored interleaved: frame (iByte*2+0) of this group, slot iChannel. */
+
+
+                        step      = stepTable[pWav->ima.stepIndex[iChannel]];   /* Second sample uses the step index just updated by the first. */
+                        predictor = pWav->ima.predictor[iChannel];
+
+                                         diff  = step >> 3;
+                        if (nibble1 & 1) diff += step >> 2;
+                        if (nibble1 & 2) diff += step >> 1;
+                        if (nibble1 & 4) diff += step;
+                        if (nibble1 & 8) diff  = -diff;
+
+                        predictor = drwav_clamp(predictor + diff, -32768, 32767);
+                        pWav->ima.predictor[iChannel] = predictor;
+                        pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble1], 0, (drwav_int32)drwav_countof(stepTable)-1);
+                        pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+1)*pWav->channels + iChannel] = predictor;   /* Stored interleaved: frame (iByte*2+1) of this group, slot iChannel. */
+                    }
+                }
+            }
+        }
+    }
+
+    return totalFramesRead;
+}
+
+
+#ifndef DR_WAV_NO_CONVERSION_API
+static const unsigned short g_drwavAlawTable[256] = {   /* Lookup table indexed by the 8-bit A-law code; drwav__alaw_to_s16() casts the stored unsigned 16-bit pattern to a signed short. */
+    0xEA80, 0xEB80, 0xE880, 0xE980, 0xEE80, 0xEF80, 0xEC80, 0xED80, 0xE280, 0xE380, 0xE080, 0xE180, 0xE680, 0xE780, 0xE480, 0xE580,
+    0xF540, 0xF5C0, 0xF440, 0xF4C0, 0xF740, 0xF7C0, 0xF640, 0xF6C0, 0xF140, 0xF1C0, 0xF040, 0xF0C0, 0xF340, 0xF3C0, 0xF240, 0xF2C0,
+    0xAA00, 0xAE00, 0xA200, 0xA600, 0xBA00, 0xBE00, 0xB200, 0xB600, 0x8A00, 0x8E00, 0x8200, 0x8600, 0x9A00, 0x9E00, 0x9200, 0x9600,
+    0xD500, 0xD700, 0xD100, 0xD300, 0xDD00, 0xDF00, 0xD900, 0xDB00, 0xC500, 0xC700, 0xC100, 0xC300, 0xCD00, 0xCF00, 0xC900, 0xCB00,
+    0xFEA8, 0xFEB8, 0xFE88, 0xFE98, 0xFEE8, 0xFEF8, 0xFEC8, 0xFED8, 0xFE28, 0xFE38, 0xFE08, 0xFE18, 0xFE68, 0xFE78, 0xFE48, 0xFE58,
+    0xFFA8, 0xFFB8, 0xFF88, 0xFF98, 0xFFE8, 0xFFF8, 0xFFC8, 0xFFD8, 0xFF28, 0xFF38, 0xFF08, 0xFF18, 0xFF68, 0xFF78, 0xFF48, 0xFF58,
+    0xFAA0, 0xFAE0, 0xFA20, 0xFA60, 0xFBA0, 0xFBE0, 0xFB20, 0xFB60, 0xF8A0, 0xF8E0, 0xF820, 0xF860, 0xF9A0, 0xF9E0, 0xF920, 0xF960,
+    0xFD50, 0xFD70, 0xFD10, 0xFD30, 0xFDD0, 0xFDF0, 0xFD90, 0xFDB0, 0xFC50, 0xFC70, 0xFC10, 0xFC30, 0xFCD0, 0xFCF0, 0xFC90, 0xFCB0,
+    0x1580, 0x1480, 0x1780, 0x1680, 0x1180, 0x1080, 0x1380, 0x1280, 0x1D80, 0x1C80, 0x1F80, 0x1E80, 0x1980, 0x1880, 0x1B80, 0x1A80,
+    0x0AC0, 0x0A40, 0x0BC0, 0x0B40, 0x08C0, 0x0840, 0x09C0, 0x0940, 0x0EC0, 0x0E40, 0x0FC0, 0x0F40, 0x0CC0, 0x0C40, 0x0DC0, 0x0D40,
+    0x5600, 0x5200, 0x5E00, 0x5A00, 0x4600, 0x4200, 0x4E00, 0x4A00, 0x7600, 0x7200, 0x7E00, 0x7A00, 0x6600, 0x6200, 0x6E00, 0x6A00,
+    0x2B00, 0x2900, 0x2F00, 0x2D00, 0x2300, 0x2100, 0x2700, 0x2500, 0x3B00, 0x3900, 0x3F00, 0x3D00, 0x3300, 0x3100, 0x3700, 0x3500,
+    0x0158, 0x0148, 0x0178, 0x0168, 0x0118, 0x0108, 0x0138, 0x0128, 0x01D8, 0x01C8, 0x01F8, 0x01E8, 0x0198, 0x0188, 0x01B8, 0x01A8,
+    0x0058, 0x0048, 0x0078, 0x0068, 0x0018, 0x0008, 0x0038, 0x0028, 0x00D8, 0x00C8, 0x00F8, 0x00E8, 0x0098, 0x0088, 0x00B8, 0x00A8,
+    0x0560, 0x0520, 0x05E0, 0x05A0, 0x0460, 0x0420, 0x04E0, 0x04A0, 0x0760, 0x0720, 0x07E0, 0x07A0, 0x0660, 0x0620, 0x06E0, 0x06A0,
+    0x02B0, 0x0290, 0x02F0, 0x02D0, 0x0230, 0x0210, 0x0270, 0x0250, 0x03B0, 0x0390, 0x03F0, 0x03D0, 0x0330, 0x0310, 0x0370, 0x0350
+};
+
+static const unsigned short g_drwavMulawTable[256] = {   /* Lookup table indexed by the 8-bit mu-law code; drwav__mulaw_to_s16() casts the stored unsigned 16-bit pattern to a signed short. */
+    0x8284, 0x8684, 0x8A84, 0x8E84, 0x9284, 0x9684, 0x9A84, 0x9E84, 0xA284, 0xA684, 0xAA84, 0xAE84, 0xB284, 0xB684, 0xBA84, 0xBE84,
+    0xC184, 0xC384, 0xC584, 0xC784, 0xC984, 0xCB84, 0xCD84, 0xCF84, 0xD184, 0xD384, 0xD584, 0xD784, 0xD984, 0xDB84, 0xDD84, 0xDF84,
+    0xE104, 0xE204, 0xE304, 0xE404, 0xE504, 0xE604, 0xE704, 0xE804, 0xE904, 0xEA04, 0xEB04, 0xEC04, 0xED04, 0xEE04, 0xEF04, 0xF004,
+    0xF0C4, 0xF144, 0xF1C4, 0xF244, 0xF2C4, 0xF344, 0xF3C4, 0xF444, 0xF4C4, 0xF544, 0xF5C4, 0xF644, 0xF6C4, 0xF744, 0xF7C4, 0xF844,
+    0xF8A4, 0xF8E4, 0xF924, 0xF964, 0xF9A4, 0xF9E4, 0xFA24, 0xFA64, 0xFAA4, 0xFAE4, 0xFB24, 0xFB64, 0xFBA4, 0xFBE4, 0xFC24, 0xFC64,
+    0xFC94, 0xFCB4, 0xFCD4, 0xFCF4, 0xFD14, 0xFD34, 0xFD54, 0xFD74, 0xFD94, 0xFDB4, 0xFDD4, 0xFDF4, 0xFE14, 0xFE34, 0xFE54, 0xFE74,
+    0xFE8C, 0xFE9C, 0xFEAC, 0xFEBC, 0xFECC, 0xFEDC, 0xFEEC, 0xFEFC, 0xFF0C, 0xFF1C, 0xFF2C, 0xFF3C, 0xFF4C, 0xFF5C, 0xFF6C, 0xFF7C,
+    0xFF88, 0xFF90, 0xFF98, 0xFFA0, 0xFFA8, 0xFFB0, 0xFFB8, 0xFFC0, 0xFFC8, 0xFFD0, 0xFFD8, 0xFFE0, 0xFFE8, 0xFFF0, 0xFFF8, 0x0000,
+    0x7D7C, 0x797C, 0x757C, 0x717C, 0x6D7C, 0x697C, 0x657C, 0x617C, 0x5D7C, 0x597C, 0x557C, 0x517C, 0x4D7C, 0x497C, 0x457C, 0x417C,
+    0x3E7C, 0x3C7C, 0x3A7C, 0x387C, 0x367C, 0x347C, 0x327C, 0x307C, 0x2E7C, 0x2C7C, 0x2A7C, 0x287C, 0x267C, 0x247C, 0x227C, 0x207C,
+    0x1EFC, 0x1DFC, 0x1CFC, 0x1BFC, 0x1AFC, 0x19FC, 0x18FC, 0x17FC, 0x16FC, 0x15FC, 0x14FC, 0x13FC, 0x12FC, 0x11FC, 0x10FC, 0x0FFC,
+    0x0F3C, 0x0EBC, 0x0E3C, 0x0DBC, 0x0D3C, 0x0CBC, 0x0C3C, 0x0BBC, 0x0B3C, 0x0ABC, 0x0A3C, 0x09BC, 0x093C, 0x08BC, 0x083C, 0x07BC,
+    0x075C, 0x071C, 0x06DC, 0x069C, 0x065C, 0x061C, 0x05DC, 0x059C, 0x055C, 0x051C, 0x04DC, 0x049C, 0x045C, 0x041C, 0x03DC, 0x039C,
+    0x036C, 0x034C, 0x032C, 0x030C, 0x02EC, 0x02CC, 0x02AC, 0x028C, 0x026C, 0x024C, 0x022C, 0x020C, 0x01EC, 0x01CC, 0x01AC, 0x018C,
+    0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104, 0x00F4, 0x00E4, 0x00D4, 0x00C4, 0x00B4, 0x00A4, 0x0094, 0x0084,
+    0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040, 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000
+};
+
+/* Translates a single 8-bit A-law code into a signed 16-bit sample by table lookup. */
+static DRWAV_INLINE drwav_int16 drwav__alaw_to_s16(drwav_uint8 sampleIn)
+{
+    const unsigned short tableEntry = g_drwavAlawTable[sampleIn];
+
+    /* The table stores the raw 16-bit pattern; the cast reinterprets it as signed. */
+    return (short)tableEntry;
+}
+
+/* Translates a single 8-bit mu-law code into a signed 16-bit sample by table lookup. */
+static DRWAV_INLINE drwav_int16 drwav__mulaw_to_s16(drwav_uint8 sampleIn)
+{
+    const unsigned short tableEntry = g_drwavMulawTable[sampleIn];
+
+    /* The table stores the raw 16-bit pattern; the cast reinterprets it as signed. */
+    return (short)tableEntry;
+}
+
+
+
+/*
+Converts integer PCM samples of a given byte width into signed 16-bit samples.
+
+    pOut             - Receives totalSampleCount converted samples.
+    pIn              - Tightly packed input samples, bytesPerSample bytes each.
+    totalSampleCount - Number of samples (not frames) to convert.
+    bytesPerSample   - Width of each input sample in bytes.
+
+Widths of 1 to 4 bytes are routed to dedicated converters. Widths of 5 to 8 bytes go through a generic
+byte-assembling path. Anything wider than 8 bytes produces silence.
+*/
+DRWAV_PRIVATE void drwav__pcm_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
+{
+    size_t iSample;
+
+    switch (bytesPerSample)
+    {
+        case 1:
+        {
+            /* 8-bit data is unsigned, so it needs its own converter. */
+            drwav_u8_to_s16(pOut, pIn, totalSampleCount);
+            return;
+        }
+
+        case 2:
+        {
+            /* Already 16-bit: a straight copy. */
+            const drwav_int16* pIn16 = (const drwav_int16*)pIn;
+            for (iSample = 0; iSample < totalSampleCount; iSample += 1) {
+                pOut[iSample] = pIn16[iSample];
+            }
+            return;
+        }
+
+        case 3:
+        {
+            drwav_s24_to_s16(pOut, pIn, totalSampleCount);
+            return;
+        }
+
+        case 4:
+        {
+            drwav_s32_to_s16(pOut, (const drwav_int32*)pIn, totalSampleCount);
+            return;
+        }
+
+        default: break;
+    }
+
+    /* More than 64 bits per sample cannot be represented by the generic path, so emit silence. */
+    if (bytesPerSample > 8) {
+        DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
+        return;
+    }
+
+    /*
+    Generic, slow path. The input bytes are packed into the top of a 64-bit word, first byte lowest, and
+    the upper 16 bits of that word become the output sample.
+    */
+    for (iSample = 0; iSample < totalSampleCount; iSample += 1) {
+        drwav_uint64 packed    = 0;
+        unsigned int bitOffset = (8 - bytesPerSample) * 8;
+        unsigned int iByte;
+
+        for (iByte = 0; iByte < bytesPerSample; iByte += 1) {
+            DRWAV_ASSERT(iByte < 8);
+            packed    |= (drwav_uint64)(pIn[iByte]) << bitOffset;
+            bitOffset += 8;
+        }
+
+        pIn += iByte;
+        pOut[iSample] = (drwav_int16)((drwav_int64)packed >> 48);
+    }
+}
+
+/*
+Converts IEEE floating point samples into signed 16-bit samples.
+
+Only 4-byte (float) and 8-byte (double) input is handled. Every other width fills pOut with
+totalSampleCount samples of silence.
+*/
+DRWAV_PRIVATE void drwav__ieee_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
+{
+    switch (bytesPerSample)
+    {
+        case 4:
+        {
+            drwav_f32_to_s16(pOut, (const float*)pIn, totalSampleCount);
+        } break;
+
+        case 8:
+        {
+            drwav_f64_to_s16(pOut, (const double*)pIn, totalSampleCount);
+        } break;
+
+        default:
+        {
+            /* Unsupported float width (16-bit float included): output silence. */
+            DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
+        } break;
+    }
+}
+
+/*
+Reads integer PCM frames and converts them to signed 16-bit.
+
+Data that is already 16-bit PCM, or a call with no output buffer, is forwarded directly to
+drwav_read_pcm_frames(). Otherwise raw frames are pulled through a 4096-byte staging buffer and converted
+chunk by chunk. Returns the number of frames delivered, or 0 when the frame size is zero or does not divide
+evenly across the channels.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint8  stagingBuffer[4096] = {0};
+    drwav_uint64 framesDelivered;
+    drwav_uint32 frameSizeInBytes;
+    drwav_uint32 sampleSizeInBytes;
+
+    /* Fast path: no conversion required. */
+    if ((pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 16) || pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut);
+    }
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;
+    }
+
+    /* Samples must occupy a whole number of bytes. */
+    sampleSizeInBytes = frameSizeInBytes / pWav->channels;
+    if (sampleSizeInBytes == 0 || (frameSizeInBytes % pWav->channels) != 0) {
+        return 0;
+    }
+
+    framesDelivered = 0;
+
+    for (;;) {
+        drwav_uint64 chunkRequest;
+        drwav_uint64 chunkFrames;
+        drwav_uint64 chunkSamples;
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+        /* Never ask for more frames than the staging buffer can hold. */
+        chunkRequest = drwav_min(framesToRead, sizeof(stagingBuffer)/frameSizeInBytes);
+        chunkFrames  = drwav_read_pcm_frames(pWav, chunkRequest, stagingBuffer);
+        if (chunkFrames == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(chunkFrames <= chunkRequest);   /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Guard against reading past the staging buffer when the file is malformed. */
+        chunkSamples = chunkFrames * pWav->channels;
+        if ((chunkSamples * sampleSizeInBytes) > sizeof(stagingBuffer)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not reachable with a valid file. */
+            break;
+        }
+
+        drwav__pcm_to_s16(pBufferOut, stagingBuffer, (size_t)chunkSamples, sampleSizeInBytes);
+
+        pBufferOut      += chunkSamples;
+        framesToRead    -= chunkFrames;
+        framesDelivered += chunkFrames;
+    }
+
+    return framesDelivered;
+}
+
+/*
+Reads IEEE floating point frames and converts them to signed 16-bit.
+
+With no output buffer the call is forwarded to drwav_read_pcm_frames(). Otherwise raw frames are pulled
+through a 4096-byte staging buffer and converted chunk by chunk. Returns the number of frames delivered, or
+0 when the frame size is zero or does not divide evenly across the channels.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint8  stagingBuffer[4096] = {0};
+    drwav_uint64 framesDelivered;
+    drwav_uint32 frameSizeInBytes;
+    drwav_uint32 sampleSizeInBytes;
+
+    if (pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
+    }
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;
+    }
+
+    /* Samples must occupy a whole number of bytes. */
+    sampleSizeInBytes = frameSizeInBytes / pWav->channels;
+    if (sampleSizeInBytes == 0 || (frameSizeInBytes % pWav->channels) != 0) {
+        return 0;
+    }
+
+    framesDelivered = 0;
+
+    for (;;) {
+        drwav_uint64 chunkRequest;
+        drwav_uint64 chunkFrames;
+        drwav_uint64 chunkSamples;
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+        /* Never ask for more frames than the staging buffer can hold. */
+        chunkRequest = drwav_min(framesToRead, sizeof(stagingBuffer)/frameSizeInBytes);
+        chunkFrames  = drwav_read_pcm_frames(pWav, chunkRequest, stagingBuffer);
+        if (chunkFrames == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(chunkFrames <= chunkRequest);   /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Guard against reading past the staging buffer when the file is malformed. */
+        chunkSamples = chunkFrames * pWav->channels;
+        if ((chunkSamples * sampleSizeInBytes) > sizeof(stagingBuffer)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not reachable with a valid file. */
+            break;
+        }
+
+        drwav__ieee_to_s16(pBufferOut, stagingBuffer, (size_t)chunkSamples, sampleSizeInBytes);    /* Safe cast. */
+
+        pBufferOut      += chunkSamples;
+        framesToRead    -= chunkFrames;
+        framesDelivered += chunkFrames;
+    }
+
+    return framesDelivered;
+}
+
+/*
+Reads A-law frames and expands them to signed 16-bit.
+
+With no output buffer the call is forwarded to drwav_read_pcm_frames(). Otherwise raw frames are pulled
+through a 4096-byte staging buffer and expanded chunk by chunk. Returns the number of frames delivered, or
+0 when the frame size is zero or does not divide evenly across the channels.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint8  stagingBuffer[4096] = {0};
+    drwav_uint64 framesDelivered;
+    drwav_uint32 frameSizeInBytes;
+    drwav_uint32 sampleSizeInBytes;
+
+    if (pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
+    }
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;
+    }
+
+    /* Samples must occupy a whole number of bytes. */
+    sampleSizeInBytes = frameSizeInBytes / pWav->channels;
+    if (sampleSizeInBytes == 0 || (frameSizeInBytes % pWav->channels) != 0) {
+        return 0;
+    }
+
+    framesDelivered = 0;
+
+    for (;;) {
+        drwav_uint64 chunkRequest;
+        drwav_uint64 chunkFrames;
+        drwav_uint64 chunkSamples;
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+        /* Never ask for more frames than the staging buffer can hold. */
+        chunkRequest = drwav_min(framesToRead, sizeof(stagingBuffer)/frameSizeInBytes);
+        chunkFrames  = drwav_read_pcm_frames(pWav, chunkRequest, stagingBuffer);
+        if (chunkFrames == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(chunkFrames <= chunkRequest);   /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Guard against reading past the staging buffer when the file is malformed. */
+        chunkSamples = chunkFrames * pWav->channels;
+        if ((chunkSamples * sampleSizeInBytes) > sizeof(stagingBuffer)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not reachable with a valid file. */
+            break;
+        }
+
+        drwav_alaw_to_s16(pBufferOut, stagingBuffer, (size_t)chunkSamples);
+
+        /*
+        libsndfile appears to return A-law samples with the opposite sign, but only for AIFF files. For WAV
+        files the two libraries agree. It is unclear which is correct, so dr_wav's result is kept by default
+        and the sign is flipped only when libsndfile compatibility is requested at build time.
+        */
+        #ifdef DR_WAV_LIBSNDFILE_COMPAT
+        {
+            if (pWav->container == drwav_container_aiff) {
+                drwav_uint64 iFlip;
+                for (iFlip = 0; iFlip < chunkSamples; iFlip += 1) {
+                    pBufferOut[iFlip] = -pBufferOut[iFlip];
+                }
+            }
+        }
+        #endif
+
+        pBufferOut      += chunkSamples;
+        framesToRead    -= chunkFrames;
+        framesDelivered += chunkFrames;
+    }
+
+    return framesDelivered;
+}
+
+/*
+Reads mu-law frames and expands them to signed 16-bit.
+
+With no output buffer the call is forwarded to drwav_read_pcm_frames(). Otherwise raw frames are pulled
+through a 4096-byte staging buffer and expanded chunk by chunk. Returns the number of frames delivered, or
+0 when the frame size is zero or does not divide evenly across the channels.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s16__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint8  stagingBuffer[4096] = {0};
+    drwav_uint64 framesDelivered;
+    drwav_uint32 frameSizeInBytes;
+    drwav_uint32 sampleSizeInBytes;
+
+    if (pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
+    }
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;
+    }
+
+    /* Samples must occupy a whole number of bytes. */
+    sampleSizeInBytes = frameSizeInBytes / pWav->channels;
+    if (sampleSizeInBytes == 0 || (frameSizeInBytes % pWav->channels) != 0) {
+        return 0;
+    }
+
+    framesDelivered = 0;
+
+    for (;;) {
+        drwav_uint64 chunkRequest;
+        drwav_uint64 chunkFrames;
+        drwav_uint64 chunkSamples;
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+        /* Never ask for more frames than the staging buffer can hold. */
+        chunkRequest = drwav_min(framesToRead, sizeof(stagingBuffer)/frameSizeInBytes);
+        chunkFrames  = drwav_read_pcm_frames(pWav, chunkRequest, stagingBuffer);
+        if (chunkFrames == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(chunkFrames <= chunkRequest);   /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Guard against reading past the staging buffer when the file is malformed. */
+        chunkSamples = chunkFrames * pWav->channels;
+        if ((chunkSamples * sampleSizeInBytes) > sizeof(stagingBuffer)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not reachable with a valid file. */
+            break;
+        }
+
+        drwav_mulaw_to_s16(pBufferOut, stagingBuffer, (size_t)chunkSamples);
+
+        /*
+        As with A-law, libsndfile and dr_wav disagree on the sign of the decoded samples. When building with
+        libsndfile compatibility the sign is flipped for AIFF containers so the automated tests pass.
+        */
+        #ifdef DR_WAV_LIBSNDFILE_COMPAT
+        {
+            if (pWav->container == drwav_container_aiff) {
+                drwav_uint64 iFlip;
+                for (iFlip = 0; iFlip < chunkSamples; iFlip += 1) {
+                    pBufferOut[iFlip] = -pBufferOut[iFlip];
+                }
+            }
+        }
+        #endif
+
+        pBufferOut      += chunkSamples;
+        framesToRead    -= chunkFrames;
+        framesDelivered += chunkFrames;
+    }
+
+    return framesDelivered;
+}
+
+/*
+Reads PCM frames and converts them to signed 16-bit samples, whatever the source encoding.
+
+    pWav         - The decoder. A NULL pointer yields 0.
+    framesToRead - Number of frames requested. Zero yields 0.
+    pBufferOut   - Receives framesRead * channels samples. When NULL, the call is forwarded to
+                   drwav_read_pcm_frames() with a NULL buffer.
+
+Returns the number of frames actually read. Formats other than PCM, IEEE float, A-law, mu-law, MS ADPCM and
+IMA (DVI) ADPCM return 0.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    if (pWav == NULL || framesToRead == 0) {
+        return 0;
+    }
+
+    if (pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
+    }
+
+    /*
+    Don't try to read more samples than can potentially fit in the output buffer. The limit is computed by
+    division rather than by multiplying framesToRead up, because that product can wrap around in 64-bit
+    arithmetic and make the comparison pass when it should not. The channel check keeps the division safe.
+    */
+    if (pWav->channels > 0) {
+        drwav_uint64 maxFrames = DRWAV_SIZE_MAX / sizeof(drwav_int16) / pWav->channels;
+        if (framesToRead > maxFrames) {
+            framesToRead = maxFrames;
+        }
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) {
+        return drwav_read_pcm_frames_s16__pcm(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) {
+        return drwav_read_pcm_frames_s16__ieee(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) {
+        return drwav_read_pcm_frames_s16__alaw(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
+        return drwav_read_pcm_frames_s16__mulaw(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
+        return drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+        return drwav_read_pcm_frames_s16__ima(pWav, framesToRead, pBufferOut);
+    }
+
+    /* Unsupported format. */
+    return 0;
+}
+
+/*
+Same as drwav_read_pcm_frames_s16(), but the output samples are always little-endian.
+
+On a big-endian host the decoded samples are byte-swapped in place. Returns the number of frames read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut);
+
+    /*
+    The framesRead check matters for more than speed: drwav_read_pcm_frames_s16() returns 0 for a NULL pWav,
+    and without the check pWav->channels below would dereference that NULL pointer.
+    */
+    if (framesRead > 0 && pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) {
+        drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels);
+    }
+
+    return framesRead;
+}
+
+/*
+Same as drwav_read_pcm_frames_s16(), but the output samples are always big-endian.
+
+On a little-endian host the decoded samples are byte-swapped in place. Returns the number of frames read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut);
+
+    /*
+    The framesRead check matters for more than speed: drwav_read_pcm_frames_s16() returns 0 for a NULL pWav,
+    and without the check pWav->channels below would dereference that NULL pointer.
+    */
+    if (framesRead > 0 && pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) {
+        drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels);
+    }
+
+    return framesRead;
+}
+
+
+/*
+Converts unsigned 8-bit samples to signed 16-bit.
+
+Each input value is moved into the high byte and then re-centred around zero by subtracting 32768.
+*/
+DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        int widened = pIn[iSample];
+
+        widened = (widened << 8) - 32768;
+        pOut[iSample] = (short)widened;
+    }
+}
+
+/*
+Converts packed signed 24-bit samples (3 bytes each, least significant byte first) to signed 16-bit.
+
+The three bytes are placed in the top 24 bits of a 32-bit word so that the sign lands in the top bit, then
+shifted back down to discard the low 8 bits of the sample.
+*/
+DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        unsigned int packed;
+        int sample24;
+
+        packed  = ((unsigned int)pIn[iSample*3+0]) <<  8;
+        packed |= ((unsigned int)pIn[iSample*3+1]) << 16;
+        packed |= ((unsigned int)pIn[iSample*3+2]) << 24;
+
+        sample24 = ((int)packed) >> 8;              /* Sign-extended 24-bit value. */
+        pOut[iSample] = (short)(sample24 >> 8);     /* Keep the top 16 bits. */
+    }
+}
+
+/* Converts signed 32-bit samples to signed 16-bit by keeping the upper 16 bits of each sample. */
+DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        const int wide = pIn[iSample];
+
+        pOut[iSample] = (short)(wide >> 16);
+    }
+}
+
+/*
+Converts 32-bit float samples to signed 16-bit.
+
+Each input is clipped to [-1, 1], shifted into [0, 2], scaled by 32767.5, truncated to an integer and
+finally re-centred by subtracting 32768.
+*/
+DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        float clipped;
+        int   quantized;
+
+        /* Clip. */
+        if (pIn[iSample] < -1) {
+            clipped = -1;
+        } else if (pIn[iSample] > 1) {
+            clipped = 1;
+        } else {
+            clipped = pIn[iSample];
+        }
+
+        clipped   = clipped + 1;
+        quantized = (int)(clipped * 32767.5f);
+        quantized = quantized - 32768;
+
+        pOut[iSample] = (short)quantized;
+    }
+}
+
+/*
+Converts 64-bit float samples to signed 16-bit.
+
+Each input is clipped to [-1, 1], shifted into [0, 2], scaled by 32767.5, truncated to an integer and
+finally re-centred by subtracting 32768.
+*/
+DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        double clipped;
+        int    quantized;
+
+        /* Clip. */
+        if (pIn[iSample] < -1) {
+            clipped = -1;
+        } else if (pIn[iSample] > 1) {
+            clipped = 1;
+        } else {
+            clipped = pIn[iSample];
+        }
+
+        clipped   = clipped + 1;
+        quantized = (int)(clipped * 32767.5);
+        quantized = quantized - 32768;
+
+        pOut[iSample] = (short)quantized;
+    }
+}
+
+/* Expands sampleCount 8-bit A-law codes from pIn into signed 16-bit samples in pOut. */
+DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t remaining = sampleCount;
+
+    while (remaining > 0) {
+        *pOut = drwav__alaw_to_s16(*pIn);
+
+        pOut      += 1;
+        pIn       += 1;
+        remaining -= 1;
+    }
+}
+
+/* Expands sampleCount 8-bit mu-law codes from pIn into signed 16-bit samples in pOut. */
+DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t remaining = sampleCount;
+
+    while (remaining > 0) {
+        *pOut = drwav__mulaw_to_s16(*pIn);
+
+        pOut      += 1;
+        pIn       += 1;
+        remaining -= 1;
+    }
+}
+
+
+/*
+Converts integer PCM samples of a given byte width into 32-bit float samples.
+
+    pOut           - Receives sampleCount converted samples.
+    pIn            - Tightly packed input samples, bytesPerSample bytes each.
+    sampleCount    - Number of samples (not frames) to convert.
+    bytesPerSample - Width of each input sample in bytes.
+
+Widths of 1 to 4 bytes are routed to dedicated converters. Widths of 5 to 8 bytes go through a generic
+byte-assembling path. Anything wider than 8 bytes produces silence.
+*/
+DRWAV_PRIVATE void drwav__pcm_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample)
+{
+    size_t i;   /* Same type as sampleCount: a narrower index would wrap and never reach a count above UINT_MAX. */
+
+    /* Special case for 8-bit sample data because it's treated as unsigned. */
+    if (bytesPerSample == 1) {
+        drwav_u8_to_f32(pOut, pIn, sampleCount);
+        return;
+    }
+
+    /* Slightly more optimal implementation for common formats. */
+    if (bytesPerSample == 2) {
+        drwav_s16_to_f32(pOut, (const drwav_int16*)pIn, sampleCount);
+        return;
+    }
+    if (bytesPerSample == 3) {
+        drwav_s24_to_f32(pOut, pIn, sampleCount);
+        return;
+    }
+    if (bytesPerSample == 4) {
+        drwav_s32_to_f32(pOut, (const drwav_int32*)pIn, sampleCount);
+        return;
+    }
+
+
+    /* Anything more than 64 bits per sample is not supported. */
+    if (bytesPerSample > 8) {
+        DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut));
+        return;
+    }
+
+
+    /*
+    Generic, slow converter. The input bytes are packed into the top of a 64-bit word, first byte lowest,
+    and the word is then scaled by the largest signed 64-bit value.
+    */
+    for (i = 0; i < sampleCount; ++i) {
+        drwav_uint64 sample = 0;
+        unsigned int shift  = (8 - bytesPerSample) * 8;
+
+        unsigned int j;
+        for (j = 0; j < bytesPerSample; j += 1) {
+            DRWAV_ASSERT(j < 8);
+            sample |= (drwav_uint64)(pIn[j]) << shift;
+            shift  += 8;
+        }
+
+        pIn += j;
+        *pOut++ = (float)((drwav_int64)sample / 9223372036854775807.0);
+    }
+}
+
+/*
+Converts IEEE floating point samples into 32-bit float samples.
+
+4-byte input is copied as-is and 8-byte input is narrowed via drwav_f64_to_f32(). Every other width fills
+pOut with sampleCount samples of silence.
+*/
+DRWAV_PRIVATE void drwav__ieee_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample)
+{
+    if (bytesPerSample == 4) {
+        size_t i;   /* Same type as sampleCount: a narrower index would wrap and never reach a count above UINT_MAX. */
+        for (i = 0; i < sampleCount; ++i) {
+            *pOut++ = ((const float*)pIn)[i];
+        }
+        return;
+    } else if (bytesPerSample == 8) {
+        drwav_f64_to_f32(pOut, (const double*)pIn, sampleCount);
+        return;
+    } else {
+        /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */
+        DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut));
+        return;
+    }
+}
+
+
+/*
+Reads integer PCM frames and converts them to 32-bit float.
+
+Raw frames are pulled through a 4096-byte staging buffer and converted chunk by chunk. Returns the number of
+frames delivered, or 0 when the frame size is zero or does not divide evenly across the channels.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_f32__pcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_uint8  stagingBuffer[4096] = {0};
+    drwav_uint64 framesDelivered;
+    drwav_uint32 frameSizeInBytes;
+    drwav_uint32 sampleSizeInBytes;
+
+    frameSizeInBytes = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSizeInBytes == 0) {
+        return 0;
+    }
+
+    /* Samples must occupy a whole number of bytes. */
+    sampleSizeInBytes = frameSizeInBytes / pWav->channels;
+    if (sampleSizeInBytes == 0 || (frameSizeInBytes % pWav->channels) != 0) {
+        return 0;
+    }
+
+    framesDelivered = 0;
+
+    for (;;) {
+        drwav_uint64 chunkRequest;
+        drwav_uint64 chunkFrames;
+        drwav_uint64 chunkSamples;
+
+        if (framesToRead == 0) {
+            break;
+        }
+
+        /* Never ask for more frames than the staging buffer can hold. */
+        chunkRequest = drwav_min(framesToRead, sizeof(stagingBuffer)/frameSizeInBytes);
+        chunkFrames  = drwav_read_pcm_frames(pWav, chunkRequest, stagingBuffer);
+        if (chunkFrames == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(chunkFrames <= chunkRequest);   /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Guard against reading past the staging buffer when the file is malformed. */
+        chunkSamples = chunkFrames * pWav->channels;
+        if ((chunkSamples * sampleSizeInBytes) > sizeof(stagingBuffer)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not reachable with a valid file. */
+            break;
+        }
+
+        drwav__pcm_to_f32(pBufferOut, stagingBuffer, (size_t)chunkSamples, sampleSizeInBytes);
+
+        pBufferOut      += chunkSamples;
+        framesToRead    -= chunkFrames;
+        framesDelivered += chunkFrames;
+    }
+
+    return framesDelivered;
+}
+
+/*
+Reads ADPCM data as 32-bit float by going through the s16 reader: each chunk is decoded into a
+2048-sample scratch buffer by drwav_read_pcm_frames_s16() and then widened with drwav_s16_to_f32().
+Returns the number of frames written to pBufferOut.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_f32__msadpcm_ima(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_int16  scratch[2048];
+    drwav_uint64 framesDone = 0;
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = drwav_countof(scratch) / pWav->channels;     /* Whole frames that fit in the scratch buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames_s16(pWav, chunkFrames, scratch);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in the s16 reader. */
+
+        /* The cast to size_t is safe because the chunk is capped by the size of the scratch buffer. */
+        sampleCount = framesGot * pWav->channels;
+        drwav_s16_to_f32(pBufferOut, scratch, (size_t)sampleCount);
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead frames of IEEE float data and writes them to pBufferOut as 32-bit float.
+
+When the source is already 32-bit IEEE float the frames are read straight into pBufferOut. Otherwise
+raw bytes are staged in a 4096 byte stack buffer and converted with drwav__ieee_to_f32(). Returns the
+number of frames produced, or 0 when the frame size is zero or not a whole number of bytes per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_f32__ieee(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    /* Fast path: no conversion needed. */
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bitsPerSample == 32) {
+        return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut);
+    }
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never convert more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav__ieee_to_f32(pBufferOut, staging, (size_t)sampleCount, sampleSize);
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead frames of A-law data and writes them to pBufferOut as 32-bit float.
+
+Raw bytes are staged in a 4096 byte stack buffer and decoded with drwav_alaw_to_f32(). When
+DR_WAV_LIBSNDFILE_COMPAT is defined, samples coming from an AIFF container are additionally negated.
+Returns the number of frames produced, or 0 when the frame size is zero or not a whole number of bytes
+per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_f32__alaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never decode more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav_alaw_to_f32(pBufferOut, staging, (size_t)sampleCount);
+
+        #ifdef DR_WAV_LIBSNDFILE_COMPAT
+        {
+            /* Compatibility mode only: flip the sign of every decoded sample for AIFF containers. */
+            if (pWav->container == drwav_container_aiff) {
+                drwav_uint64 iFlip;
+                for (iFlip = 0; iFlip < sampleCount; iFlip += 1) {
+                    pBufferOut[iFlip] = -pBufferOut[iFlip];
+                }
+            }
+        }
+        #endif
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead frames of mu-law data and writes them to pBufferOut as 32-bit float.
+
+Raw bytes are staged in a 4096 byte stack buffer and decoded with drwav_mulaw_to_f32(). When
+DR_WAV_LIBSNDFILE_COMPAT is defined, samples coming from an AIFF container are additionally negated.
+Returns the number of frames produced, or 0 when the frame size is zero or not a whole number of bytes
+per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_f32__mulaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never decode more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav_mulaw_to_f32(pBufferOut, staging, (size_t)sampleCount);
+
+        #ifdef DR_WAV_LIBSNDFILE_COMPAT
+        {
+            /* Compatibility mode only: flip the sign of every decoded sample for AIFF containers. */
+            if (pWav->container == drwav_container_aiff) {
+                drwav_uint64 iFlip;
+                for (iFlip = 0; iFlip < sampleCount; iFlip += 1) {
+                    pBufferOut[iFlip] = -pBufferOut[iFlip];
+                }
+            }
+        }
+        #endif
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead PCM frames and writes them to pBufferOut as 32-bit float, dispatching on
+pWav->translatedFormatTag to the matching format-specific reader.
+
+Returns the number of frames read. Returns 0 when pWav is NULL, when framesToRead is 0, or when the
+format tag is not one of PCM, ADPCM, DVI ADPCM, IEEE float, A-law or mu-law. When pBufferOut is NULL
+the request is forwarded to drwav_read_pcm_frames() with a NULL buffer.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    if (pWav == NULL || framesToRead == 0) {
+        return 0;
+    }
+
+    if (pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
+    }
+
+    /*
+    Don't try to read more samples than can potentially fit in the output buffer. The limit is derived by
+    division rather than by multiplying framesToRead out, because the 64-bit product can wrap for very
+    large requests and would then slip past the clamp.
+    */
+    if (pWav->channels != 0 && framesToRead > DRWAV_SIZE_MAX / sizeof(float) / pWav->channels) {
+        framesToRead = DRWAV_SIZE_MAX / sizeof(float) / pWav->channels;
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) {
+        return drwav_read_pcm_frames_f32__pcm(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+        return drwav_read_pcm_frames_f32__msadpcm_ima(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) {
+        return drwav_read_pcm_frames_f32__ieee(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) {
+        return drwav_read_pcm_frames_f32__alaw(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
+        return drwav_read_pcm_frames_f32__mulaw(pWav, framesToRead, pBufferOut);
+    }
+
+    /* Unrecognized format. */
+    return 0;
+}
+
+/*
+Same as drwav_read_pcm_frames_f32(), except the samples written to pBufferOut are byte-swapped when the
+host is not little-endian, so the buffer always holds little-endian floats. Returns the number of
+frames read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut);
+
+    /*
+    Only touch pWav when frames were actually read. The reader returns 0 for a NULL pWav, so checking
+    framesRead first keeps pWav->channels from being dereferenced through a NULL pointer.
+    */
+    if (framesRead > 0 && pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) {
+        drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels);
+    }
+
+    return framesRead;
+}
+
+/*
+Same as drwav_read_pcm_frames_f32(), except the samples written to pBufferOut are byte-swapped when the
+host is little-endian, so the buffer always holds big-endian floats. Returns the number of frames read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut);
+
+    /*
+    Only touch pWav when frames were actually read. The reader returns 0 for a NULL pWav, so checking
+    framesRead first keeps pWav->channels from being dereferenced through a NULL pointer.
+    */
+    if (framesRead > 0 && pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) {
+        drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels);
+    }
+
+    return framesRead;
+}
+
+
+/*
+Converts unsigned 8-bit samples to 32-bit float. Does nothing when either pointer is NULL.
+
+The default mapping is "f32 = u8 * (2/255) - 1", which sends 0 to -1 and 255 to +1. When
+DR_WAV_LIBSNDFILE_COMPAT is defined a divisor of 256 is used instead to line up with libsndfile; that
+variant exists for automated comparison testing and is disabled by default.
+*/
+DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+#ifdef DR_WAV_LIBSNDFILE_COMPAT
+    /* libsndfile-style conversion: "f32 = (u8 / 256) * 2 - 1". */
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        pOut[iSample] = (pIn[iSample] / 256.0f) * 2 - 1;
+    }
+#else
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        float scaled = pIn[iSample];
+        scaled *= 0.00784313725490196078f;  /* 0..255 to 0..2 */
+        scaled -= 1;                        /* 0..2 to -1..1 */
+
+        pOut[iSample] = scaled;
+    }
+#endif
+}
+
+/*
+Converts signed 16-bit samples to 32-bit float by scaling with 1/32768. Does nothing when either
+pointer is NULL.
+*/
+DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        pOut[iSample] = pIn[iSample] * 0.000030517578125f;  /* 0.000030517578125 = 1/32768 */
+    }
+}
+
+/*
+Converts packed signed 24-bit samples (3 bytes each, least significant byte first) to 32-bit float by
+scaling with 1/8388608. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        const drwav_uint8* pBytes = pIn + (iSample * 3);
+        drwav_uint32 packed;
+        double sample;
+
+        /* Park the three bytes in the upper 24 bits so the sample's top bit lands on bit 31. */
+        packed  = (drwav_uint32)(pBytes[0]) <<  8;
+        packed |= (drwav_uint32)(pBytes[1]) << 16;
+        packed |= (drwav_uint32)(pBytes[2]) << 24;
+
+        /* Shift back down as a signed value (relies on the compiler doing an arithmetic shift), then scale. */
+        sample = (double)((drwav_int32)packed >> 8);
+        pOut[iSample] = (float)(sample * 0.00000011920928955078125);    /* 1/8388608 */
+    }
+}
+
+/*
+Converts signed 32-bit samples to 32-bit float by dividing by 2147483648 in double precision. Does
+nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        pOut[iSample] = (float)(pIn[iSample] / 2147483648.0);
+    }
+}
+
+/*
+Narrows 64-bit float samples to 32-bit float with a plain cast. Does nothing when either pointer is
+NULL.
+*/
+DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        pOut[iSample] = (float)pIn[iSample];
+    }
+}
+
+/*
+Decodes A-law bytes to 32-bit float: each byte is expanded with drwav__alaw_to_s16() and the result is
+divided by 32768. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        pOut[iSample] = drwav__alaw_to_s16(pIn[iSample]) / 32768.0f;
+    }
+}
+
+/*
+Decodes mu-law bytes to 32-bit float: each byte is expanded with drwav__mulaw_to_s16() and the result
+is divided by 32768. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        pOut[iSample] = drwav__mulaw_to_s16(pIn[iSample]) / 32768.0f;
+    }
+}
+
+
+
+/*
+Converts integer PCM samples of bytesPerSample bytes each to signed 32-bit.
+
+1, 2 and 3 byte samples go through drwav_u8_to_s32(), drwav_s16_to_s32() and drwav_s24_to_s32()
+respectively, and 4 byte samples are copied as-is. Sizes of 5 to 8 bytes use a generic converter that
+keeps the most significant 32 bits. Anything wider than 8 bytes is not supported and produces zeros.
+*/
+DRWAV_PRIVATE void drwav__pcm_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
+{
+    size_t i;   /* Same type as totalSampleCount so the loops below cannot wrap before reaching the count. */
+
+    /* Special case for 8-bit sample data because it's treated as unsigned. */
+    if (bytesPerSample == 1) {
+        drwav_u8_to_s32(pOut, pIn, totalSampleCount);
+        return;
+    }
+
+    /* Slightly more optimal implementation for common formats. */
+    if (bytesPerSample == 2) {
+        drwav_s16_to_s32(pOut, (const drwav_int16*)pIn, totalSampleCount);
+        return;
+    }
+    if (bytesPerSample == 3) {
+        drwav_s24_to_s32(pOut, pIn, totalSampleCount);
+        return;
+    }
+    if (bytesPerSample == 4) {
+        for (i = 0; i < totalSampleCount; ++i) {
+           *pOut++ = ((const drwav_int32*)pIn)[i];
+        }
+        return;
+    }
+
+
+    /* Anything more than 64 bits per sample is not supported. */
+    if (bytesPerSample > 8) {
+        DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
+        return;
+    }
+
+
+    /* Generic, slow converter. */
+    for (i = 0; i < totalSampleCount; ++i) {
+        drwav_uint64 sample = 0;
+        unsigned int shift  = (8 - bytesPerSample) * 8;    /* Start high enough that the last byte ends up in bits 56..63. */
+
+        unsigned int j;
+        for (j = 0; j < bytesPerSample; j += 1) {
+            DRWAV_ASSERT(j < 8);
+            sample |= (drwav_uint64)(pIn[j]) << shift;
+            shift  += 8;
+        }
+
+        pIn += j;
+        *pOut++ = (drwav_int32)((drwav_int64)sample >> 32);     /* Keep the upper 32 bits of the 64-bit word. */
+    }
+}
+
+/*
+Converts IEEE floating point samples to signed 32-bit. 32-bit input goes through drwav_f32_to_s32() and
+64-bit input through drwav_f64_to_s32(). No other sample size is handled; for those the output is
+zero-filled (silence).
+*/
+DRWAV_PRIVATE void drwav__ieee_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
+{
+    switch (bytesPerSample)
+    {
+        case 4:
+        {
+            drwav_f32_to_s32(pOut, (const float*)pIn, totalSampleCount);
+        } break;
+
+        case 8:
+        {
+            drwav_f64_to_s32(pOut, (const double*)pIn, totalSampleCount);
+        } break;
+
+        default:
+        {
+            /* Unsupported float width (16-bit float, for example). Emit silence. */
+            DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
+        } break;
+    }
+}
+
+
+/*
+Reads up to framesToRead frames of integer PCM and writes them to pBufferOut as signed 32-bit.
+
+When the source is already 32-bit PCM the frames are read straight into pBufferOut. Otherwise raw
+bytes are staged in a 4096 byte stack buffer and converted with drwav__pcm_to_s32(). Returns the number
+of frames produced, or 0 when the frame size is zero or not a whole number of bytes per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s32__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    /* Fast path: no conversion needed. */
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 32) {
+        return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut);
+    }
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never convert more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav__pcm_to_s32(pBufferOut, staging, (size_t)sampleCount, sampleSize);
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads ADPCM data as signed 32-bit by going through the s16 reader: each chunk is decoded into a
+2048-sample scratch buffer by drwav_read_pcm_frames_s16() and then widened with drwav_s16_to_s32().
+Returns the number of frames written to pBufferOut.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s32__msadpcm_ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_int16  scratch[2048];
+    drwav_uint64 framesDone = 0;
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = drwav_countof(scratch) / pWav->channels;     /* Whole frames that fit in the scratch buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames_s16(pWav, chunkFrames, scratch);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in the s16 reader. */
+
+        /* The cast to size_t is safe because the chunk is capped by the size of the scratch buffer. */
+        sampleCount = framesGot * pWav->channels;
+        drwav_s16_to_s32(pBufferOut, scratch, (size_t)sampleCount);
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead frames of IEEE float data and writes them to pBufferOut as signed 32-bit.
+
+Raw bytes are staged in a 4096 byte stack buffer and converted one chunk at a time with
+drwav__ieee_to_s32(). Returns the number of frames produced, or 0 when the frame size is zero or not a
+whole number of bytes per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s32__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never convert more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav__ieee_to_s32(pBufferOut, staging, (size_t)sampleCount, sampleSize);
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead frames of A-law data and writes them to pBufferOut as signed 32-bit.
+
+Raw bytes are staged in a 4096 byte stack buffer and decoded with drwav_alaw_to_s32(). When
+DR_WAV_LIBSNDFILE_COMPAT is defined, samples coming from an AIFF container are additionally negated.
+Returns the number of frames produced, or 0 when the frame size is zero or not a whole number of bytes
+per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s32__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never decode more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav_alaw_to_s32(pBufferOut, staging, (size_t)sampleCount);
+
+        #ifdef DR_WAV_LIBSNDFILE_COMPAT
+        {
+            /* Compatibility mode only: flip the sign of every decoded sample for AIFF containers. */
+            if (pWav->container == drwav_container_aiff) {
+                drwav_uint64 iFlip;
+                for (iFlip = 0; iFlip < sampleCount; iFlip += 1) {
+                    pBufferOut[iFlip] = -pBufferOut[iFlip];
+                }
+            }
+        }
+        #endif
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead frames of mu-law data and writes them to pBufferOut as signed 32-bit.
+
+Raw bytes are staged in a 4096 byte stack buffer and decoded with drwav_mulaw_to_s32(). When
+DR_WAV_LIBSNDFILE_COMPAT is defined, samples coming from an AIFF container are additionally negated.
+Returns the number of frames produced, or 0 when the frame size is zero or not a whole number of bytes
+per channel.
+*/
+DRWAV_PRIVATE drwav_uint64 drwav_read_pcm_frames_s32__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_uint8  staging[4096] = {0};
+    drwav_uint64 framesDone = 0;
+    drwav_uint32 frameSize;
+    drwav_uint32 sampleSize;
+
+    frameSize = drwav_get_bytes_per_pcm_frame(pWav);
+    if (frameSize == 0) {
+        return 0;
+    }
+
+    /* Only byte-aligned formats are supported. */
+    sampleSize = frameSize / pWav->channels;
+    if (sampleSize == 0 || (frameSize % pWav->channels) != 0) {
+        return 0;
+    }
+
+    while (framesToRead > 0) {
+        drwav_uint64 chunkFrames = sizeof(staging) / frameSize;     /* Whole frames that fit in the staging buffer. */
+        drwav_uint64 framesGot;
+        drwav_uint64 sampleCount;
+
+        if (chunkFrames > framesToRead) {
+            chunkFrames = framesToRead;
+        }
+
+        framesGot = drwav_read_pcm_frames(pWav, chunkFrames, staging);
+        if (framesGot == 0) {
+            break;
+        }
+
+        DRWAV_ASSERT(framesGot <= chunkFrames);     /* A failure here points at a bug in drwav_read_pcm_frames(). */
+
+        /* Never decode more bytes than the staging buffer holds. This protects against invalid files. */
+        sampleCount = framesGot * pWav->channels;
+        if ((sampleCount * sampleSize) > sizeof(staging)) {
+            DRWAV_ASSERT(DRWAV_FALSE);  /* Not expected with a valid file. */
+            break;
+        }
+
+        drwav_mulaw_to_s32(pBufferOut, staging, (size_t)sampleCount);
+
+        #ifdef DR_WAV_LIBSNDFILE_COMPAT
+        {
+            /* Compatibility mode only: flip the sign of every decoded sample for AIFF containers. */
+            if (pWav->container == drwav_container_aiff) {
+                drwav_uint64 iFlip;
+                for (iFlip = 0; iFlip < sampleCount; iFlip += 1) {
+                    pBufferOut[iFlip] = -pBufferOut[iFlip];
+                }
+            }
+        }
+        #endif
+
+        pBufferOut   += sampleCount;
+        framesToRead -= framesGot;
+        framesDone   += framesGot;
+    }
+
+    return framesDone;
+}
+
+/*
+Reads up to framesToRead PCM frames and writes them to pBufferOut as signed 32-bit, dispatching on
+pWav->translatedFormatTag to the matching format-specific reader.
+
+Returns the number of frames read. Returns 0 when pWav is NULL, when framesToRead is 0, or when the
+format tag is not one of PCM, ADPCM, DVI ADPCM, IEEE float, A-law or mu-law. When pBufferOut is NULL
+the request is forwarded to drwav_read_pcm_frames() with a NULL buffer.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    if (pWav == NULL || framesToRead == 0) {
+        return 0;
+    }
+
+    if (pBufferOut == NULL) {
+        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
+    }
+
+    /*
+    Don't try to read more samples than can potentially fit in the output buffer. The limit is derived by
+    division rather than by multiplying framesToRead out, because the 64-bit product can wrap for very
+    large requests and would then slip past the clamp.
+    */
+    if (pWav->channels != 0 && framesToRead > DRWAV_SIZE_MAX / sizeof(drwav_int32) / pWav->channels) {
+        framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int32) / pWav->channels;
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) {
+        return drwav_read_pcm_frames_s32__pcm(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
+        return drwav_read_pcm_frames_s32__msadpcm_ima(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) {
+        return drwav_read_pcm_frames_s32__ieee(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) {
+        return drwav_read_pcm_frames_s32__alaw(pWav, framesToRead, pBufferOut);
+    }
+
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
+        return drwav_read_pcm_frames_s32__mulaw(pWav, framesToRead, pBufferOut);
+    }
+
+    /* Unrecognized format. */
+    return 0;
+}
+
+/*
+Same as drwav_read_pcm_frames_s32(), except the samples written to pBufferOut are byte-swapped when the
+host is not little-endian, so the buffer always holds little-endian integers. Returns the number of
+frames read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut);
+
+    /*
+    Only touch pWav when frames were actually read. The reader returns 0 for a NULL pWav, so checking
+    framesRead first keeps pWav->channels from being dereferenced through a NULL pointer.
+    */
+    if (framesRead > 0 && pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) {
+        drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels);
+    }
+
+    return framesRead;
+}
+
+/*
+Same as drwav_read_pcm_frames_s32(), except the samples written to pBufferOut are byte-swapped when the
+host is little-endian, so the buffer always holds big-endian integers. Returns the number of frames
+read.
+*/
+DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{
+    drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut);
+
+    /*
+    Only touch pWav when frames were actually read. The reader returns 0 for a NULL pWav, so checking
+    framesRead first keeps pWav->channels from being dereferenced through a NULL pointer.
+    */
+    if (framesRead > 0 && pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) {
+        drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels);
+    }
+
+    return framesRead;
+}
+
+
+/*
+Converts unsigned 8-bit samples to signed 32-bit: the sample is re-centred around zero by subtracting
+128 and then moved into the most significant byte. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t i;
+
+    if (pOut == NULL || pIn == NULL) {
+        return;
+    }
+
+    for (i = 0; i < sampleCount; ++i) {
+        /*
+        The shift is done on an unsigned value. Inputs below 128 produce a negative intermediate, and left
+        shifting a negative signed integer is undefined behaviour in C.
+        */
+        *pOut++ = (drwav_int32)((drwav_uint32)((int)pIn[i] - 128) << 24);
+    }
+}
+
+/*
+Converts signed 16-bit samples to signed 32-bit by moving each sample into the upper 16 bits. Does
+nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount)
+{
+    size_t i;
+
+    if (pOut == NULL || pIn == NULL) {
+        return;
+    }
+
+    for (i = 0; i < sampleCount; ++i) {
+        /* Shift as unsigned: left shifting a negative signed integer is undefined behaviour in C. */
+        *pOut++ = (drwav_int32)((drwav_uint32)(drwav_int32)pIn[i] << 16);
+    }
+}
+
+/*
+Converts packed signed 24-bit samples (3 bytes each, least significant byte first) to signed 32-bit by
+placing the three bytes in the upper 24 bits of the output; the low byte is left as zero. Does nothing
+when either pointer is NULL.
+*/
+DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t iSample;
+
+    if (pIn == NULL || pOut == NULL) {
+        return;
+    }
+
+    for (iSample = 0; iSample < sampleCount; iSample += 1) {
+        const drwav_uint8* pBytes = pIn + (iSample * 3);
+        unsigned int lo  = pBytes[0];
+        unsigned int mid = pBytes[1];
+        unsigned int hi  = pBytes[2];
+
+        pOut[iSample] = (drwav_int32)((lo << 8) | (mid << 16) | (hi << 24));
+    }
+}
+
+/*
+Converts 32-bit float samples to signed 32-bit by scaling with 2147483648 and truncating.
+
+Scaled values outside the range of a 32-bit integer saturate to the nearest limit (so an input of +1.0
+yields 2147483647), and NaN yields 0. Without this the float-to-integer conversion would be undefined
+for those inputs. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount)
+{
+    size_t i;
+
+    if (pOut == NULL || pIn == NULL) {
+        return;
+    }
+
+    for (i = 0; i < sampleCount; ++i) {
+        float scaled = 2147483648.0f * pIn[i];
+
+        if (scaled >= 2147483648.0f) {
+            *pOut++ = 2147483647;                       /* Too large: saturate high. */
+        } else if (scaled >= -2147483648.0f) {
+            *pOut++ = (drwav_int32)scaled;              /* In range: plain truncation, as before. */
+        } else if (scaled < -2147483648.0f) {
+            *pOut++ = (drwav_int32)(-2147483647 - 1);   /* Too small: saturate low. */
+        } else {
+            *pOut++ = 0;                                /* NaN fails every comparison above. */
+        }
+    }
+}
+
+/*
+Converts 64-bit float samples to signed 32-bit by scaling with 2147483648 and truncating.
+
+Scaled values outside the range of a 32-bit integer saturate to the nearest limit (so an input of +1.0
+yields 2147483647), and NaN yields 0. Without this the float-to-integer conversion would be undefined
+for those inputs. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount)
+{
+    size_t i;
+
+    if (pOut == NULL || pIn == NULL) {
+        return;
+    }
+
+    for (i = 0; i < sampleCount; ++i) {
+        double scaled = 2147483648.0 * pIn[i];
+
+        if (scaled >= 2147483648.0) {
+            *pOut++ = 2147483647;                       /* Too large: saturate high. */
+        } else if (scaled >= -2147483648.0) {
+            *pOut++ = (drwav_int32)scaled;              /* In range: plain truncation, as before. */
+        } else if (scaled < -2147483648.0) {
+            *pOut++ = (drwav_int32)(-2147483647 - 1);   /* Too small: saturate low. */
+        } else {
+            *pOut++ = 0;                                /* NaN fails every comparison above. */
+        }
+    }
+}
+
+/*
+Decodes A-law bytes to signed 32-bit: each byte is expanded with drwav__alaw_to_s16() and the result is
+moved into the upper 16 bits. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t i;
+
+    if (pOut == NULL || pIn == NULL) {
+        return;
+    }
+
+    for (i = 0; i < sampleCount; ++i) {
+        /* Shift as unsigned: left shifting a negative signed integer is undefined behaviour in C. */
+        *pOut++ = (drwav_int32)((drwav_uint32)(drwav_int32)drwav__alaw_to_s16(pIn[i]) << 16);
+    }
+}
+
+/*
+Decodes mu-law bytes to signed 32-bit: each byte is expanded with drwav__mulaw_to_s16() and the result
+is moved into the upper 16 bits. Does nothing when either pointer is NULL.
+*/
+DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
+{
+    size_t i;
+
+    if (pOut == NULL || pIn == NULL) {
+        return;
+    }
+
+    for (i = 0; i < sampleCount; ++i) {
+        /* Shift as unsigned: left shifting a negative signed integer is undefined behaviour in C. */
+        *pOut++ = (drwav_int32)((drwav_uint32)(drwav_int32)drwav__mulaw_to_s16(pIn[i]) << 16);
+    }
+}
+
+
+
+/*
+Reads every PCM frame of pWav as signed 16-bit into a newly allocated buffer and uninitializes pWav.
+
+The buffer is allocated with drwav__malloc_from_callbacks() using pWav->allocationCallbacks. On
+success the buffer is returned and, for each non-NULL output pointer, the channel count, sample rate
+and total frame count are reported. NULL is returned when the channel count is zero, the buffer size
+would not fit in a size_t, the allocation fails, or fewer frames than expected were read. pWav is
+uninitialized with drwav_uninit() in every case.
+*/
+DRWAV_PRIVATE drwav_int16* drwav__read_pcm_frames_and_close_s16(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount)
+{
+    drwav_int16* pFrames = NULL;
+    drwav_uint64 byteCount;
+
+    DRWAV_ASSERT(pWav != NULL);
+
+    /* The size is validated by division first so the multiplication below cannot overflow. */
+    if (pWav->channels != 0 && pWav->totalPCMFrameCount <= DRWAV_SIZE_MAX / pWav->channels / sizeof(drwav_int16)) {
+        byteCount = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int16);
+        if (byteCount <= DRWAV_SIZE_MAX) {
+            pFrames = (drwav_int16*)drwav__malloc_from_callbacks((size_t)byteCount, &pWav->allocationCallbacks);   /* <-- Safe cast due to the checks above. */
+            if (pFrames != NULL) {
+                if (drwav_read_pcm_frames_s16(pWav, (size_t)pWav->totalPCMFrameCount, pFrames) != pWav->totalPCMFrameCount) {
+                    /* There was an error reading the samples. */
+                    drwav__free_from_callbacks(pFrames, &pWav->allocationCallbacks);
+                    pFrames = NULL;
+                }
+            }
+        }
+    }
+
+    /* The decoder is closed on both the success and the failure path. */
+    drwav_uninit(pWav);
+
+    if (pFrames == NULL) {
+        return NULL;
+    }
+
+    if (sampleRate) {
+        *sampleRate = pWav->sampleRate;
+    }
+    if (channels) {
+        *channels = pWav->channels;
+    }
+    if (totalFrameCount) {
+        *totalFrameCount = pWav->totalPCMFrameCount;
+    }
+
+    return pFrames;
+}
+
+/*
+Reads every PCM frame of pWav as 32-bit float into a newly allocated buffer and uninitializes pWav.
+
+The buffer is allocated with drwav__malloc_from_callbacks() using pWav->allocationCallbacks. On
+success the buffer is returned and, for each non-NULL output pointer, the channel count, sample rate
+and total frame count are reported. NULL is returned when the channel count is zero, the buffer size
+would not fit in a size_t, the allocation fails, or fewer frames than expected were read. pWav is
+uninitialized with drwav_uninit() in every case.
+*/
+DRWAV_PRIVATE float* drwav__read_pcm_frames_and_close_f32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount)
+{
+    float* pFrames = NULL;
+    drwav_uint64 byteCount;
+
+    DRWAV_ASSERT(pWav != NULL);
+
+    /* The size is validated by division first so the multiplication below cannot overflow. */
+    if (pWav->channels != 0 && pWav->totalPCMFrameCount <= DRWAV_SIZE_MAX / pWav->channels / sizeof(float)) {
+        byteCount = pWav->totalPCMFrameCount * pWav->channels * sizeof(float);
+        if (byteCount <= DRWAV_SIZE_MAX) {
+            pFrames = (float*)drwav__malloc_from_callbacks((size_t)byteCount, &pWav->allocationCallbacks);   /* <-- Safe cast due to the checks above. */
+            if (pFrames != NULL) {
+                if (drwav_read_pcm_frames_f32(pWav, (size_t)pWav->totalPCMFrameCount, pFrames) != pWav->totalPCMFrameCount) {
+                    /* There was an error reading the samples. */
+                    drwav__free_from_callbacks(pFrames, &pWav->allocationCallbacks);
+                    pFrames = NULL;
+                }
+            }
+        }
+    }
+
+    /* The decoder is closed on both the success and the failure path. */
+    drwav_uninit(pWav);
+
+    if (pFrames == NULL) {
+        return NULL;
+    }
+
+    if (sampleRate) {
+        *sampleRate = pWav->sampleRate;
+    }
+    if (channels) {
+        *channels = pWav->channels;
+    }
+    if (totalFrameCount) {
+        *totalFrameCount = pWav->totalPCMFrameCount;
+    }
+
+    return pFrames;
+}
+
+/*
+Reads every PCM frame from an initialised decoder as signed 32-bit integers, then uninitialises the decoder.
+
+pWav            - Decoder to drain. drwav_uninit() is called on it on every path out of this function.
+channels        - Optional. Receives pWav->channels on success.
+sampleRate      - Optional. Receives pWav->sampleRate on success.
+totalFrameCount - Optional. Receives pWav->totalPCMFrameCount on success.
+
+Returns a buffer allocated through pWav->allocationCallbacks that holds totalPCMFrameCount * channels
+samples. Returns NULL when the channel count is zero, the byte size would not fit in a size_t, the
+allocation fails, or fewer frames than totalPCMFrameCount were read. The output parameters are not
+written on failure.
+*/
+DRWAV_PRIVATE drwav_int32* drwav__read_pcm_frames_and_close_s32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount)
+{
+    drwav_uint64 sampleDataSize;
+    drwav_int32* pSampleData;
+    drwav_uint64 framesRead;
+    drwav_uint64 resultFrameCount;
+    unsigned int resultChannels;
+    unsigned int resultSampleRate;
+
+    DRWAV_ASSERT(pWav != NULL);
+
+    /* Check for overflow before multiplication. */
+    if (pWav->channels == 0 || pWav->totalPCMFrameCount > DRWAV_SIZE_MAX / pWav->channels / sizeof(drwav_int32)) {
+        drwav_uninit(pWav);
+        return NULL;    /* Overflow or invalid channels. */
+    }
+
+    /* Cannot exceed DRWAV_SIZE_MAX once the guard above has passed; the check is kept as a second line of defence. */
+    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int32);
+    if (sampleDataSize > DRWAV_SIZE_MAX) {
+        drwav_uninit(pWav);
+        return NULL;    /* File's too big. */
+    }
+
+    pSampleData = (drwav_int32*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */
+    if (pSampleData == NULL) {
+        drwav_uninit(pWav);
+        return NULL;    /* Failed to allocate memory. */
+    }
+
+    framesRead = drwav_read_pcm_frames_s32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
+    if (framesRead != pWav->totalPCMFrameCount) {
+        /* A short read is treated as failure; the buffer is released with the decoder's own callbacks. */
+        drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
+        drwav_uninit(pWav);
+        return NULL;    /* There was an error reading the samples. */
+    }
+
+    /*
+    Snapshot the format details while the decoder is still live, so nothing is read from *pWav after
+    drwav_uninit() and the results do not depend on uninit leaving these fields intact.
+    */
+    resultSampleRate = pWav->sampleRate;
+    resultChannels   = pWav->channels;
+    resultFrameCount = pWav->totalPCMFrameCount;
+
+    drwav_uninit(pWav);
+
+    if (sampleRate) {
+        *sampleRate = resultSampleRate;
+    }
+    if (channels) {
+        *channels = resultChannels;
+    }
+    if (totalFrameCount) {
+        *totalFrameCount = resultFrameCount;
+    }
+
+    return pSampleData;
+}
+
+
+
+/*
+Opens a decoder over user-supplied read/seek/tell callbacks and decodes the stream as signed 16-bit PCM.
+
+Every non-NULL output pointer is zeroed before the decoder is initialised, so it holds 0 if drwav_init()
+fails. Returns NULL when drwav_init() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s16() returns.
+*/
+DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init(&decoder, onRead, onSeek, onTell, pUserData, pAllocationCallbacks)) {
+        /* The helper takes over the decoder from here. */
+        return drwav__read_pcm_frames_and_close_s16(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Opens a decoder over user-supplied read/seek/tell callbacks and decodes the stream as 32-bit floats.
+
+Every non-NULL output pointer is zeroed before the decoder is initialised, so it holds 0 if drwav_init()
+fails. Returns NULL when drwav_init() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_f32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init(&decoder, onRead, onSeek, onTell, pUserData, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_f32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Opens a decoder over user-supplied read/seek/tell callbacks and decodes the stream as signed 32-bit PCM.
+
+Every non-NULL output pointer is zeroed before the decoder is initialised, so it holds 0 if drwav_init()
+fails. Returns NULL when drwav_init() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_tell_proc onTell, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init(&decoder, onRead, onSeek, onTell, pUserData, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_s32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+#ifndef DR_WAV_NO_STDIO
+/*
+Opens the file named by `filename` and decodes it as signed 16-bit PCM.
+
+Every non-NULL output pointer is zeroed before the file is opened, so it holds 0 if drwav_init_file()
+fails. Returns NULL when drwav_init_file() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s16() returns.
+*/
+DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init_file(&decoder, filename, pAllocationCallbacks)) {
+        /* The helper takes over the decoder from here. */
+        return drwav__read_pcm_frames_and_close_s16(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Opens the file named by `filename` and decodes it as 32-bit floats.
+
+Every non-NULL output pointer is zeroed before the file is opened, so it holds 0 if drwav_init_file()
+fails. Returns NULL when drwav_init_file() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_f32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init_file(&decoder, filename, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_f32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Opens the file named by `filename` and decodes it as signed 32-bit PCM.
+
+Every non-NULL output pointer is zeroed before the file is opened, so it holds 0 if drwav_init_file()
+fails. Returns NULL when drwav_init_file() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init_file(&decoder, filename, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_s32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+
+#ifndef DR_WAV_NO_WCHAR
+/*
+Wide-character variant: opens the file named by `filename` and decodes it as signed 16-bit PCM.
+
+Every non-NULL output pointer is zeroed before the file is opened, so it holds 0 if drwav_init_file_w()
+fails. Returns NULL when drwav_init_file_w() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s16() returns.
+*/
+DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+
+    if (drwav_init_file_w(&decoder, filename, pAllocationCallbacks)) {
+        /* The helper takes over the decoder from here. */
+        return drwav__read_pcm_frames_and_close_s16(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Wide-character variant: opens the file named by `filename` and decodes it as 32-bit floats.
+
+Every non-NULL output pointer is zeroed before the file is opened, so it holds 0 if drwav_init_file_w()
+fails. Returns NULL when drwav_init_file_w() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_f32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+
+    if (drwav_init_file_w(&decoder, filename, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_f32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Wide-character variant: opens the file named by `filename` and decodes it as signed 32-bit PCM.
+
+Every non-NULL output pointer is zeroed before the file is opened, so it holds 0 if drwav_init_file_w()
+fails. Returns NULL when drwav_init_file_w() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+
+    if (drwav_init_file_w(&decoder, filename, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_s32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+#endif /* DR_WAV_NO_WCHAR */
+#endif /* DR_WAV_NO_STDIO */
+
+/*
+Decodes a WAV image held in memory (`data`, `dataSize` bytes) as signed 16-bit PCM.
+
+Every non-NULL output pointer is zeroed before the decoder is initialised, so it holds 0 if
+drwav_init_memory() fails. Returns NULL when drwav_init_memory() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s16() returns.
+*/
+DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init_memory(&decoder, data, dataSize, pAllocationCallbacks)) {
+        /* The helper takes over the decoder from here. */
+        return drwav__read_pcm_frames_and_close_s16(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Decodes a WAV image held in memory (`data`, `dataSize` bytes) as 32-bit floats.
+
+Every non-NULL output pointer is zeroed before the decoder is initialised, so it holds 0 if
+drwav_init_memory() fails. Returns NULL when drwav_init_memory() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_f32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init_memory(&decoder, data, dataSize, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_f32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+
+/*
+Decodes a WAV image held in memory (`data`, `dataSize` bytes) as signed 32-bit PCM.
+
+Every non-NULL output pointer is zeroed before the decoder is initialised, so it holds 0 if
+drwav_init_memory() fails. Returns NULL when drwav_init_memory() fails; otherwise returns whatever
+drwav__read_pcm_frames_and_close_s32() returns (a sample buffer, or NULL on a read/allocation failure).
+*/
+DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    drwav decoder;
+
+    /* Give the outputs a defined value up front. */
+    if (totalFrameCountOut != NULL) {
+        *totalFrameCountOut = 0;
+    }
+    if (sampleRateOut != NULL) {
+        *sampleRateOut = 0;
+    }
+    if (channelsOut != NULL) {
+        *channelsOut = 0;
+    }
+
+    if (drwav_init_memory(&decoder, data, dataSize, pAllocationCallbacks)) {
+        /* The helper uninitialises the decoder on every path. */
+        return drwav__read_pcm_frames_and_close_s32(&decoder, channelsOut, sampleRateOut, totalFrameCountOut);
+    }
+
+    return NULL;
+}
+#endif  /* DR_WAV_NO_CONVERSION_API */
+
+
+/*
+Releases `p`.
+
+When `pAllocationCallbacks` is NULL the block is handed to drwav__free_default(); otherwise it is handed
+to drwav__free_from_callbacks() together with the supplied callbacks.
+*/
+DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks)
+{
+    if (pAllocationCallbacks == NULL) {
+        /* No custom allocator supplied: use the default free routine. */
+        drwav__free_default(p, NULL);
+        return;
+    }
+
+    drwav__free_from_callbacks(p, pAllocationCallbacks);
+}
+
+/* Assembles an unsigned 16-bit value from two bytes: data[0] is the low byte and data[1] the high byte. */
+DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data)
+{
+    const drwav_uint16 lowByte  = data[0];
+    const drwav_uint16 highByte = data[1];
+
+    return (drwav_uint16)(lowByte | (highByte << 8));
+}
+
+/* Decodes two bytes with drwav_bytes_to_u16() and converts the result to a signed 16-bit integer. */
+DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data)
+{
+    const drwav_uint16 bits = drwav_bytes_to_u16(data);
+
+    return (drwav_int16)bits;
+}
+
+/* Public entry point that forwards to drwav_bytes_to_u32_le() and returns its result unchanged. */
+DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data)
+{
+    const drwav_uint32 decoded = drwav_bytes_to_u32_le(data);
+
+    return decoded;
+}
+
+/*
+Decodes four bytes with drwav_bytes_to_u32() and returns the same 32-bit pattern viewed as a float.
+The reinterpretation goes through a union; no numeric conversion takes place.
+*/
+DRWAV_API float drwav_bytes_to_f32(const drwav_uint8* data)
+{
+    union {
+        drwav_uint32 asBits;
+        float asFloat;
+    } pun;
+
+    pun.asBits = drwav_bytes_to_u32(data);
+
+    return pun.asFloat;
+}
+
+/* Decodes four bytes with drwav_bytes_to_u32() and converts the result to a signed 32-bit integer. */
+DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data)
+{
+    const drwav_uint32 bits = drwav_bytes_to_u32(data);
+
+    return (drwav_int32)bits;
+}
+
+/* Assembles an unsigned 64-bit value from eight bytes: data[0] is the least significant byte, data[7] the most. */
+DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data)
+{
+    drwav_uint64 value = 0;
+    int i;
+
+    /* Walk from the most significant byte down, shifting earlier bytes up as each new one arrives. */
+    for (i = 7; i >= 0; i -= 1) {
+        value = (value << 8) | (drwav_uint64)data[i];
+    }
+
+    return value;
+}
+
+/* Decodes eight bytes with drwav_bytes_to_u64() and converts the result to a signed 64-bit integer. */
+DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data)
+{
+    const drwav_uint64 bits = drwav_bytes_to_u64(data);
+
+    return (drwav_int64)bits;
+}
+
+
+/*
+Compares two 16-byte GUIDs byte by byte, stopping at the first difference.
+Returns DRWAV_TRUE when all 16 bytes match, DRWAV_FALSE otherwise.
+*/
+DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16])
+{
+    int matched = 0;
+
+    /* Count the length of the common prefix. */
+    while (matched < 16 && a[matched] == b[matched]) {
+        matched += 1;
+    }
+
+    if (matched < 16) {
+        return DRWAV_FALSE;
+    }
+
+    return DRWAV_TRUE;
+}
+
+/*
+Compares the first four bytes of `a` with the first four characters of `b`, stopping at the first
+difference. Returns non-zero (1) when all four match and 0 otherwise.
+*/
+DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b)
+{
+    int matched = 0;
+
+    /* Count the length of the common prefix, up to four characters. */
+    while (matched < 4 && a[matched] == b[matched]) {
+        matched += 1;
+    }
+
+    return matched == 4;
+}
+
+#ifdef __MRC__
+/* Undo the pragma at the beginning of this file. */
+#pragma options opt reset
+#endif
+
+#endif  /* dr_wav_c */
+#endif  /* DR_WAV_IMPLEMENTATION */
+
+/*
+REVISION HISTORY
+================
+v0.14.3 - 2025-12-14
+  - Fix a possible out-of-bounds read when reading from MS-ADPCM encoded files.
+  - Fix a possible integer overflow error.
+
+v0.14.2 - 2025-12-02
+  - Fix a compilation warning.
+
+v0.14.1 - 2025-09-10
+  - Fix an error with the NXDK build.
+
+v0.14.0 - 2025-07-23
+  - API CHANGE: Seek origin enums have been renamed to the following:
+    - drwav_seek_origin_start   -> DRWAV_SEEK_SET
+    - drwav_seek_origin_current -> DRWAV_SEEK_CUR
+    - DRWAV_SEEK_END (new)
+  - API CHANGE: A new seek origin has been added to allow seeking from the end of the file. If you implement your own `onSeek` callback, you must now handle `DRWAV_SEEK_END`. If you only use `*_init_file()` or `*_init_memory()`, you need not change anything.
+  - API CHANGE: An `onTell` callback has been added to the following functions:
+    - drwav_init()
+    - drwav_init_ex()
+    - drwav_init_with_metadata()
+    - drwav_open_and_read_pcm_frames_s16()
+    - drwav_open_and_read_pcm_frames_f32()
+    - drwav_open_and_read_pcm_frames_s32()
+  - API CHANGE: The `firstSampleByteOffset`, `lastSampleByteOffset` and `sampleByteOffset` members of `drwav_cue_point` have been renamed to `firstSampleOffset`, `lastSampleOffset` and `sampleOffset`, respectively.
+  - Fix a static analysis warning.
+  - Fix compilation for AIX OS.
+
+v0.13.17 - 2024-12-17
+  - Fix a possible crash when reading from MS-ADPCM encoded files.
+  - Improve detection of ARM64EC
+
+v0.13.16 - 2024-02-27
+  - Fix a Wdouble-promotion warning.
+
+v0.13.15 - 2024-01-23
+  - Relax some unnecessary validation that prevented some files from loading.
+
+v0.13.14 - 2023-12-02
+  - Fix a warning about an unused variable.
+
+v0.13.13 - 2023-11-02
+  - Fix a warning when compiling with Clang.
+
+v0.13.12 - 2023-08-07
+  - Fix a possible crash in drwav_read_pcm_frames().
+
+v0.13.11 - 2023-07-07
+  - AIFF compatibility improvements.
+
+v0.13.10 - 2023-05-29
+  - Fix a bug where drwav_init_with_metadata() does not decode any frames after initialization.
+
+v0.13.9 - 2023-05-22
+  - Add support for AIFF decoding (writing and metadata not supported).
+  - Add support for RIFX decoding (writing and metadata not supported).
+  - Fix a bug where metadata is not processed if it's located before the "fmt " chunk.
+  - Add a workaround for a type of malformed WAV file where the size of the "RIFF" and "data" chunks
+    are incorrectly set to 0xFFFFFFFF.
+
+v0.13.8 - 2023-03-25
+  - Fix a possible null pointer dereference.
+  - Fix a crash when loading files with badly formed metadata.
+
+v0.13.7 - 2022-09-17
+  - Fix compilation with DJGPP.
+  - Add support for disabling wchar_t with DR_WAV_NO_WCHAR.
+
+v0.13.6 - 2022-04-10
+  - Fix compilation error on older versions of GCC.
+  - Remove some dependencies on the standard library.
+
+v0.13.5 - 2022-01-26
+  - Fix an error when seeking to the end of the file.
+
+v0.13.4 - 2021-12-08
+  - Fix some static analysis warnings.
+
+v0.13.3 - 2021-11-24
+  - Fix an incorrect assertion when trying to endian swap 1-byte sample formats. This is now a no-op
+    rather than a failed assertion.
+  - Fix a bug with parsing of the bext chunk.
+  - Fix some static analysis warnings.
+
+v0.13.2 - 2021-10-02
+  - Fix a possible buffer overflow when reading from compressed formats.
+
+v0.13.1 - 2021-07-31
+  - Fix platform detection for ARM64.
+
+v0.13.0 - 2021-07-01
+  - Improve support for reading and writing metadata. Use the `_with_metadata()` APIs to initialize
+    a WAV decoder and store the metadata within the `drwav` object. Use the `pMetadata` and
+    `metadataCount` members of the `drwav` object to read the data. The old way of handling metadata
+    via a callback is still usable and valid.
+  - API CHANGE: drwav_target_write_size_bytes() now takes extra parameters for calculating the
+    required write size when writing metadata.
+  - Add drwav_get_cursor_in_pcm_frames()
+  - Add drwav_get_length_in_pcm_frames()
+  - Fix a bug where drwav_read_raw() can call the read callback with a byte count of zero.
+
+v0.12.20 - 2021-06-11
+  - Fix some undefined behavior.
+
+v0.12.19 - 2021-02-21
+  - Fix a warning due to referencing _MSC_VER when it is undefined.
+  - Minor improvements to the management of some internal state concerning the data chunk cursor.
+
+v0.12.18 - 2021-01-31
+  - Clean up some static analysis warnings.
+
+v0.12.17 - 2021-01-17
+  - Minor fix to sample code in documentation.
+  - Correctly qualify a private API as private rather than public.
+  - Code cleanup.
+
+v0.12.16 - 2020-12-02
+  - Fix a bug when trying to read more bytes than can fit in a size_t.
+
+v0.12.15 - 2020-11-21
+  - Fix compilation with OpenWatcom.
+
+v0.12.14 - 2020-11-13
+  - Minor code clean up.
+
+v0.12.13 - 2020-11-01
+  - Improve compiler support for older versions of GCC.
+
+v0.12.12 - 2020-09-28
+  - Add support for RF64.
+  - Fix a bug in writing mode where the size of the RIFF chunk incorrectly includes the header section.
+
+v0.12.11 - 2020-09-08
+  - Fix a compilation error on older compilers.
+
+v0.12.10 - 2020-08-24
+  - Fix a bug when seeking with ADPCM formats.
+
+v0.12.9 - 2020-08-02
+  - Simplify sized types.
+
+v0.12.8 - 2020-07-25
+  - Fix a compilation warning.
+
+v0.12.7 - 2020-07-15
+  - Fix some bugs on big-endian architectures.
+  - Fix an error in s24 to f32 conversion.
+
+v0.12.6 - 2020-06-23
+  - Change drwav_read_*() to allow NULL to be passed in as the output buffer which is equivalent to a forward seek.
+  - Fix a buffer overflow when trying to decode invalid IMA-ADPCM files.
+  - Add include guard for the implementation section.
+
+v0.12.5 - 2020-05-27
+  - Minor documentation fix.
+
+v0.12.4 - 2020-05-16
+  - Replace assert() with DRWAV_ASSERT().
+  - Add compile-time and run-time version querying.
+    - DRWAV_VERSION_MINOR
+    - DRWAV_VERSION_MAJOR
+    - DRWAV_VERSION_REVISION
+    - DRWAV_VERSION_STRING
+    - drwav_version()
+    - drwav_version_string()
+
+v0.12.3 - 2020-04-30
+  - Fix compilation errors with VC6.
+
+v0.12.2 - 2020-04-21
+  - Fix a bug where drwav_init_file() does not close the file handle after attempting to load an erroneous file.
+
+v0.12.1 - 2020-04-13
+  - Fix some pedantic warnings.
+
+v0.12.0 - 2020-04-04
+  - API CHANGE: Add container and format parameters to the chunk callback.
+  - Minor documentation updates.
+
+v0.11.5 - 2020-03-07
+  - Fix compilation error with Visual Studio .NET 2003.
+
+v0.11.4 - 2020-01-29
+  - Fix some static analysis warnings.
+  - Fix a bug when reading f32 samples from an A-law encoded stream.
+
+v0.11.3 - 2020-01-12
+  - Minor changes to some f32 format conversion routines.
+  - Minor bug fix for ADPCM conversion when end of file is reached.
+
+v0.11.2 - 2019-12-02
+  - Fix a possible crash when using custom memory allocators without a custom realloc() implementation.
+  - Fix an integer overflow bug.
+  - Fix a null pointer dereference bug.
+  - Add limits to sample rate, channels and bits per sample to tighten up some validation.
+
+v0.11.1 - 2019-10-07
+  - Internal code clean up.
+
+v0.11.0 - 2019-10-06
+  - API CHANGE: Add support for user defined memory allocation routines. This system allows the program to specify their own memory allocation
+    routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs:
+    - drwav_init()
+    - drwav_init_ex()
+    - drwav_init_file()
+    - drwav_init_file_ex()
+    - drwav_init_file_w()
+    - drwav_init_file_w_ex()
+    - drwav_init_memory()
+    - drwav_init_memory_ex()
+    - drwav_init_write()
+    - drwav_init_write_sequential()
+    - drwav_init_write_sequential_pcm_frames()
+    - drwav_init_file_write()
+    - drwav_init_file_write_sequential()
+    - drwav_init_file_write_sequential_pcm_frames()
+    - drwav_init_file_write_w()
+    - drwav_init_file_write_sequential_w()
+    - drwav_init_file_write_sequential_pcm_frames_w()
+    - drwav_init_memory_write()
+    - drwav_init_memory_write_sequential()
+    - drwav_init_memory_write_sequential_pcm_frames()
+    - drwav_open_and_read_pcm_frames_s16()
+    - drwav_open_and_read_pcm_frames_f32()
+    - drwav_open_and_read_pcm_frames_s32()
+    - drwav_open_file_and_read_pcm_frames_s16()
+    - drwav_open_file_and_read_pcm_frames_f32()
+    - drwav_open_file_and_read_pcm_frames_s32()
+    - drwav_open_file_and_read_pcm_frames_s16_w()
+    - drwav_open_file_and_read_pcm_frames_f32_w()
+    - drwav_open_file_and_read_pcm_frames_s32_w()
+    - drwav_open_memory_and_read_pcm_frames_s16()
+    - drwav_open_memory_and_read_pcm_frames_f32()
+    - drwav_open_memory_and_read_pcm_frames_s32()
+    Set this extra parameter to NULL to use defaults which is the same as the previous behaviour. Setting this NULL will use
+    DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE.
+  - Add support for reading and writing PCM frames in an explicit endianness. New APIs:
+    - drwav_read_pcm_frames_le()
+    - drwav_read_pcm_frames_be()
+    - drwav_read_pcm_frames_s16le()
+    - drwav_read_pcm_frames_s16be()
+    - drwav_read_pcm_frames_f32le()
+    - drwav_read_pcm_frames_f32be()
+    - drwav_read_pcm_frames_s32le()
+    - drwav_read_pcm_frames_s32be()
+    - drwav_write_pcm_frames_le()
+    - drwav_write_pcm_frames_be()
+  - Remove deprecated APIs.
+  - API CHANGE: The following APIs now return native-endian data. Previously they returned little-endian data.
+    - drwav_read_pcm_frames()
+    - drwav_read_pcm_frames_s16()
+    - drwav_read_pcm_frames_s32()
+    - drwav_read_pcm_frames_f32()
+    - drwav_open_and_read_pcm_frames_s16()
+    - drwav_open_and_read_pcm_frames_s32()
+    - drwav_open_and_read_pcm_frames_f32()
+    - drwav_open_file_and_read_pcm_frames_s16()
+    - drwav_open_file_and_read_pcm_frames_s32()
+    - drwav_open_file_and_read_pcm_frames_f32()
+    - drwav_open_file_and_read_pcm_frames_s16_w()
+    - drwav_open_file_and_read_pcm_frames_s32_w()
+    - drwav_open_file_and_read_pcm_frames_f32_w()
+    - drwav_open_memory_and_read_pcm_frames_s16()
+    - drwav_open_memory_and_read_pcm_frames_s32()
+    - drwav_open_memory_and_read_pcm_frames_f32()
+
+v0.10.1 - 2019-08-31
+  - Correctly handle partial trailing ADPCM blocks.
+
+v0.10.0 - 2019-08-04
+  - Remove deprecated APIs.
+  - Add wchar_t variants for file loading APIs:
+      drwav_init_file_w()
+      drwav_init_file_ex_w()
+      drwav_init_file_write_w()
+      drwav_init_file_write_sequential_w()
+  - Add drwav_target_write_size_bytes() which calculates the total size in bytes of a WAV file given a format and sample count.
+  - Add APIs for specifying the PCM frame count instead of the sample count when opening in sequential write mode:
+      drwav_init_write_sequential_pcm_frames()
+      drwav_init_file_write_sequential_pcm_frames()
+      drwav_init_file_write_sequential_pcm_frames_w()
+      drwav_init_memory_write_sequential_pcm_frames()
+  - Deprecate drwav_open*() and drwav_close():
+      drwav_open()
+      drwav_open_ex()
+      drwav_open_write()
+      drwav_open_write_sequential()
+      drwav_open_file()
+      drwav_open_file_ex()
+      drwav_open_file_write()
+      drwav_open_file_write_sequential()
+      drwav_open_memory()
+      drwav_open_memory_ex()
+      drwav_open_memory_write()
+      drwav_open_memory_write_sequential()
+      drwav_close()
+  - Minor documentation updates.
+
+v0.9.2 - 2019-05-21
+  - Fix warnings.
+
+v0.9.1 - 2019-05-05
+  - Add support for C89.
+  - Change license to choice of public domain or MIT-0.
+
+v0.9.0 - 2018-12-16
+  - API CHANGE: Add new reading APIs for reading by PCM frames instead of samples. Old APIs have been deprecated and
+    will be removed in v0.10.0. Deprecated APIs and their replacements:
+      drwav_read()                     -> drwav_read_pcm_frames()
+      drwav_read_s16()                 -> drwav_read_pcm_frames_s16()
+      drwav_read_f32()                 -> drwav_read_pcm_frames_f32()
+      drwav_read_s32()                 -> drwav_read_pcm_frames_s32()
+      drwav_seek_to_sample()           -> drwav_seek_to_pcm_frame()
+      drwav_write()                    -> drwav_write_pcm_frames()
+      drwav_open_and_read_s16()        -> drwav_open_and_read_pcm_frames_s16()
+      drwav_open_and_read_f32()        -> drwav_open_and_read_pcm_frames_f32()
+      drwav_open_and_read_s32()        -> drwav_open_and_read_pcm_frames_s32()
+      drwav_open_file_and_read_s16()   -> drwav_open_file_and_read_pcm_frames_s16()
+      drwav_open_file_and_read_f32()   -> drwav_open_file_and_read_pcm_frames_f32()
+      drwav_open_file_and_read_s32()   -> drwav_open_file_and_read_pcm_frames_s32()
+      drwav_open_memory_and_read_s16() -> drwav_open_memory_and_read_pcm_frames_s16()
+      drwav_open_memory_and_read_f32() -> drwav_open_memory_and_read_pcm_frames_f32()
+      drwav_open_memory_and_read_s32() -> drwav_open_memory_and_read_pcm_frames_s32()
+      drwav::totalSampleCount          -> drwav::totalPCMFrameCount
+  - API CHANGE: Rename drwav_open_and_read_file_*() to drwav_open_file_and_read_*().
+  - API CHANGE: Rename drwav_open_and_read_memory_*() to drwav_open_memory_and_read_*().
+  - Add built-in support for smpl chunks.
+  - Add support for firing a callback for each chunk in the file at initialization time.
+    - This is enabled through the drwav_init_ex(), etc. family of APIs.
+  - Handle invalid FMT chunks more robustly.
+
+v0.8.5 - 2018-09-11
+  - Const correctness.
+  - Fix a potential stack overflow.
+
+v0.8.4 - 2018-08-07
+  - Improve 64-bit detection.
+
+v0.8.3 - 2018-08-05
+  - Fix C++ build on older versions of GCC.
+
+v0.8.2 - 2018-08-02
+  - Fix some big-endian bugs.
+
+v0.8.1 - 2018-06-29
+  - Add support for sequential writing APIs.
+  - Disable seeking in write mode.
+  - Fix bugs with Wave64.
+  - Fix typos.
+
+v0.8 - 2018-04-27
+  - Bug fix.
+  - Start using major.minor.revision versioning.
+
+v0.7f - 2018-02-05
+  - Restrict ADPCM formats to a maximum of 2 channels.
+
+v0.7e - 2018-02-02
+  - Fix a crash.
+
+v0.7d - 2018-02-01
+  - Fix a crash.
+
+v0.7c - 2018-02-01
+  - Set drwav.bytesPerSample to 0 for all compressed formats.
+  - Fix a crash when reading 16-bit floating point WAV files. In this case dr_wav will output silence for
+    all format conversion reading APIs (*_s16, *_s32, *_f32 APIs).
+  - Fix some divide-by-zero errors.
+
+v0.7b - 2018-01-22
+  - Fix errors with seeking of compressed formats.
+  - Fix compilation error when DR_WAV_NO_CONVERSION_API is defined.
+
+v0.7a - 2017-11-17
+  - Fix some GCC warnings.
+
+v0.7 - 2017-11-04
+  - Add writing APIs.
+
+v0.6 - 2017-08-16
+  - API CHANGE: Rename dr_* types to drwav_*.
+  - Add support for custom implementations of malloc(), realloc(), etc.
+  - Add support for Microsoft ADPCM.
+  - Add support for IMA ADPCM (DVI, format code 0x11).
+  - Optimizations to drwav_read_s16().
+  - Bug fixes.
+
+v0.5g - 2017-07-16
+  - Change underlying type for booleans to unsigned.
+
+v0.5f - 2017-04-04
+  - Fix a minor bug with drwav_open_and_read_s16() and family.
+
+v0.5e - 2016-12-29
+  - Added support for reading samples as signed 16-bit integers. Use the _s16() family of APIs for this.
+  - Minor fixes to documentation.
+
+v0.5d - 2016-12-28
+  - Use drwav_int* and drwav_uint* sized types to improve compiler support.
+
+v0.5c - 2016-11-11
+  - Properly handle JUNK chunks that come before the FMT chunk.
+
+v0.5b - 2016-10-23
+  - A minor change to drwav_bool8 and drwav_bool32 types.
+
+v0.5a - 2016-10-11
+  - Fixed a bug with drwav_open_and_read() and family due to incorrect argument ordering.
+  - Improve A-law and mu-law efficiency.
+
+v0.5 - 2016-09-29
+  - API CHANGE. Swap the order of "channels" and "sampleRate" parameters in drwav_open_and_read*(). Rationale for this is to
+    keep it consistent with dr_audio and dr_flac.
+
+v0.4b - 2016-09-18
+  - Fixed a typo in documentation.
+
+v0.4a - 2016-09-18
+  - Fixed a typo.
+  - Change date format to ISO 8601 (YYYY-MM-DD)
+
+v0.4 - 2016-07-13
+  - API CHANGE. Make onSeek consistent with dr_flac.
+  - API CHANGE. Rename drwav_seek() to drwav_seek_to_sample() for clarity and consistency with dr_flac.
+  - Added support for Sony Wave64.
+
+v0.3a - 2016-05-28
+  - API CHANGE. Return drwav_bool32 instead of int in onSeek callback.
+  - Fixed a memory leak.
+
+v0.3 - 2016-05-22
+  - Lots of API changes for consistency.
+
+v0.2a - 2016-05-16
+  - Fixed Linux/GCC build.
+
+v0.2 - 2016-05-11
+  - Added support for reading data as signed 32-bit PCM for consistency with dr_flac.
+
+v0.1a - 2016-05-07
+  - Fixed a bug in drwav_open_file() where the file handle would not be closed if the loader failed to initialize.
+
+v0.1 - 2016-05-04
+  - Initial versioned release.
+*/
+
+/*
+This software is available as a choice of the following licenses. Choose
+whichever you prefer.
+
+===============================================================================
+ALTERNATIVE 1 - Public Domain (www.unlicense.org)
+===============================================================================
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
+
+===============================================================================
+ALTERNATIVE 2 - MIT No Attribution
+===============================================================================
+Copyright 2023 David Reid
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/