#include "test_common.h"

int main(int argc, char* argv[]) {
    std::cout << "Test: BandSplit Component Verification" << std::endl;
    
    // 1. 获取资源
    std::string model_path = GetModelPath();
    std::string data_dir = GetTestDataDir();
    
    if (argc > 1) model_path = argv[1];
    if (argc > 2) data_dir = argv[2];
    
    LOG_STEP(1, 4, "Loading model from " + model_path);
    BSRoformer model;
    model.Initialize(model_path);
    
    LOG_STEP(2, 4, "Loading golden tensors from " + data_dir);
    GoldenTensor input(data_dir, "band_split_in");
    GoldenTensor expected(data_dir, "after_band_split");
    
    TEST_ASSERT_LOAD(input, "band_split_in");
    TEST_ASSERT_LOAD(expected, "after_band_split");
    
    input.PrintShape("Input");
    expected.PrintShape("Expected");
    
    // PyTorch [batch, bands, time, dim] -> GGML [dim, time, bands, batch] ? 
    // Wait, utils.cpp says: load_npy returns raw data and shape.
    // PyTorch input: [batch, bands, time, dim]
    // GGML expected Input: [dim, bands, time, batch] ? No.
    // Let's check original test...
    // Original: total_dim_input(idx=2), n_frames(idx=1), batch(idx=0).
    // Original input: [batch, frames, dim] ??
    // band_split_in.npy shape from original output: [1, 301, 384] (Batch, Time, Dim)?
    // No, let's look at export_debug.py line 219: `x = rearrange(x, 'b t (f c) -> b t f c')` ??
    // Wait, export_debug.py:
    //   x = stft_repr[batch_arange, freq_indices] -> [b, f, t, c]
    //   x = rearrange(x, 'b f t c -> b t (f c)') -> [b, t, features]
    // So 'band_split_in' is [Batch, Time, Features]
    // GGML Tensor likely: [Features, Time, Batch] (Transposed for column-major/GGML)
    
    int batch = input.shape[0];
    int n_frames = input.shape[1];
    int total_dim = input.shape[2];
    
    // 3. Build Graph
    LOG_STEP(3, 4, "Building computation graph");
    TestContext tc(&model);
    
    // GGML Tensor shape: [dim, n_frames, batch]
    ggml_tensor* in_tensor = ggml_new_tensor_3d(tc.ctx, GGML_TYPE_F32, total_dim, n_frames, batch);
    ggml_set_input(in_tensor);
    
    ggml_tensor* out = model.BuildBandSplitGraph(tc.ctx, in_tensor, tc.gf, n_frames, batch);
    TEST_ASSERT(out, "BuildBandSplitGraph returned nullptr");
    
    // Mark output for computation
    ggml_build_forward_expand(tc.gf, out);
    
    // 4. Exec
    LOG_STEP(4, 4, "Executing");
    if (!tc.AllocateGraph()) {
        std::cerr << "Graph allocation failed" << std::endl;
        return 1;
    }
    
    // Copy input (NumPy [B, T, D] -> GGML [D, T, B])
    // The memory layout of NumPy [B,T,D] (C-contiguous) is:
    //   Batch 0 -> Time 0 -> Dim 0..D
    // GGML [D, T, B] (F-contiguous-ish, but tensor struct is different)
    // Actually GGML default tensor is [ne0, ne1, ne2, ne3]
    // ne0 is fastest moving dimension. 
    // If we say tensor is [D, T, B], ne0=D, ne1=T, ne2=B.
    // So data layout is D contiguous, then T, then B.
    // This MATCHES NumPy [B, T, D] C-contiguous!
    //   NumPy: fast index is last dim (D).
    //   GGML: fast index is first dim (ne0=D).
    // So we can memcpy directly!
    
    ggml_backend_tensor_set(in_tensor, input.data, 0, ggml_nbytes(in_tensor));
    tc.Compute();
    
    // 5. Compare
    auto output = tc.ReadTensor(out);
    
    bool pass = CompareAndReport("BandSplit", 
                                  expected.data, expected.nelements(),
                                  output.data(), output.size());
    
    if (pass) LOG_PASS(); else LOG_FAIL();
    return pass ? 0 : 1;
}