dakera-inference 0.11.81

//! Batch processing utilities for efficient embedding generation.

use crate::error::{InferenceError, Result};
use crate::models::EmbeddingModel;
use tokenizers::{PaddingParams, PaddingStrategy, Tokenizer, TruncationParams};
use tracing::{debug, instrument};

/// Prepared batch of tokenized inputs ready for ORT inference.
///
/// All token arrays are flat (row-major) with logical shape `[batch_size, seq_len]`,
/// i.e. the value for batch item `b` at token position `s` lives at index
/// `b * seq_len + s`.
/// Values are `i64` because ONNX Runtime BERT models expect `int64` inputs.
#[derive(Debug)]
pub struct PreparedBatch {
    /// Input token IDs, flat `[batch_size * seq_len]`, i64
    pub input_ids: Vec<i64>,
    /// Attention mask, flat `[batch_size * seq_len]`, i64
    /// (copied from the tokenizer; pooling uses the values as per-token weights).
    pub attention_mask: Vec<i64>,
    /// Token type IDs, flat `[batch_size * seq_len]`, i64.
    /// Filled with zeros when the tokenizer reports no type IDs for an encoding.
    pub token_type_ids: Vec<i64>,
    /// Number of items in this batch
    pub batch_size: usize,
    /// Sequence length (uniform after padding); taken from the first encoding.
    pub seq_len: usize,
    /// Byte length (`String::len`) of each original input text, in input order
    /// (kept for debugging).
    pub original_lengths: Vec<usize>,
}

/// Batch processor for preparing text inputs for embedding models.
///
/// Owns a tokenizer that `new` configures for batch-longest padding and for
/// truncation to the model's maximum sequence length.
pub struct BatchProcessor {
    /// Tokenizer with padding and truncation configured by `new`.
    tokenizer: Tokenizer,
    /// Model descriptor; supplies the query/document prefixes and the maximum sequence length.
    model: EmbeddingModel,
    /// Upper bound on texts per batch; enforced by `tokenize_batch` and used as the
    /// chunk size in `split_into_batches`.
    max_batch_size: usize,
}

impl BatchProcessor {
    /// Create a new batch processor.
    pub fn new(mut tokenizer: Tokenizer, model: EmbeddingModel, max_batch_size: usize) -> Self {
        // Configure padding
        let padding = PaddingParams {
            strategy: PaddingStrategy::BatchLongest,
            pad_id: tokenizer.get_padding().map_or(0, |p| p.pad_id),
            pad_token: tokenizer
                .get_padding()
                .map_or("[PAD]".to_string(), |p| p.pad_token.clone()),
            ..Default::default()
        };
        tokenizer.with_padding(Some(padding));

        // Configure truncation
        let truncation = TruncationParams {
            max_length: model.max_seq_length(),
            ..Default::default()
        };
        let _ = tokenizer.with_truncation(Some(truncation));

        Self {
            tokenizer,
            model,
            max_batch_size,
        }
    }

    /// Get the maximum batch size.
    ///
    /// This is the limit `tokenize_batch` enforces on the number of texts per call
    /// and the chunk size `split_into_batches` uses.
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }

    /// Return owned copies of `texts`, each prepended with the model's prefix if it has one.
    ///
    /// The query prefix is used when `is_query` is true, the document prefix otherwise.
    /// Models without a prefix for the requested role yield the texts unchanged.
    #[instrument(skip(self, texts), fields(count = texts.len()))]
    pub fn prepare_texts(&self, texts: &[String], is_query: bool) -> Vec<String> {
        let role_prefix = if is_query {
            self.model.query_prefix()
        } else {
            self.model.document_prefix()
        };

        // No prefix for this model/role: hand back plain copies.
        let Some(prefix) = role_prefix else {
            return texts.to_vec();
        };

        texts
            .iter()
            .map(|text| format!("{prefix}{text}"))
            .collect()
    }

    /// Tokenize a batch of texts and prepare flat i64 arrays for ORT inference.
    ///
    /// The returned [`PreparedBatch`] holds `input_ids`, `attention_mask` and
    /// `token_type_ids` flattened row-major. `seq_len` is read from the first encoding;
    /// the tokenizer is configured in `new` to pad every encoding to the longest one
    /// in the batch.
    ///
    /// # Errors
    ///
    /// * [`InferenceError::InvalidInput`] if `texts` is empty or holds more than
    ///   `max_batch_size` items.
    /// * [`InferenceError::TokenizationError`] if the tokenizer fails to encode the batch.
    #[instrument(skip(self, texts), fields(count = texts.len()))]
    pub fn tokenize_batch(&self, texts: &[String]) -> Result<PreparedBatch> {
        if texts.is_empty() {
            return Err(InferenceError::InvalidInput("Empty text batch".into()));
        }

        if texts.len() > self.max_batch_size {
            return Err(InferenceError::InvalidInput(format!(
                "Batch size {} exceeds maximum {}",
                texts.len(),
                self.max_batch_size
            )));
        }

        // Byte lengths of the raw inputs, kept only for diagnostics.
        let original_lengths: Vec<usize> = texts.iter().map(String::len).collect();

        debug!(
            "Tokenizing {} texts, max length: {}",
            texts.len(),
            original_lengths.iter().max().unwrap_or(&0)
        );

        // Tokenize all texts. The tokenizer accepts borrowed `&str` inputs, so hand it
        // views into `texts` instead of deep-copying every string.
        let inputs: Vec<&str> = texts.iter().map(String::as_str).collect();
        let encodings = self
            .tokenizer
            .encode_batch(inputs, true)
            .map_err(|e| InferenceError::TokenizationError(e.to_string()))?;

        let batch_size = encodings.len();
        let seq_len = encodings.first().map(|e| e.get_ids().len()).unwrap_or(0);

        debug!("Tokenized: batch_size={}, seq_len={}", batch_size, seq_len);

        // Extract and flatten as i64 (ORT BERT models require int64)
        let mut input_ids = Vec::with_capacity(batch_size * seq_len);
        let mut attention_mask = Vec::with_capacity(batch_size * seq_len);
        let mut token_type_ids = Vec::with_capacity(batch_size * seq_len);

        for enc in &encodings {
            input_ids.extend(enc.get_ids().iter().map(|&id| id as i64));
            attention_mask.extend(enc.get_attention_mask().iter().map(|&m| m as i64));

            // Some tokenizers emit no type IDs; the model still expects a full row.
            let type_ids = enc.get_type_ids();
            if type_ids.is_empty() {
                token_type_ids.extend(std::iter::repeat_n(0i64, seq_len));
            } else {
                token_type_ids.extend(type_ids.iter().map(|&t| t as i64));
            }
        }

        Ok(PreparedBatch {
            input_ids,
            attention_mask,
            token_type_ids,
            batch_size,
            seq_len,
            original_lengths,
        })
    }

    /// Split texts into batches of maximum size.
    ///
    /// Returns borrowed sub-slices of `texts`, each holding at most `max_batch_size`
    /// items (the last one may be shorter). An empty input yields no batches.
    pub fn split_into_batches<'a>(&self, texts: &'a [String]) -> Vec<&'a [String]> {
        // `chunks(0)` panics, so treat a zero batch size as one text per batch.
        texts.chunks(self.max_batch_size.max(1)).collect()
    }
}

/// Apply mean pooling to ORT last-hidden-state output.
///
/// `last_hidden_state` is a flat row-major slice with shape `[batch, seq_len, hidden_size]`.
/// `attention_mask` is flat `[batch * seq_len]` with i64 values (0 or 1).
///
/// For every batch item the hidden vectors are weighted by their mask value, summed over
/// the sequence, and divided by the mask sum (clamped to at least `1e-9`, so a fully
/// masked item pools to zeros instead of dividing by zero).
///
/// Returns `Vec<Vec<f32>>` with shape `[batch, hidden_size]`.
///
/// # Panics
///
/// Panics if `attention_mask` holds fewer than `batch_size * seq_len` values or
/// `last_hidden_state` holds fewer than `batch_size * seq_len * hidden_size` values.
#[instrument(skip_all, fields(batch_size, seq_len, hidden_size))]
pub fn mean_pooling(
    last_hidden_state: &[f32],
    batch_size: usize,
    seq_len: usize,
    hidden_size: usize,
    attention_mask: &[i64],
) -> Vec<Vec<f32>> {
    let mut result = vec![vec![0.0f32; hidden_size]; batch_size];
    // Number of f32 values belonging to one batch item.
    let item_stride = seq_len * hidden_size;

    for (b, pooled) in result.iter_mut().enumerate() {
        let mask_row = &attention_mask[b * seq_len..(b + 1) * seq_len];
        let hidden_rows = &last_hidden_state[b * item_stride..(b + 1) * item_stride];

        // Sum of mask weights for this batch item
        let mask_sum: f32 = mask_row.iter().map(|&m| m as f32).sum::<f32>().max(1e-9);

        // Walk the tokens in order and add each contiguous hidden vector into the
        // accumulator: same summation order per dimension as a per-dimension loop,
        // but sequential memory access and one mask conversion per token.
        for (s, &m) in mask_row.iter().enumerate() {
            let weight = m as f32;
            let token = &hidden_rows[s * hidden_size..(s + 1) * hidden_size];
            for (acc, &value) in pooled.iter_mut().zip(token) {
                *acc += value * weight;
            }
        }

        for acc in pooled.iter_mut() {
            *acc /= mask_sum;
        }
    }

    debug!(
        "Mean pooled: batch={}, hidden={}",
        result.len(),
        result.first().map(|v| v.len()).unwrap_or(0)
    );

    result
}

/// Scale every embedding in place so that its L2 norm becomes 1.
///
/// The norm is clamped to at least `1e-12` before dividing, so an all-zero
/// embedding stays finite rather than turning into NaN.
#[instrument(skip_all, fields(count = embeddings.len()))]
pub fn normalize_embeddings(embeddings: &mut [Vec<f32>]) {
    embeddings.iter_mut().for_each(|row| {
        let squared_sum: f32 = row.iter().map(|c| c * c).sum();
        let length = squared_sum.sqrt().max(1e-12);
        row.iter_mut().for_each(|c| *c /= length);
    });
    debug!("Normalized {} embeddings", embeddings.len());
}

/// Keep the first `target_dim` components of an embedding and L2-normalise the result.
///
/// Models trained with Matryoshka Representation Learning (MRL) concentrate the most
/// semantic information in the leading dimensions, so a truncated prefix followed by
/// re-normalisation is a valid lower-dimensional representation.
///
/// The norm is clamped to at least `1e-12`, so an all-zero prefix yields zeros rather
/// than NaN.
///
/// # Errors
///
/// Returns `InferenceError::InvalidInput` if `target_dim == 0` or
/// `target_dim > embedding.len()`.
pub fn truncate_mrl(embedding: &[f32], target_dim: usize) -> crate::error::Result<Vec<f32>> {
    let full_dim = embedding.len();
    if !(1..=full_dim).contains(&target_dim) {
        return Err(crate::error::InferenceError::InvalidInput(format!(
            "MRL target_dim={} is out of range for embedding of length {}",
            target_dim, full_dim
        )));
    }

    let head = &embedding[..target_dim];
    let squared_sum: f32 = head.iter().map(|x| x * x).sum();
    let length = squared_sum.sqrt().max(1e-12);

    Ok(head.iter().map(|x| x / length).collect())
}

/// Batch input accumulator that respects a total *token budget* instead of a fixed item count.
///
/// Current approach: estimates token count as `text.len() / 4` — UTF-8 *bytes* divided by
/// four, never less than 1 — as a rough bytes-per-token approximation.
/// A production deployment can inject a real tokeniser via `TokenBudgetBatcher::with_token_fn`.
///
/// # Example
///
/// ```no_run
/// use inference::batch::TokenBudgetBatcher;
///
/// let mut batcher = TokenBudgetBatcher::new(2048);
/// batcher.push("short text".to_string());
/// batcher.push("another short text".to_string());
/// let batches = batcher.finish();
/// assert_eq!(batches.len(), 1);
/// ```
pub struct TokenBudgetBatcher {
    /// Maximum estimated tokens per batch (always at least 1).
    token_budget: usize,
    /// Texts accumulated for the batch currently being filled.
    current_batch: Vec<String>,
    /// Estimated tokens held by `current_batch`.
    current_tokens: usize,
    /// Batches that were closed because the next text would have exceeded the budget.
    finished_batches: Vec<Vec<String>>,
    /// Token-count estimator.  Defaults to `(text.len() / 4).max(1)`.
    token_count_fn: Box<dyn Fn(&str) -> usize + Send + Sync>,
}

impl TokenBudgetBatcher {
    /// Create a batcher with the given token budget.
    ///
    /// If the `DAKERA_TOKEN_BUDGET` environment variable is set to a positive integer,
    /// that value overrides `token_budget`. The effective budget is always at least 1.
    ///
    /// Defaults to byte-length estimation (`len / 4`, minimum 1).
    /// Use `with_token_fn` to inject a real tokeniser.
    pub fn new(token_budget: usize) -> Self {
        // Unset, unparsable or zero values of the variable fall back to the argument.
        let budget = std::env::var("DAKERA_TOKEN_BUDGET")
            .ok()
            .and_then(|v| v.parse::<usize>().ok())
            .filter(|&n| n > 0)
            .unwrap_or(token_budget)
            .max(1);

        Self {
            token_budget: budget,
            current_batch: Vec::new(),
            current_tokens: 0,
            finished_batches: Vec::new(),
            token_count_fn: Box::new(|text| (text.len() / 4).max(1)),
        }
    }

    /// Replace the default byte-length estimator with a real token counter.
    pub fn with_token_fn(mut self, f: impl Fn(&str) -> usize + Send + Sync + 'static) -> Self {
        self.token_count_fn = Box::new(f);
        self
    }

    /// Add a text to the current batch.
    ///
    /// If adding `text` would exceed the token budget, the current batch is flushed first and a
    /// new batch starting with `text` is begun. A text whose own estimate exceeds the budget
    /// is never dropped: it ends up in a batch of its own.
    pub fn push(&mut self, text: String) {
        let tokens = (self.token_count_fn)(&text);
        // Saturating arithmetic: an injected estimator may return arbitrarily large
        // counts, and a wrapped sum would wrongly look like it fits the budget.
        let would_exceed = self.current_tokens.saturating_add(tokens) > self.token_budget;
        if !self.current_batch.is_empty() && would_exceed {
            // Flush current batch
            let batch = std::mem::take(&mut self.current_batch);
            self.finished_batches.push(batch);
            self.current_tokens = 0;
        }
        self.current_tokens = self.current_tokens.saturating_add(tokens);
        self.current_batch.push(text);
    }

    /// Add multiple texts at once.
    ///
    /// Equivalent to calling [`push`](Self::push) for each text in order.
    pub fn push_all(&mut self, texts: impl IntoIterator<Item = String>) {
        for t in texts {
            self.push(t);
        }
    }

    /// Flush any pending batch and return all accumulated batches.
    ///
    /// The batcher is reset after this call and can be reused.
    pub fn finish(&mut self) -> Vec<Vec<String>> {
        if !self.current_batch.is_empty() {
            let batch = std::mem::take(&mut self.current_batch);
            self.finished_batches.push(batch);
            self.current_tokens = 0;
        }
        std::mem::take(&mut self.finished_batches)
    }

    /// Number of texts accumulated in the current (unflushed) batch.
    pub fn pending_count(&self) -> usize {
        self.current_batch.len()
    }

    /// Total tokens accumulated in the current (unflushed) batch.
    pub fn pending_tokens(&self) -> usize {
        self.current_tokens
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Create a minimal tokenizer for unit tests (no network required).
    /// `prepare_texts` only uses the model's prefix logic, not the tokenizer,
    /// so any valid tokenizer works here.
    fn dummy_tokenizer() -> Tokenizer {
        use tokenizers::models::bpe::BPE;
        Tokenizer::new(BPE::default())
    }

    /// Create a WordLevel tokenizer with a small known vocabulary.
    /// Unlike BPE::default(), this tokenizer can actually encode words,
    /// enabling tests of the tokenize_batch happy path.
    fn simple_tokenizer() -> Tokenizer {
        use std::collections::HashMap;
        use tokenizers::models::wordlevel::WordLevel;
        use tokenizers::pre_tokenizers::whitespace::Whitespace;

        // Token id == position in this list, so `[PAD]` is 0 and `[UNK]` is 1.
        let mut vocab: HashMap<String, u32> = HashMap::new();
        for (i, w) in [
            "[PAD]", "[UNK]", "hello", "world", "test", "text", "one", "two", "foo", "bar", "baz",
        ]
        .iter()
        .enumerate()
        {
            vocab.insert(w.to_string(), i as u32);
        }

        let model = WordLevel::builder()
            .vocab(vocab)
            .unk_token("[UNK]".to_string())
            .build()
            .unwrap();

        let mut tok = Tokenizer::new(model);
        tok.with_pre_tokenizer(Some(Whitespace {}));
        tok
    }

    // ── prepare_texts tests ─────────────────────────────────────────────────

    #[test]
    fn test_prepare_texts_with_prefix() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::E5Small, 32);

        let texts = vec!["Hello world".to_string(), "Test query".to_string()];
        let prepared = processor.prepare_texts(&texts, true);

        assert_eq!(prepared[0], "query: Hello world");
        assert_eq!(prepared[1], "query: Test query");
    }

    #[test]
    fn test_prepare_texts_no_prefix() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 32);

        let texts = vec!["Hello world".to_string()];
        let prepared = processor.prepare_texts(&texts, true);

        assert_eq!(prepared[0], "Hello world");
    }

    #[test]
    fn test_prepare_texts_document_prefix_e5() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::E5Small, 32);
        let texts = vec!["Some document".to_string(), "Another doc".to_string()];
        let prepared = processor.prepare_texts(&texts, false);
        assert_eq!(prepared[0], "passage: Some document");
        assert_eq!(prepared[1], "passage: Another doc");
    }

    #[test]
    fn test_prepare_texts_bge_no_prefix_query() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::BgeSmall, 32);
        let texts = vec!["Test".to_string()];
        let prepared = processor.prepare_texts(&texts, true);
        assert_eq!(prepared[0], "Test");
    }

    #[test]
    fn test_prepare_texts_bge_no_prefix_document() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::BgeSmall, 32);
        let texts = vec!["Doc text".to_string()];
        let prepared = processor.prepare_texts(&texts, false);
        assert_eq!(prepared[0], "Doc text");
    }

    #[test]
    fn test_prepare_texts_empty_input() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts: Vec<String> = vec![];
        let prepared = processor.prepare_texts(&texts, true);
        assert!(prepared.is_empty());
    }

    // ── max_batch_size / split_into_batches tests ───────────────────────────

    #[test]
    fn test_max_batch_size() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 64);
        assert_eq!(processor.max_batch_size(), 64);
    }

    #[test]
    fn test_max_batch_size_default() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::BgeSmall, 32);
        assert_eq!(processor.max_batch_size(), 32);
    }

    #[test]
    fn test_split_into_batches_exact_multiple() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 4);
        let texts: Vec<String> = (0..8).map(|i| format!("text {i}")).collect();
        let batches = processor.split_into_batches(&texts);
        assert_eq!(batches.len(), 2);
        assert_eq!(batches[0].len(), 4);
        assert_eq!(batches[1].len(), 4);
    }

    #[test]
    fn test_split_into_batches_partial_last() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 4);
        let texts: Vec<String> = (0..6).map(|i| format!("text {i}")).collect();
        let batches = processor.split_into_batches(&texts);
        assert_eq!(batches.len(), 2);
        assert_eq!(batches[0].len(), 4);
        assert_eq!(batches[1].len(), 2);
    }

    #[test]
    fn test_split_into_batches_smaller_than_max() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts: Vec<String> = (0..5).map(|i| format!("text {i}")).collect();
        let batches = processor.split_into_batches(&texts);
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0].len(), 5);
    }

    #[test]
    fn test_split_into_batches_empty() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts: Vec<String> = vec![];
        let batches = processor.split_into_batches(&texts);
        assert!(batches.is_empty());
    }

    #[test]
    fn test_split_into_batches_preserves_content() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 3);
        let texts = vec![
            "a".to_string(),
            "b".to_string(),
            "c".to_string(),
            "d".to_string(),
        ];
        let batches = processor.split_into_batches(&texts);
        assert_eq!(batches[0], &["a", "b", "c"]);
        assert_eq!(batches[1], &["d"]);
    }

    // ── tokenize_batch input validation ─────────────────────────────────────

    #[test]
    fn test_tokenize_batch_empty_error() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 32);
        let result = processor.tokenize_batch(&[]);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, InferenceError::InvalidInput(_)));
        assert!(err.to_string().contains("Empty text batch"));
    }

    #[test]
    fn test_tokenize_batch_exceeds_max_size_error() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 2);
        let texts: Vec<String> = (0..5).map(|i| format!("text {i}")).collect();
        let result = processor.tokenize_batch(&texts);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, InferenceError::InvalidInput(_)));
        assert!(err.to_string().contains("exceeds maximum"));
    }

    #[test]
    fn test_tokenize_batch_exactly_at_max_size_does_not_error_before_encode() {
        let processor = BatchProcessor::new(dummy_tokenizer(), EmbeddingModel::MiniLM, 2);
        let texts = vec!["text one".to_string(), "text two".to_string()];
        let result = processor.tokenize_batch(&texts);
        // The BPE default tokenizer may fail at encode — that is fine,
        // what matters is it does NOT return an InvalidInput size error.
        if let Err(InferenceError::InvalidInput(msg)) = &result {
            assert!(
                !msg.contains("exceeds maximum"),
                "Batch at exactly max_size should pass size check, got: {msg}"
            );
        }
    }

    // ── mean_pooling tests ──────────────────────────────────────────────────

    #[test]
    fn test_mean_pooling_output_shape() {
        // batch=2, seq_len=3, hidden=4 → should produce 2 embeddings of size 4
        let lhs = vec![0.0f32; 2 * 3 * 4]; // all zeros
        let mask = vec![1i64; 2 * 3]; // all active
        let result = mean_pooling(&lhs, 2, 3, 4, &mask);
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].len(), 4);
        assert_eq!(result[1].len(), 4);
    }

    #[test]
    fn test_mean_pooling_uniform_hidden_all_ones_mask() {
        // batch=1, seq_len=4, hidden=3 — all hidden values = 2.0, all mask = 1
        // Mean pool should return 2.0 for every dimension.
        let lhs = vec![2.0f32; 4 * 3];
        let mask = vec![1i64; 4];
        let result = mean_pooling(&lhs, 1, 4, 3, &mask);
        assert_eq!(result.len(), 1);
        for v in &result[0] {
            assert!((v - 2.0).abs() < 1e-5, "expected 2.0, got {v}");
        }
    }

    #[test]
    fn test_mean_pooling_masked_tokens_ignored() {
        // batch=1, seq_len=2, hidden=2
        // Token 0: hidden=[1.0, 1.0], mask=1; Token 1: hidden=[9.0, 9.0], mask=0
        // Mean pool should give [1.0, 1.0]
        let lhs = vec![1.0f32, 1.0, 9.0, 9.0];
        let mask = vec![1i64, 0i64];
        let result = mean_pooling(&lhs, 1, 2, 2, &mask);
        assert!(
            (result[0][0] - 1.0).abs() < 1e-5,
            "expected 1.0, got {}",
            result[0][0]
        );
        assert!(
            (result[0][1] - 1.0).abs() < 1e-5,
            "expected 1.0, got {}",
            result[0][1]
        );
    }

    #[test]
    fn test_mean_pooling_fully_masked_item_is_zero() {
        // batch=1, seq_len=2, hidden=2 with every token masked out.
        // The mask sum is clamped, so the result must be finite zeros, not NaN/Inf.
        let lhs = vec![5.0f32, -3.0, 7.0, 2.0];
        let mask = vec![0i64, 0i64];
        let result = mean_pooling(&lhs, 1, 2, 2, &mask);
        for v in &result[0] {
            assert!(
                v.is_finite() && v.abs() < 1e-6,
                "expected finite zero, got {v}"
            );
        }
    }

    #[test]
    fn test_mean_pooling_batch_independence() {
        // batch=2, seq_len=1, hidden=2
        // Batch 0: hidden=[3.0, 4.0], mask=1
        // Batch 1: hidden=[6.0, 8.0], mask=1
        // Each should pool independently
        let lhs = vec![3.0f32, 4.0, 6.0, 8.0];
        let mask = vec![1i64, 1i64];
        let result = mean_pooling(&lhs, 2, 1, 2, &mask);
        assert_eq!(result.len(), 2);
        assert!((result[0][0] - 3.0).abs() < 1e-5);
        assert!((result[0][1] - 4.0).abs() < 1e-5);
        assert!((result[1][0] - 6.0).abs() < 1e-5);
        assert!((result[1][1] - 8.0).abs() < 1e-5);
    }

    // ── normalize_embeddings tests ──────────────────────────────────────────

    #[test]
    fn test_normalize_embeddings_unit_length() {
        // After normalization, each row's L2 norm should be ≈ 1.0
        // [3, 4] → L2 norm = 5.0 → normalized = [0.6, 0.8]
        let mut embeddings = vec![vec![3.0f32, 4.0]];
        normalize_embeddings(&mut embeddings);
        let norm: f32 = embeddings[0].iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-5,
            "L2 norm should be 1.0, got {norm}"
        );
    }

    #[test]
    fn test_normalize_embeddings_values() {
        let mut embeddings = vec![vec![3.0f32, 4.0]];
        normalize_embeddings(&mut embeddings);
        assert!(
            (embeddings[0][0] - 0.6).abs() < 1e-5,
            "expected 0.6, got {}",
            embeddings[0][0]
        );
        assert!(
            (embeddings[0][1] - 0.8).abs() < 1e-5,
            "expected 0.8, got {}",
            embeddings[0][1]
        );
    }

    #[test]
    fn test_normalize_embeddings_batch() {
        // Multiple rows — each should be independently normalized
        let mut embeddings = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        normalize_embeddings(&mut embeddings);
        let norm0: f32 = embeddings[0].iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm1: f32 = embeddings[1].iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm0 - 1.0).abs() < 1e-5);
        assert!((norm1 - 1.0).abs() < 1e-5);
    }

    #[test]
    fn test_normalize_embeddings_output_shape() {
        let mut embeddings: Vec<Vec<f32>> = (1..=3)
            .map(|i| (1..=4).map(|j| (i * j) as f32).collect())
            .collect();
        normalize_embeddings(&mut embeddings);
        assert_eq!(embeddings.len(), 3);
        assert!(embeddings.iter().all(|v| v.len() == 4));
    }

    #[test]
    fn test_normalize_embeddings_near_zero_safe() {
        // Near-zero vector should not produce NaN/Inf due to clamp
        let mut embeddings = vec![vec![1e-14f32, 1e-14]];
        normalize_embeddings(&mut embeddings);
        for v in &embeddings[0] {
            assert!(v.is_finite(), "expected finite value, got {v}");
        }
    }

    // ── tokenize_batch happy-path (WordLevel tokenizer) ──────────────────────

    #[test]
    fn test_tokenize_batch_single_text_success() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello world".to_string()];
        let result = processor.tokenize_batch(&texts);
        assert!(result.is_ok(), "Expected Ok, got {:?}", result);
        let batch = result.unwrap();
        assert_eq!(batch.batch_size, 1);
        assert_eq!(batch.original_lengths, vec![11]);
    }

    #[test]
    fn test_tokenize_batch_tensor_shapes_single() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello world".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        assert_eq!(batch.batch_size, 1);
        assert_eq!(batch.input_ids.len(), batch.batch_size * batch.seq_len);
        assert_eq!(batch.attention_mask.len(), batch.batch_size * batch.seq_len);
        assert_eq!(batch.token_type_ids.len(), batch.batch_size * batch.seq_len);
    }

    #[test]
    fn test_tokenize_batch_multiple_texts_batch_dim() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello".to_string(), "hello world test".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        assert_eq!(batch.batch_size, 2);
        assert_eq!(batch.original_lengths.len(), 2);
        assert_eq!(batch.input_ids.len(), batch.batch_size * batch.seq_len);
    }

    #[test]
    fn test_tokenize_batch_token_type_ids_default_zeros() {
        // Single-segment input: every token_type_id must be zero, whether the
        // tokenizer reports zeros itself or the zero-fill fallback is used.
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello world".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        for &v in &batch.token_type_ids {
            assert_eq!(v, 0, "Expected zero token_type_id from WordLevel, got {v}");
        }
    }

    #[test]
    fn test_tokenize_batch_original_lengths_preserved() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello".to_string(), "hello world".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        assert_eq!(batch.original_lengths[0], 5);
        assert_eq!(batch.original_lengths[1], 11);
    }

    #[test]
    fn test_tokenize_batch_three_texts_batch_size_field() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello".to_string(), "world".to_string(), "test".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        assert_eq!(batch.batch_size, 3);
    }

    #[test]
    fn test_tokenize_batch_all_arrays_consistent_length() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["foo bar".to_string(), "baz".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        let expected_len = batch.batch_size * batch.seq_len;
        assert_eq!(batch.input_ids.len(), expected_len);
        assert_eq!(batch.attention_mask.len(), expected_len);
        assert_eq!(batch.token_type_ids.len(), expected_len);
    }

    #[test]
    fn test_tokenize_batch_ids_are_i64() {
        let processor = BatchProcessor::new(simple_tokenizer(), EmbeddingModel::MiniLM, 32);
        let texts = vec!["hello world".to_string()];
        let batch = processor.tokenize_batch(&texts).unwrap();
        // Verify all IDs are non-negative i64 (u32 upcast)
        for &id in &batch.input_ids {
            assert!(id >= 0, "input_id should be non-negative, got {id}");
        }
        for &m in &batch.attention_mask {
            assert!(m == 0 || m == 1, "attention_mask should be 0 or 1, got {m}");
        }
    }

    // ── TokenBudgetBatcher ───────────────────────────────────────────────────

    /// Helper: count one token per byte (`text.len()`) so tests are deterministic.
    ///
    /// `TokenBudgetBatcher::new` lets `DAKERA_TOKEN_BUDGET` override the budget, so the
    /// budget-dependent tests below assume that variable is not set in the test environment.
    fn exact_batcher(budget: usize) -> TokenBudgetBatcher {
        TokenBudgetBatcher::new(budget).with_token_fn(|text| text.len())
    }

    #[test]
    fn test_token_budget_batcher_empty_finish() {
        let mut batcher = exact_batcher(100);
        let batches = batcher.finish();
        assert!(batches.is_empty());
    }

    #[test]
    fn test_token_budget_batcher_single_text_single_batch() {
        let mut batcher = exact_batcher(100);
        batcher.push("hello".to_string()); // 5 tokens
        let batches = batcher.finish();
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0], vec!["hello".to_string()]);
    }

    #[test]
    fn test_token_budget_batcher_fits_small_texts_in_one_batch() {
        let mut batcher = exact_batcher(50);
        for i in 0..5 {
            batcher.push(format!("t{i}")); // 2 tokens each → 10 total, fits in 50
        }
        let batches = batcher.finish();
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0].len(), 5);
    }

    #[test]
    fn test_token_budget_batcher_splits_on_budget_exceeded() {
        // budget=10; five texts of 2 tokens each total exactly 10, so all five
        // stay in the first batch (the budget is inclusive).
        let mut batcher = exact_batcher(10);
        for _ in 0..5 {
            batcher.push("ab".to_string()); // 2 tokens
        }
        // A sixth 2-token text would bring the batch to 12 > 10, so the first
        // batch is flushed and the new text starts the second one.
        batcher.push("cd".to_string()); // 2 tokens
        let batches = batcher.finish();
        assert_eq!(batches.len(), 2);
        assert_eq!(batches[0].len(), 5);
        assert_eq!(batches[1].len(), 1);
    }

    #[test]
    fn test_token_budget_batcher_large_single_text_gets_own_batch() {
        let mut batcher = exact_batcher(10);
        batcher.push("small".to_string()); // 5 tokens
        batcher.push("a".repeat(50)); // 50 tokens > budget → flushes "small", starts new batch
        let batches = batcher.finish();
        assert_eq!(batches.len(), 2);
        assert_eq!(batches[0].len(), 1);
        assert_eq!(batches[0][0], "small");
        // The oversized text must not be dropped or split: it sits alone in batch 2.
        assert_eq!(batches[1].len(), 1);
        assert_eq!(batches[1][0].len(), 50);
    }

    #[test]
    fn test_token_budget_batcher_finish_resets_state() {
        let mut batcher = exact_batcher(100);
        batcher.push("hello".to_string());
        let _first = batcher.finish();
        batcher.push("world".to_string());
        let second = batcher.finish();
        assert_eq!(second.len(), 1);
        assert_eq!(second[0][0], "world");
    }

    #[test]
    fn test_token_budget_batcher_push_all() {
        let mut batcher = exact_batcher(100);
        batcher.push_all(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
        let batches = batcher.finish();
        assert_eq!(batches.len(), 1);
        assert_eq!(batches[0].len(), 3);
    }

    #[test]
    fn test_token_budget_batcher_pending_count() {
        let mut batcher = exact_batcher(100);
        assert_eq!(batcher.pending_count(), 0);
        batcher.push("hello".to_string());
        assert_eq!(batcher.pending_count(), 1);
        batcher.push("world".to_string());
        assert_eq!(batcher.pending_count(), 2);
    }

    #[test]
    fn test_token_budget_batcher_pending_tokens() {
        let mut batcher = exact_batcher(100);
        assert_eq!(batcher.pending_tokens(), 0);
        batcher.push("hello".to_string()); // 5 tokens
        assert_eq!(batcher.pending_tokens(), 5);
        batcher.push("world".to_string()); // 5 more
        assert_eq!(batcher.pending_tokens(), 10);
        let _ = batcher.finish();
        assert_eq!(batcher.pending_tokens(), 0);
        assert_eq!(batcher.pending_count(), 0);
    }

    #[test]
    fn test_token_budget_batcher_default_estimator() {
        // Default estimator is `len / 4` with a floor of 1 token.
        // Only the first push after a reset is checked, which never triggers a
        // flush, so the assertions hold for any budget.
        let mut batcher = TokenBudgetBatcher::new(100);
        batcher.push(String::new()); // 0 / 4 → clamped to 1
        assert_eq!(batcher.pending_tokens(), 1);
        let _ = batcher.finish();
        batcher.push("abcdefgh".to_string()); // 8 / 4 = 2
        assert_eq!(batcher.pending_tokens(), 2);
    }

    // ── truncate_mrl tests ───────────────────────────────────────────────────

    #[test]
    fn test_mrl_truncation_basic() {
        let embedding = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let truncated = truncate_mrl(&embedding, 4).unwrap();
        assert_eq!(truncated.len(), 4);
    }

    #[test]
    fn test_mrl_truncation_normalized() {
        let embedding = vec![3.0f32, 4.0, 0.0, 0.0];
        let truncated = truncate_mrl(&embedding, 2).unwrap();
        // [3, 4] norm = 5 → [0.6, 0.8]
        assert!((truncated[0] - 0.6).abs() < 1e-5);
        assert!((truncated[1] - 0.8).abs() < 1e-5);
    }

    #[test]
    fn test_mrl_truncation_256_from_1024() {
        let embedding: Vec<f32> = (0..1024).map(|i| i as f32).collect();
        let truncated = truncate_mrl(&embedding, 256).unwrap();
        assert_eq!(truncated.len(), 256);
        // L2 norm should be ~1.0
        let norm: f32 = truncated.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-4, "norm={norm}");
    }

    #[test]
    fn test_mrl_truncation_full_dimension_is_noop_shape() {
        // An all-zero vector truncated to its full length keeps its shape, and the
        // clamped norm keeps every component a finite zero (no 0/0 → NaN).
        let embedding = vec![0.0f32; 1024];
        let truncated = truncate_mrl(&embedding, 1024).unwrap();
        assert_eq!(truncated.len(), 1024);
        assert!(truncated.iter().all(|v| *v == 0.0));
    }

    #[test]
    fn test_mrl_truncation_zero_target_dim_error() {
        let embedding = vec![1.0f32; 10];
        let result = truncate_mrl(&embedding, 0);
        assert!(result.is_err());
    }

    #[test]
    fn test_mrl_truncation_target_exceeds_length_error() {
        let embedding = vec![1.0f32; 4];
        let result = truncate_mrl(&embedding, 5);
        assert!(result.is_err());
    }

    #[test]
    fn test_mrl_preserves_semantic_direction() {
        // MRL property: truncated+renormed embedding should point in the same direction
        // as the full embedding. Mathematically, dot(truncated_renormed, full_first_256)
        // equals the L2 norm of the first target_dim slice of the full embedding.
        // For the assertion to reach >0.9, the first 256 dims must contain >81% of
        // total squared norm — here achieved by zeroing dims 256..1024.
        let mut embedding: Vec<f32> = (0..1024)
            .map(|i| if i < 256 { (i % 16) as f32 + 1.0 } else { 0.0 })
            .collect();
        let norm: f32 = embedding
            .iter()
            .map(|x| x * x)
            .sum::<f32>()
            .sqrt()
            .max(1e-12);
        for v in embedding.iter_mut() {
            *v /= norm;
        }
        let truncated = truncate_mrl(&embedding, 256).unwrap();
        // dot(truncated_renormed, first_256_of_unit_embedding) = partial_norm = 1.0 here
        let dot: f32 = truncated
            .iter()
            .zip(embedding.iter().take(256))
            .map(|(a, b)| a * b)
            .sum();
        assert!(dot > 0.9, "cosine similarity {dot} should be >0.9");
    }
}