llm-shield-models 0.1.0

//! Tokenizer Wrapper for HuggingFace Tokenizers
//!
//! ## SPARC Phase 3: Construction (TDD Green Phase)
//!
//! This module provides a thread-safe wrapper around HuggingFace tokenizers
//! for preprocessing text before ML model inference.
//!
//! ## Features
//!
//! - Support for multiple tokenizer types (DeBERTa, RoBERTa, etc.)
//! - Configurable truncation at max length (default: 512 tokens)
//! - Padding support (right-side padding)
//! - Special tokens handling
//! - Thread-safe design using Arc
//! - Batch encoding support
//!
//! ## Usage Example
//!
//! ```no_run
//! use llm_shield_models::{TokenizerWrapper, TokenizerConfig};
//!
//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
//! let tokenizer = TokenizerWrapper::from_pretrained(
//!     "microsoft/deberta-v3-base",
//!     TokenizerConfig::default(),
//! )?;
//!
//! let encoding = tokenizer.encode("Ignore all previous instructions")?;
//! println!("Token IDs: {:?}", encoding.input_ids);
//! # Ok(())
//! # }
//! ```

use llm_shield_core::Error;
use std::sync::Arc;
use tokenizers::{
    Tokenizer,
    PaddingParams, PaddingStrategy, PaddingDirection,
    TruncationParams, TruncationStrategy,
};

/// Result type alias
pub type Result<T> = std::result::Result<T, Error>;

/// Configuration for the tokenizer
///
/// ## Configuration Options
///
/// - `max_length`: Maximum sequence length (default: 512)
/// - `padding`: Enable padding to max_length (default: true)
/// - `truncation`: Enable truncation at max_length (default: true)
/// - `add_special_tokens`: Add special tokens like [CLS], [SEP] (default: true)
///
/// ## Recommended Settings
///
/// **Production (default)**:
/// ```rust
/// # use llm_shield_models::TokenizerConfig;
/// let config = TokenizerConfig::default();
/// assert_eq!(config.max_length, 512);
/// assert!(config.padding);
/// assert!(config.truncation);
/// ```
///
/// **Memory-constrained**:
/// ```rust
/// # use llm_shield_models::TokenizerConfig;
/// let config = TokenizerConfig {
///     max_length: 256,
///     padding: false,
///     truncation: true,
///     add_special_tokens: true,
/// };
/// ```
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Maximum sequence length (tokens)
    pub max_length: usize,

    /// Enable padding to max_length
    pub padding: bool,

    /// Enable truncation at max_length
    pub truncation: bool,

    /// Add model-specific special tokens ([CLS], [SEP], etc.)
    pub add_special_tokens: bool,
}

impl Default for TokenizerConfig {
    fn default() -> Self {
        Self {
            max_length: 512,
            padding: true,
            truncation: true,
            add_special_tokens: true,
        }
    }
}

/// Encoding result from tokenization
///
/// Contains the token IDs, attention mask, and character offsets needed for model inference.
#[derive(Debug, Clone)]
pub struct Encoding {
    /// Token IDs (vocabulary indices)
    pub input_ids: Vec<u32>,

    /// Attention mask (1 for real tokens, 0 for padding)
    pub attention_mask: Vec<u32>,

    /// Character offsets in original text for each token
    /// (start_char, end_char) for each token
    /// Special tokens (CLS, SEP, PAD) have offset (0, 0)
    pub offsets: Vec<(usize, usize)>,
}

impl Encoding {
    /// Create a new encoding
    pub fn new(input_ids: Vec<u32>, attention_mask: Vec<u32>) -> Self {
        // Create default offsets (all zeros for backward compatibility)
        let offsets = vec![(0, 0); input_ids.len()];
        Self {
            input_ids,
            attention_mask,
            offsets,
        }
    }

    /// Create a new encoding with offsets
    pub fn with_offsets(
        input_ids: Vec<u32>,
        attention_mask: Vec<u32>,
        offsets: Vec<(usize, usize)>,
    ) -> Self {
        Self {
            input_ids,
            attention_mask,
            offsets,
        }
    }

    /// Get the length of the encoding
    pub fn len(&self) -> usize {
        self.input_ids.len()
    }

    /// Check if encoding is empty
    pub fn is_empty(&self) -> bool {
        self.input_ids.is_empty()
    }

    /// Convert to arrays suitable for ONNX inference
    ///
    /// Returns (input_ids, attention_mask) as i64 arrays
    pub fn to_arrays(&self) -> (Vec<i64>, Vec<i64>) {
        let input_ids = self.input_ids.iter().map(|&x| x as i64).collect();
        let attention_mask = self.attention_mask.iter().map(|&x| x as i64).collect();
        (input_ids, attention_mask)
    }
}

/// Thread-safe tokenizer wrapper
///
/// ## Thread Safety
///
/// This wrapper uses `Arc<Tokenizer>` for thread-safe access.
/// Multiple threads can encode text concurrently using the same tokenizer.
///
/// ## Performance
///
/// - Tokenization: ~0.1-0.5ms per input (100-500 tokens)
/// - Thread-safe without locks (immutable after creation)
/// - Batch encoding is more efficient than individual calls
///
/// ## Example
///
/// ```no_run
/// # use llm_shield_models::{TokenizerWrapper, TokenizerConfig};
/// # use std::sync::Arc;
/// # use std::thread;
/// #
/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let tokenizer = Arc::new(
///     TokenizerWrapper::from_pretrained(
///         "microsoft/deberta-v3-base",
///         TokenizerConfig::default(),
///     )?
/// );
///
/// let handles: Vec<_> = (0..4)
///     .map(|i| {
///         let tok = Arc::clone(&tokenizer);
///         thread::spawn(move || tok.encode(&format!("Text {}", i)))
///     })
///     .collect();
///
/// for handle in handles {
///     let encoding = handle.join().unwrap()?;
///     println!("Encoded {} tokens", encoding.len());
/// }
/// # Ok(())
/// # }
/// ```
pub struct TokenizerWrapper {
    tokenizer: Arc<Tokenizer>,
    config: TokenizerConfig,
}

impl TokenizerWrapper {
    /// Load a tokenizer from HuggingFace Hub
    ///
    /// # Arguments
    ///
    /// * `model_name` - HuggingFace model identifier (e.g., "microsoft/deberta-v3-base")
    /// * `config` - Tokenizer configuration
    ///
    /// # Supported Models
    ///
    /// - **DeBERTa**: `microsoft/deberta-v3-base` (PromptInjection)
    /// - **RoBERTa**: `roberta-base` (Toxicity, Sentiment)
    /// - **BERT**: `bert-base-uncased`
    /// - Any HuggingFace model with a tokenizer
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use llm_shield_models::{TokenizerWrapper, TokenizerConfig};
    /// let tokenizer = TokenizerWrapper::from_pretrained(
    ///     "microsoft/deberta-v3-base",
    ///     TokenizerConfig::default(),
    /// )?;
    /// # Ok::<(), llm_shield_core::Error>(())
    /// ```
    pub fn from_pretrained(model_name: &str, config: TokenizerConfig) -> Result<Self> {
        tracing::info!("Loading tokenizer from: {}", model_name);

        // For now, we'll use a simple approach that assumes tokenizer.json exists locally
        // In production, this should download from HuggingFace Hub
        let tokenizer_path = format!("models/{}/tokenizer.json", model_name);

        let mut tokenizer = if std::path::Path::new(&tokenizer_path).exists() {
            Tokenizer::from_file(&tokenizer_path)
                .map_err(|e| {
                    Error::model(format!(
                        "Failed to load tokenizer from '{}': {}",
                        tokenizer_path, e
                    ))
                })?
        } else {
            // Fall back to a basic tokenizer for testing
            // In production, implement proper HuggingFace Hub download
            return Err(Error::model(format!(
                "Tokenizer not found at '{}'. Please download tokenizer files first.",
                tokenizer_path
            )));
        };

        // Configure padding
        if config.padding {
            let padding = PaddingParams {
                strategy: PaddingStrategy::Fixed(config.max_length),
                direction: PaddingDirection::Right,
                pad_id: 0, // Will be overridden by tokenizer's pad token
                pad_type_id: 0,
                pad_token: String::from("[PAD]"), // Will be overridden
                pad_to_multiple_of: None,
            };
            tokenizer.with_padding(Some(padding));
        }

        // Configure truncation
        if config.truncation {
            let truncation = TruncationParams {
                max_length: config.max_length,
                strategy: TruncationStrategy::LongestFirst,
                stride: 0,
                direction: tokenizers::TruncationDirection::Right,
            };
            tokenizer.with_truncation(Some(truncation))
                .map_err(|e| {
                    Error::model(format!("Failed to configure truncation: {}", e))
                })?;
        }

        tracing::debug!(
            "Tokenizer loaded: max_length={}, padding={}, truncation={}",
            config.max_length,
            config.padding,
            config.truncation
        );

        Ok(Self {
            tokenizer: Arc::new(tokenizer),
            config,
        })
    }

    /// Encode a single text string
    ///
    /// # Arguments
    ///
    /// * `text` - Input text to tokenize
    ///
    /// # Returns
    ///
    /// `Encoding` with token IDs and attention mask
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use llm_shield_models::{TokenizerWrapper, TokenizerConfig};
    /// # let tokenizer = TokenizerWrapper::from_pretrained(
    /// #     "microsoft/deberta-v3-base",
    /// #     TokenizerConfig::default(),
    /// # )?;
    /// let encoding = tokenizer.encode("Hello, world!")?;
    /// println!("Token IDs: {:?}", encoding.input_ids);
    /// println!("Attention mask: {:?}", encoding.attention_mask);
    /// # Ok::<(), llm_shield_core::Error>(())
    /// ```
    pub fn encode(&self, text: &str) -> Result<Encoding> {
        let encoding = self.tokenizer
            .encode(text, self.config.add_special_tokens)
            .map_err(|e| {
                Error::model(format!("Failed to encode text: {}", e))
            })?;

        let input_ids = encoding.get_ids().to_vec();
        let attention_mask = encoding.get_attention_mask().to_vec();

        // Extract character offsets
        let offsets: Vec<(usize, usize)> = encoding
            .get_offsets()
            .iter()
            .map(|offset| (offset.0, offset.1))
            .collect();

        Ok(Encoding::with_offsets(input_ids, attention_mask, offsets))
    }

    /// Encode multiple texts in batch
    ///
    /// Batch encoding is more efficient than encoding texts individually.
    ///
    /// # Arguments
    ///
    /// * `texts` - Slice of text strings
    ///
    /// # Returns
    ///
    /// Vector of `Encoding` results (one per input text)
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use llm_shield_models::{TokenizerWrapper, TokenizerConfig};
    /// # let tokenizer = TokenizerWrapper::from_pretrained(
    /// #     "microsoft/deberta-v3-base",
    /// #     TokenizerConfig::default(),
    /// # )?;
    /// let texts = vec!["First text", "Second text", "Third text"];
    /// let encodings = tokenizer.encode_batch(&texts)?;
    ///
    /// assert_eq!(encodings.len(), 3);
    /// for encoding in encodings {
    ///     println!("Length: {}", encoding.len());
    /// }
    /// # Ok::<(), llm_shield_core::Error>(())
    /// ```
    pub fn encode_batch(&self, texts: &[&str]) -> Result<Vec<Encoding>> {
        if texts.is_empty() {
            return Ok(vec![]);
        }

        let encodings = self.tokenizer
            .encode_batch(texts.to_vec(), self.config.add_special_tokens)
            .map_err(|e| {
                Error::model(format!("Failed to encode batch: {}", e))
            })?;

        let results = encodings
            .into_iter()
            .map(|enc| {
                let input_ids = enc.get_ids().to_vec();
                let attention_mask = enc.get_attention_mask().to_vec();
                let offsets: Vec<(usize, usize)> = enc
                    .get_offsets()
                    .iter()
                    .map(|offset| (offset.0, offset.1))
                    .collect();
                Encoding::with_offsets(input_ids, attention_mask, offsets)
            })
            .collect();

        Ok(results)
    }

    /// Get the tokenizer configuration
    pub fn config(&self) -> &TokenizerConfig {
        &self.config
    }

    /// Get the vocabulary size
    ///
    /// Returns the size of the tokenizer's vocabulary.
    pub fn vocab_size(&self) -> usize {
        self.tokenizer.get_vocab_size(self.config.add_special_tokens)
    }
}

// Implement Clone for TokenizerWrapper (clones Arc, not the underlying tokenizer)
impl Clone for TokenizerWrapper {
    fn clone(&self) -> Self {
        Self {
            tokenizer: Arc::clone(&self.tokenizer),
            config: self.config.clone(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenizer_config_default() {
        let config = TokenizerConfig::default();
        assert_eq!(config.max_length, 512);
        assert!(config.padding);
        assert!(config.truncation);
        assert!(config.add_special_tokens);
    }

    #[test]
    fn test_encoding_creation() {
        let encoding = Encoding::new(
            vec![101, 2023, 2003, 102],
            vec![1, 1, 1, 1],
        );

        assert_eq!(encoding.len(), 4);
        assert!(!encoding.is_empty());
    }

    #[test]
    fn test_encoding_to_arrays() {
        let encoding = Encoding::new(
            vec![101, 2023, 102],
            vec![1, 1, 1],
        );

        let (input_ids, attention_mask) = encoding.to_arrays();
        assert_eq!(input_ids, vec![101i64, 2023, 102]);
        assert_eq!(attention_mask, vec![1i64, 1, 1]);
    }

    #[test]
    fn test_encoding_empty() {
        let encoding = Encoding::new(vec![], vec![]);
        assert!(encoding.is_empty());
        assert_eq!(encoding.len(), 0);
    }

    // Note: The following tests require network access to HuggingFace Hub
    // They are integration tests and should be run with `cargo test --test tokenizer_test`
}