// textprep/subword.rs
//! Subword tokenization traits (minimal).
//!
//! This is intentionally small: it exists to support simple embedding/projection
//! adapters in the workspace without pulling in heavy tokenizer stacks.
//!
//! If/when we standardize on `tokenizers` (HF) or other model tokenizers, keep
//! those as separate, opt-in layers.

use std::collections::HashMap;
10
/// Trait for subword tokenizers.
///
/// Returns a sequence of token IDs for input text.
pub trait SubwordTokenizer: Send + Sync {
    /// Tokenize `text` into a sequence of vocabulary token IDs.
    fn tokenize(&self, text: &str) -> Vec<u32>;
}
17
/// A simple vocabulary lookup tokenizer.
///
/// Note: this is not true BPE. It is a thin adapter for toy projections/tests.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct BpeTokenizer {
    // token string -> token id lookup table
    vocab: HashMap<String, u32>,
}
26
27impl BpeTokenizer {
28    /// Create a tokenizer from a token→id vocabulary map.
29    pub fn from_vocab(vocab: HashMap<String, u32>) -> Self {
30        Self { vocab }
31    }
32}
33
34impl SubwordTokenizer for BpeTokenizer {
35    fn tokenize(&self, text: &str) -> Vec<u32> {
36        text.split_whitespace()
37            .filter_map(|word| self.vocab.get(word).copied())
38            .collect()
39    }
40}