// textprep/subword.rs
//! Subword tokenization traits (minimal).
//!
//! This is intentionally small: it exists to support simple embedding/projection
//! adapters in the workspace without pulling in heavy tokenizer stacks.
//!
//! If/when we standardize on `tokenizers` (HF) or other model tokenizers, keep
//! those as separate, opt-in layers.

use std::collections::HashMap;
10
/// A subword tokenizer: maps input text to a sequence of token IDs.
///
/// Implementations must be shareable across threads (`Send + Sync`) so they
/// can back shared embedding/projection adapters.
pub trait SubwordTokenizer: Send + Sync {
    /// Tokenize `text`, returning the IDs of the recognized tokens in order.
    fn tokenize(&self, text: &str) -> Vec<u32>;
}
17
/// A word→id lookup tokenizer backed by a fixed vocabulary map.
///
/// Despite the name this is not a true BPE implementation — it is a thin,
/// dependency-free stand-in for toy projections and tests.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct BpeTokenizer {
    // Maps a whitespace-delimited token string to its numeric id.
    vocab: HashMap<String, u32>,
}
26
27impl BpeTokenizer {
28 /// Create a tokenizer from a token→id vocabulary map.
29 pub fn from_vocab(vocab: HashMap<String, u32>) -> Self {
30 Self { vocab }
31 }
32}
33
34impl SubwordTokenizer for BpeTokenizer {
35 fn tokenize(&self, text: &str) -> Vec<u32> {
36 text.split_whitespace()
37 .filter_map(|word| self.vocab.get(word).copied())
38 .collect()
39 }
40}