Struct bert_tokenizer::WordPieceTokenizer
source · pub struct WordPieceTokenizer { /* private fields */ }
Expand description
A subword tokenizer that runs the WordPiece tokenization algorithm.
Example
use bert_tokenizer::{Tokenizer, Vocab, WordPieceTokenizer};
let mut vocab = Vocab::new();
vocab.insert("hello".to_string(), 0);
vocab.insert("world".to_string(), 1);
vocab.insert("!".to_string(), 2);
vocab.insert("##!".to_string(), 3);
vocab.insert("##world".to_string(), 4);
vocab.insert("##hello".to_string(), 5);
let tokenizer = WordPieceTokenizer::new(vocab).build();
let tokens = tokenizer.tokenize("hello world!");
assert_eq!(tokens, vec!["hello", "world", "##!"]);