//! Tokenize words into word pieces.
//!
//! This crate provides a subword tokenizer. A subword tokenizer
//! splits a token into several pieces, so-called *word pieces*. Word
//! pieces were popularized by, and are used in, the
//! [BERT](https://arxiv.org/abs/1810.04805) natural language encoder.
//!
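//! The tokenizer splits a word, providing an iterator over its pieces.
//! Each piece is represented as a string and its vocabulary index.
//! When the remainder of a word cannot be split into known pieces, the
//! corresponding piece is `None`, as the second example below shows.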
//!
//! ~~~
//! use std::convert::TryFrom;
//! use std::fs::File;
//! use std::io::{BufRead, BufReader};
//!
//! use wordpieces::{WordPiece, WordPieces};
//!
//! let f = File::open("testdata/test.pieces").unwrap();
//! let word_pieces = WordPieces::from_buf_read(BufReader::new(f)).unwrap();
//!
//! // A word that can be split fully.
//! let pieces = word_pieces.split("coördinatie")
//!     .map(|p| p.piece()).collect::<Vec<_>>();
//! assert_eq!(pieces, vec![Some("coördina"), Some("tie")]);
//!
//! // A word that can be split partially.
//! let pieces = word_pieces.split("voorkomen")
//!     .map(|p| p.piece()).collect::<Vec<_>>();
//! assert_eq!(pieces, vec![Some("voor"), None]);
//! ~~~
// Private implementation modules; selected items are re-exported below.
mod error;
mod word_pieces;

// Re-exports of the error type and the tokenizer types from the modules above.
pub use error::WordPiecesError;
pub use word_pieces::{WordPiece, WordPieces, WordPiecesBuilder};