//! Tokenize words into word pieces.
//!
//! This crate provides a subword tokenizer. A subword tokenizer
//! splits a token into several pieces, so-called *word pieces*.
//! Word pieces were popularized by the
//! [BERT](https://arxiv.org/abs/1810.04805) natural language encoder.
//!
//! The tokenizer splits a word and returns an iterator over its
//! pieces. Each piece is represented by its string form and its
//! vocabulary index.
//!
//! ~~~
//! use std::fs::File;
//! use std::io::BufReader;
//!
//! use wordpieces::WordPieces;
//!
//! let f = File::open("testdata/test.pieces").unwrap();
//! let word_pieces = WordPieces::from_buf_read(BufReader::new(f)).unwrap();
//!
//! // A word that can be split fully.
//! let pieces = word_pieces
//!     .split("coördinatie")
//!     .map(|p| p.piece())
//!     .collect::<Vec<_>>();
//! assert_eq!(pieces, vec![Some("coördina"), Some("tie")]);
//!
//! // A word that can be split partially.
//! let pieces = word_pieces
//!     .split("voorkomen")
//!     .map(|p| p.piece())
//!     .collect::<Vec<_>>();
//! assert_eq!(pieces, vec![Some("voor"), None]);
//! ~~~
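//!
//! A piece that is not in the vocabulary yields `None`. Below is a
//! minimal sketch of substituting a fallback string for such pieces.
//! The BERT-style `"[UNK]"` marker is an illustrative choice, not
//! something this crate defines:
//!
//! ~~~
//! use std::fs::File;
//! use std::io::BufReader;
//!
//! use wordpieces::WordPieces;
//!
//! let f = File::open("testdata/test.pieces").unwrap();
//! let word_pieces = WordPieces::from_buf_read(BufReader::new(f)).unwrap();
//!
//! // Replace pieces that are missing from the vocabulary by "[UNK]".
//! let pieces = word_pieces
//!     .split("voorkomen")
//!     .map(|p| p.piece().unwrap_or("[UNK]"))
//!     .collect::<Vec<_>>();
//! assert_eq!(pieces, vec!["voor", "[UNK]"]);
//! ~~~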

mod error;
pub use error::WordPiecesError;

mod word_pieces;
pub use word_pieces::{WordPiece, WordPieces, WordPiecesBuilder};