tokenizers/lib.rs
#![warn(clippy::all)]
#![allow(clippy::upper_case_acronyms)]
#![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
#![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]

//! The core of `tokenizers`, written in Rust.
//! Provides an implementation of today's most used tokenizers, with a focus on performance and
//! versatility.
//!
//! # What is a Tokenizer
//!
//! A Tokenizer works as a pipeline: it processes some raw text as input and outputs an `Encoding`.
//! The various steps of the pipeline are:
//!
//! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
//!    the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
//!    More details about how to use the `Normalizers` are available in the
//!    [Hugging Face documentation](https://huggingface.co/docs/tokenizers/components#normalizers)
//! 2. The `PreTokenizer`: in charge of creating the initial word splits in the text. The most
//!    common way of splitting text is simply on whitespace.
//! 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
//!    `BPE` or `WordPiece`.
//! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
//!    that, for example, a language model would need, such as special tokens.
//!
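//! Each of these steps corresponds to one component of a `TokenizerBuilder`. Below is a minimal
//! sketch wiring the four steps together; the choice of `NFKC` and the byte-level components is
//! only illustrative, and any other combination is set up the same way:
//!
//! ```no_run
//! use tokenizers::models::bpe::BPE;
//! use tokenizers::normalizers::unicode::NFKC;
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::{Result, TokenizerBuilder};
//!
//! fn main() -> Result<()> {
//!     let tokenizer = TokenizerBuilder::new()
//!         .with_normalizer(Some(NFKC))                     // 1. normalization
//!         .with_pre_tokenizer(Some(ByteLevel::default()))  // 2. initial word splits
//!         .with_model(BPE::default())                      // 3. the actual tokenization
//!         .with_post_processor(Some(ByteLevel::default())) // 4. completing the `Encoding`
//!         .with_decoder(Some(ByteLevel::default()))        // the reverse mapping, ids -> text
//!         .build()?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!     Ok(())
//! }
//! ```
//!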
//! ## Loading a pretrained tokenizer from the Hub
//! ```
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//! # #[cfg(feature = "http")]
//! # {
//!     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//! # }
//!     Ok(())
//! }
//! ```
//!
//! ## Deserialization and tokenization example
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
//! use tokenizers::models::bpe::BPE;
//!
//! fn main() -> Result<()> {
//!     let bpe_builder = BPE::from_file("./path/to/vocab.json", "./path/to/merges.txt");
//!     let bpe = bpe_builder
//!         .dropout(0.1)
//!         .unk_token("[UNK]".into())
//!         .build()?;
//!
//!     let mut tokenizer = Tokenizer::new(bpe);
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Training and serialization example
//!
//! ```no_run
//! use tokenizers::decoders::DecoderWrapper;
//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::pre_tokenizers::PreTokenizerWrapper;
//! use tokenizers::processors::PostProcessorWrapper;
//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
//!
//! use std::path::Path;
//!
//! fn main() -> Result<()> {
//!     let vocab_size: usize = 100;
//!
//!     let mut trainer = BpeTrainerBuilder::new()
//!         .show_progress(true)
//!         .vocab_size(vocab_size)
//!         .min_frequency(0)
//!         .special_tokens(vec![
//!             AddedToken::from(String::from("<s>"), true),
//!             AddedToken::from(String::from("<pad>"), true),
//!             AddedToken::from(String::from("</s>"), true),
//!             AddedToken::from(String::from("<unk>"), true),
//!             AddedToken::from(String::from("<mask>"), true),
//!         ])
//!         .build();
//!
//!     let mut tokenizer = TokenizerBuilder::new()
//!         .with_model(BPE::default())
//!         .with_normalizer(Some(Sequence::new(vec![
//!             Strip::new(true, true).into(),
//!             NFC.into(),
//!         ])))
//!         .with_pre_tokenizer(Some(ByteLevel::default()))
//!         .with_post_processor(Some(ByteLevel::default()))
//!         .with_decoder(Some(ByteLevel::default()))
//!         .build()?;
//!
//!     let pretty = false;
//!     tokenizer
//!         .train_from_files(
//!             &mut trainer,
//!             vec!["path/to/vocab.txt".to_string()],
//!         )?
//!         .save("tokenizer.json", pretty)?;
//!
//!     Ok(())
//! }
//! ```
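//!
//! Once saved, the whole pipeline can be reloaded in a single call (a minimal sketch, assuming
//! the `tokenizer.json` produced by the example above):
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//!     // Restores the model, normalizer, pre-tokenizer, post-processor and decoder.
//!     let tokenizer = Tokenizer::from_file("tokenizer.json")?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_ids());
//!     Ok(())
//! }
//! ```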
//!
//! # Additional information
//!
//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism
//!   is determined by the total number of cores/threads your CPU provides, but it can be tuned
//!   by setting the `RAYON_RS_NUM_THREADS` environment variable. For example, setting
//!   `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
//!   **_Please note this behavior may evolve in the future_**
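//!
//! For instance, to cap the pool from inside a program rather than from the shell (a sketch;
//! the variable has to be set before the first parallel operation, since the underlying rayon
//! thread pool is initialized lazily on first use):
//!
//! ```no_run
//! // Equivalent to launching the process with `RAYON_RS_NUM_THREADS=4`.
//! std::env::set_var("RAYON_RS_NUM_THREADS", "4");
//! ```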
//!
//! # Features
//! **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
//! compilation for certain targets is not supported by the [termios](https://crates.io/crates/termios)
//! dependency of the [indicatif](https://crates.io/crates/indicatif) progress bar.

#[macro_use]
extern crate log;
#[macro_use]
extern crate lazy_static;

#[macro_use]
extern crate derive_builder;

#[macro_use]
pub mod utils;
pub mod decoders;
pub mod models;
pub mod normalizers;
pub mod pre_tokenizers;
pub mod processors;
pub mod tokenizer;

// Re-export everything from the tokenizer module
pub use tokenizer::*;

// Also re-export the parallelism utils
pub use utils::parallelism;

// Re-export for from_pretrained
#[cfg(feature = "http")]
pub use utils::from_pretrained::FromPretrainedParameters;