tokenizers/lib.rs
#![warn(clippy::all)]
#![allow(clippy::upper_case_acronyms)]
#![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
#![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]

//! The core of `tokenizers`, written in Rust.
//! Provides an implementation of today's most used tokenizers, with a focus on performance and
//! versatility.
//!
//! # What is a Tokenizer
//!
//! A Tokenizer works as a pipeline: it processes some raw text as input and outputs an `Encoding`.
//! The various steps of the pipeline are:
//!
//! 1. The `Normalizer`: in charge of normalizing the text. Common examples of normalization are
//!    the [unicode normalization standards](https://unicode.org/reports/tr15/#Norm_Forms), such as `NFD` or `NFKC`.
//!    More details about how to use the `Normalizers` are available in the
//!    [Hugging Face documentation](https://huggingface.co/docs/tokenizers/components#normalizers)
//! 2. The `PreTokenizer`: in charge of creating the initial word splits in the text. The most
//!    common way of splitting text is simply on whitespace.
//! 3. The `Model`: in charge of doing the actual tokenization. An example of a `Model` would be
//!    `BPE` or `WordPiece`.
//! 4. The `PostProcessor`: in charge of post-processing the `Encoding` to add anything relevant
//!    that, for example, a language model would need, such as special tokens.
//!
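//! Each of these steps corresponds to one component of a `TokenizerBuilder`. Below is a minimal
//! sketch wiring the four steps together; the choice of `NFKC` and the byte-level components is
//! only illustrative, and any other combination is set up the same way:
//!
//! ```no_run
//! use tokenizers::models::bpe::BPE;
//! use tokenizers::normalizers::unicode::NFKC;
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::{Result, TokenizerBuilder};
//!
//! fn main() -> Result<()> {
//!     let tokenizer = TokenizerBuilder::new()
//!         .with_normalizer(Some(NFKC))                     // 1. normalization
//!         .with_pre_tokenizer(Some(ByteLevel::default()))  // 2. initial word splits
//!         .with_model(BPE::default())                      // 3. the actual tokenization
//!         .with_post_processor(Some(ByteLevel::default())) // 4. completing the `Encoding`
//!         .with_decoder(Some(ByteLevel::default()))        // the reverse mapping, ids -> text
//!         .build()?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!     Ok(())
//! }
//! ```
//!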
//! ## Loading a pretrained tokenizer from the Hub
//! ```
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//! # #[cfg(feature = "http")]
//! # {
//!     let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None)?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//! # }
//!     Ok(())
//! }
//! ```
//!
//! ## Deserialization and tokenization example
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer, EncodeInput};
//! use tokenizers::models::bpe::BPE;
//!
//! fn main() -> Result<()> {
//!     let bpe_builder = BPE::from_file("./path/to/vocab.json", "./path/to/merges.txt");
//!     let bpe = bpe_builder
//!         .dropout(0.1)
//!         .unk_token("[UNK]".into())
//!         .build()?;
//!
//!     let mut tokenizer = Tokenizer::new(bpe);
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_tokens());
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Training and serialization example
//!
//! ```no_run
//! use tokenizers::decoders::DecoderWrapper;
//! use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
//! use tokenizers::normalizers::{strip::Strip, unicode::NFC, utils::Sequence, NormalizerWrapper};
//! use tokenizers::pre_tokenizers::byte_level::ByteLevel;
//! use tokenizers::pre_tokenizers::PreTokenizerWrapper;
//! use tokenizers::processors::PostProcessorWrapper;
//! use tokenizers::{AddedToken, Model, Result, TokenizerBuilder};
//!
//! use std::path::Path;
//!
//! fn main() -> Result<()> {
//!     let vocab_size: usize = 100;
//!
//!     let mut trainer = BpeTrainerBuilder::new()
//!         .show_progress(true)
//!         .vocab_size(vocab_size)
//!         .min_frequency(0)
//!         .special_tokens(vec![
//!             AddedToken::from(String::from("<s>"), true),
//!             AddedToken::from(String::from("<pad>"), true),
//!             AddedToken::from(String::from("</s>"), true),
//!             AddedToken::from(String::from("<unk>"), true),
//!             AddedToken::from(String::from("<mask>"), true),
//!         ])
//!         .build();
//!
//!     let mut tokenizer = TokenizerBuilder::new()
//!         .with_model(BPE::default())
//!         .with_normalizer(Some(Sequence::new(vec![
//!             Strip::new(true, true).into(),
//!             NFC.into(),
//!         ])))
//!         .with_pre_tokenizer(Some(ByteLevel::default()))
//!         .with_post_processor(Some(ByteLevel::default()))
//!         .with_decoder(Some(ByteLevel::default()))
//!         .build()?;
//!
//!     let pretty = false;
//!     tokenizer
//!         .train_from_files(
//!             &mut trainer,
//!             vec!["path/to/vocab.txt".to_string()],
//!         )?
//!         .save("tokenizer.json", pretty)?;
//!
//!     Ok(())
//! }
//! ```
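//!
//! Once saved, the whole pipeline can be reloaded in a single call (a minimal sketch, assuming
//! the `tokenizer.json` produced by the example above):
//!
//! ```no_run
//! use tokenizers::tokenizer::{Result, Tokenizer};
//!
//! fn main() -> Result<()> {
//!     // Restores the model, normalizer, pre-tokenizer, post-processor and decoder.
//!     let tokenizer = Tokenizer::from_file("tokenizer.json")?;
//!
//!     let encoding = tokenizer.encode("Hey there!", false)?;
//!     println!("{:?}", encoding.get_ids());
//!     Ok(())
//! }
//! ```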
//!
//! # Additional information
//!
//! - tokenizers is designed to leverage CPU parallelism when possible. The level of parallelism
//!   is determined by the total number of cores/threads your CPU provides, but it can be tuned
//!   by setting the `RAYON_RS_NUM_THREADS` environment variable. For example, setting
//!   `RAYON_RS_NUM_THREADS=4` will allocate a maximum of 4 threads.
//!   **_Please note this behavior may evolve in the future_**
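//!
//! For instance, to cap the pool from inside a program rather than from the shell (a sketch;
//! the variable has to be set before the first parallel operation, since the underlying rayon
//! thread pool is initialized lazily on first use):
//!
//! ```no_run
//! // Equivalent to launching the process with `RAYON_RS_NUM_THREADS=4`.
//! std::env::set_var("RAYON_RS_NUM_THREADS", "4");
//! ```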
//!
//! # Features
//! **progressbar**: The progress bar visualization is enabled by default. It might be disabled if
//! compilation for certain targets is not supported by the [termios](https://crates.io/crates/termios)
//! dependency of the [indicatif](https://crates.io/crates/indicatif) progress bar.

#[macro_use]
extern crate log;
#[macro_use]
extern crate lazy_static;

#[macro_use]
extern crate derive_builder;

#[macro_use]
pub mod utils;
pub mod decoders;
pub mod models;
pub mod normalizers;
pub mod pre_tokenizers;
pub mod processors;
pub mod tokenizer;

// Re-export everything from the tokenizer module
pub use tokenizer::*;

// Also re-export the parallelism utils
pub use utils::parallelism;

// Re-export for from_pretrained
#[cfg(feature = "http")]
pub use utils::from_pretrained::FromPretrainedParameters;