// Copyright 2018 The HuggingFace Inc. team.
// Copyright 2019 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! # High performance tokenizers for Rust
//!
//! This crate contains implementation of common tokenizers used in state-of-the-art language models.
//! It is used as the reference tokenization crate of [rust-bert](https://docs.rs/rust-bert/), exposing modern transformer-based
//! models such as BERT, RoBERTa, GPT2, BART, XLNet...
//!
//! The following tokenizers have been implemented and validated against a Python reference implementation:
//! - Sentence Piece (unigram model)
//! - BERT
//! - DistilBERT
//! - RoBERTa
//! - FNet
//! - GPT
//! - GPT2
//! - CTRL
//! - ProphetNet
//! - XLNet
//! - Pegasus
//! - MBart50
//! - M2M100
//! - NLLB
//! - DeBERTa
//! - DeBERTa (v2)
//!
//! The library is structured into vocabularies (for the encoding and decoding of the tokens and registration of special tokens)
//! and tokenizers (splitting the input text into tokens). Generally, a tokenizer will contain a reference vocabulary that may
//! be used as part of the tokenization process (for example, containing a list of subwords or merges).
//!
//! ## Usage example
//!
//! ```no_run
//! # fn main() -> anyhow::Result<()> {
//! use rust_tokenizers::adapters::Example;
//! use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer, TruncationStrategy};
//! use rust_tokenizers::vocab::{BertVocab, Vocab};
//! let vocab_path = "path/to/vocab";
//! let vocab = BertVocab::from_file(&vocab_path)?;
//!
//! let test_sentence = Example::new_from_string("This is a sample sentence to be tokenized");
//! let bert_tokenizer: BertTokenizer = BertTokenizer::from_existing_vocab(vocab, true, true);
//!
//! println!(
//! "{:?}",
//! bert_tokenizer.encode(
//! &test_sentence.sentence_1,
//! None,
//! 128,
//! &TruncationStrategy::LongestFirst,
//! 0
//! )
//! );
//! # Ok(())
//! # }
//! ```
// `lazy_static!` is pulled into the crate-wide macro_use prelude so the
// submodules below can use it. (Position of this declaration does not
// matter in the 2018 edition, but listing it first is conventional.)
#[macro_use]
extern crate lazy_static;

pub mod adapters;
pub mod error;
pub mod tokenizer;
pub mod vocab;

// Convenience re-exports: let downstream crates write e.g.
// `rust_tokenizers::Token` instead of the full
// `rust_tokenizers::tokenizer::base_tokenizer::Token` path.
pub use tokenizer::base_tokenizer::{
    ConsolidatableTokens, ConsolidatedTokenIterator, Mask, Offset, OffsetSize, Token,
    TokenIdsWithOffsets, TokenIdsWithSpecialTokens, TokenRef, TokenTrait, TokenizedInput,
    TokensWithOffsets,
};