// Copyright 2018 The HuggingFace Inc. team.
// Copyright 2019 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! # High performance tokenizers for Rust
//!
//! This crate contains implementations of common tokenizers used in state-of-the-art language models.
//! It is used as the reference tokenization crate of [rust-bert](https://docs.rs/rust-bert/), exposing modern transformer-based
//! models such as BERT, RoBERTa, GPT2, BART, XLNet...
//!
//! The following tokenizers have been implemented and validated against a Python reference implementation:
//! - SentencePiece (unigram model)
//! - BERT
//! - DistilBERT
//! - RoBERTa
//! - GPT
//! - GPT2
//! - CTRL
//! - XLNet
//!
//! The library is structured into vocabularies (for the encoding and decoding of tokens and the registration of
//! special tokens) and tokenizers (for splitting the input text into tokens). A tokenizer generally holds a
//! reference vocabulary that may be used as part of the tokenization process (for example, containing a list of
//! subwords or merges).
//!
//! ## Usage example
//!
//! ```no_run
//! # fn main() -> anyhow::Result<()> {
//! use rust_tokenizers::adapters::Example;
//! use rust_tokenizers::tokenizer::{BertTokenizer, Tokenizer, TruncationStrategy};
//! use rust_tokenizers::vocab::{BertVocab, Vocab};
//! let vocab_path = "path/to/vocab";
//! let vocab = BertVocab::from_file(&vocab_path)?;
//!
//! let test_sentence = Example::new_from_string("This is a sample sentence to be tokenized");
//! let bert_tokenizer: BertTokenizer = BertTokenizer::from_existing_vocab(vocab, true, true);
//!
//! println!(
//!     "{:?}",
//!     bert_tokenizer.encode(
//!         &test_sentence.sentence_1,
//!         None,
//!         128,
//!         &TruncationStrategy::LongestFirst,
//!         0
//!     )
//! );
//! # Ok(())
//! # }
//! ```

pub mod tokenizer;
pub mod vocab;
pub mod adapters;
pub mod error;

pub use tokenizer::base_tokenizer::{
    ConsolidatableTokens, ConsolidatedTokenIterator, Mask, Offset, OffsetSize, Token,
    TokenIdsWithOffsets, TokenIdsWithSpecialTokens, TokenRef, TokenTrait, TokenizedInput,
};

#[macro_use]
extern crate lazy_static;
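
// A minimal sketch (not part of the crate's public surface) illustrating how the
// offset types re-exported above are typically consumed via the `Tokenizer` trait's
// `tokenize_with_offsets` method. "path/to/vocab" is a placeholder path, and the test
// is marked `#[ignore]` because it assumes a real vocabulary file exists on disk.
#[cfg(test)]
mod offsets_sketch {
    use crate::tokenizer::{BertTokenizer, Tokenizer};
    use crate::vocab::{BertVocab, Vocab};

    #[test]
    #[ignore] // requires a vocabulary file at a real path
    fn tokenize_with_offsets_sketch() {
        // Load a vocabulary from file and build a lower-casing, accent-stripping tokenizer.
        let vocab = BertVocab::from_file("path/to/vocab").unwrap();
        let tokenizer = BertTokenizer::from_existing_vocab(vocab, true, true);

        // `tokenize_with_offsets` returns the tokens together with their character
        // offsets into the original string, so each token can be mapped back to
        // the span of input text it was produced from.
        let output = tokenizer.tokenize_with_offsets("This is a sample sentence to be tokenized");

        // One offset entry per token (offsets are `Option<Offset>`, as special or
        // synthetic tokens may not map back to a span of the input).
        assert_eq!(output.tokens.len(), output.offsets.len());
    }
}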