// rust_tokenizers/vocab/mod.rs

// Copyright 2019 Guillaume Becquin
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//!# Vocabularies
//!
//! This module contains the vocabularies leveraged by the tokenizer. These contain methods for
//! deserialization of vocabulary files and access by the tokenizers, including:
//! - dictionaries (mapping from token to token ids)
//! - merge files (used by Byte-Pair Encoding tokenizers)
//! - sentence-piece models (trie structure and methods to find common prefix subtokens)
//!
//! The following vocabularies have been implemented:
//! - BERT
//! - ALBERT
//! - GPT2
//! - GPT
//! - Marian
//! - RoBERTa
//! - T5
//! - XLMRoBERTa
//! - XLNet
//! - SentencePiece
//!
//! All vocabularies implement the `Vocab` trait exposing a standard interface for integration with
//! the tokenizers.
34
35mod albert_vocab;
36pub(crate) mod base_vocab;
37mod bert_vocab;
38pub(crate) mod bpe_vocab;
39mod deberta_v2_vocab;
40mod deberta_vocab;
41mod fnet_vocab;
42mod gpt2_vocab;
43mod m2m100_vocab;
44mod marian_vocab;
45mod mbart50_vocab;
46mod nllb_vocab;
47mod openai_gpt_vocab;
48mod pegasus_vocab;
49mod prophetnet_vocab;
50mod reformer_vocab;
51mod roberta_vocab;
52mod sentence_piece_bpe_model;
53mod sentence_piece_unigram_model;
54mod sentence_piece_vocab;
55pub(crate) mod sentencepiece_proto;
56mod t5_vocab;
57mod xlm_roberta_vocab;
58mod xlnet_vocab;
59
60pub use albert_vocab::AlbertVocab;
61pub use base_vocab::{BaseVocab, Vocab};
62pub use bert_vocab::BertVocab;
63pub use bpe_vocab::{BpePairRef, BpePairVocab};
64pub use deberta_v2_vocab::DeBERTaV2Vocab;
65pub use deberta_vocab::DeBERTaVocab;
66pub use fnet_vocab::FNetVocab;
67pub use gpt2_vocab::Gpt2Vocab;
68pub use m2m100_vocab::M2M100Vocab;
69pub use marian_vocab::MarianVocab;
70pub use mbart50_vocab::MBart50Vocab;
71pub use nllb_vocab::NLLBVocab;
72pub(crate) use nllb_vocab::EXTENDED_FAIRSEQ_LANGUAGE_CODES;
73pub use openai_gpt_vocab::OpenAiGptVocab;
74pub use pegasus_vocab::PegasusVocab;
75pub use prophetnet_vocab::ProphetNetVocab;
76pub use reformer_vocab::ReformerVocab;
77pub use roberta_vocab::RobertaVocab;
78pub use sentence_piece_bpe_model::SentencePieceBpeModel;
79pub use sentence_piece_unigram_model::SentencePieceModel;
80pub use sentence_piece_vocab::SentencePieceVocab;
81pub use t5_vocab::T5Vocab;
82pub use xlm_roberta_vocab::XLMRobertaVocab;
83pub use xlnet_vocab::XLNetVocab;