Struct rust_tokenizers::vocab::OpenAiGptVocab
GPT Vocab
Vocabulary for GPT tokenizer. Only contains the unknown token as a special value. Expects a JSON-format vocabulary when created from file.
Fields
values: HashMap<String, i64>
A mapping of tokens as strings to indices (i.e. the encoder base)
indices: HashMap<i64, String>
A mapping of token ids to strings (i.e. the decoder base)
unknown_value: &'static str
The string to use for unknown (out of vocabulary) tokens
special_values: HashMap<String, i64>
A mapping of special value tokens as strings to IDs (i.e. the encoder base for special values). Special values typically include things like BOS/EOS markers, class markers, mask markers and padding markers.
special_indices: HashMap<i64, String>
A mapping of special value tokens as IDs to strings (i.e. the decoder base for special values)
Trait Implementations
impl Clone for OpenAiGptVocab
[src]
fn clone(&self) -> OpenAiGptVocab
[src]
pub fn clone_from(&mut self, source: &Self)
1.0.0 · [src]
impl Debug for OpenAiGptVocab
[src]
impl MultiThreadedTokenizer<OpenAiGptVocab> for CtrlTokenizer
[src]
fn vocab(&self) -> &T
[src]
fn tokenize_list_with_offsets<S, ST>(
&self,
text_list: S
) -> Vec<TokensWithOffsets> where
S: AsRef<[ST]>,
ST: AsRef<str> + Sync,
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
S: AsRef<[ST]>,
ST: AsRef<str> + Sync,
[src]
fn encode_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[ST]>,
ST: AsRef<str> + Sync,
[src]
fn encode_pair_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[(ST, ST)]>,
ST: AsRef<str> + Sync,
[src]
fn decode_list(
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
impl MultiThreadedTokenizer<OpenAiGptVocab> for OpenAiGptTokenizer
[src]
fn vocab(&self) -> &T
[src]
fn tokenize_list_with_offsets<S, ST>(
&self,
text_list: S
) -> Vec<TokensWithOffsets> where
S: AsRef<[ST]>,
ST: AsRef<str> + Sync,
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
S: AsRef<[ST]>,
ST: AsRef<str> + Sync,
[src]
fn encode_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[ST]>,
ST: AsRef<str> + Sync,
[src]
fn encode_pair_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[(ST, ST)]>,
ST: AsRef<str> + Sync,
[src]
fn decode_list(
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
impl Tokenizer<OpenAiGptVocab> for CtrlTokenizer
[src]
fn vocab(&self) -> &OpenAiGptVocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
[src]
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn tokenize_list_with_offsets<S, ST>(
&self,
text_list: S
) -> Vec<TokensWithOffsets> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn encode<S: AsRef<str>>(
&self,
text_1: S,
text_2: Option<S>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
[src]
fn encode_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn encode_pair_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[(ST, ST)]>,
ST: AsRef<str>,
[src]
fn decode_to_vec(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool
) -> Vec<String>
[src]
fn decode(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String
[src]
fn clean_up_tokenization(&self, input_string: String) -> String
[src]
fn decode_list(
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
fn build_input_with_special_tokens(
&self,
tokens_ids_with_offsets_1: TokenIdsWithOffsets,
tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
[src]
impl Tokenizer<OpenAiGptVocab> for OpenAiGptTokenizer
[src]
fn vocab(&self) -> &OpenAiGptVocab
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
[src]
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn tokenize_list_with_offsets<S, ST>(
&self,
text_list: S
) -> Vec<TokensWithOffsets> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn encode<S: AsRef<str>>(
&self,
text_1: S,
text_2: Option<S>,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> TokenizedInput
[src]
fn encode_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[ST]>,
ST: AsRef<str>,
[src]
fn encode_pair_list<S, ST>(
&self,
text_list: S,
max_len: usize,
truncation_strategy: &TruncationStrategy,
stride: usize
) -> Vec<TokenizedInput> where
S: AsRef<[(ST, ST)]>,
ST: AsRef<str>,
[src]
fn decode_to_vec(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool
) -> Vec<String>
[src]
fn decode(
&self,
token_ids: Vec<i64>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> String
[src]
fn clean_up_tokenization(&self, input_string: String) -> String
[src]
fn decode_list(
&self,
token_ids_list: Vec<Vec<i64>>,
skip_special_tokens: bool,
clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
fn build_input_with_special_tokens(
&self,
tokens_ids_with_offsets_1: TokenIdsWithOffsets,
tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
[src]
impl Vocab for OpenAiGptVocab
[src]
fn unknown_value() -> &'static str
[src]
fn get_unknown_value(&self) -> &'static str
[src]
fn values(&self) -> &HashMap<String, i64>
[src]
fn indices(&self) -> &HashMap<i64, String>
[src]
fn special_values(&self) -> &HashMap<String, i64>
[src]
fn special_indices(&self) -> &HashMap<i64, String>
[src]
fn from_file(path: &str) -> Result<OpenAiGptVocab, TokenizerError>
[src]
fn token_to_id(&self, token: &str) -> i64
[src]
fn id_to_token(&self, id: &i64) -> String
[src]
fn read_vocab_file(path: &str) -> Result<HashMap<String, i64>, TokenizerError>
[src]
fn _token_to_id(
&self,
token: &str,
values: &HashMap<String, i64>,
special_values: &HashMap<String, i64>,
unknown_value: &str
) -> i64
[src]
fn _id_to_token(
&self,
id: &i64,
indices: &HashMap<i64, String>,
special_indices: &HashMap<i64, String>,
unknown_value: &str
) -> String
[src]
fn _register_as_special_value(
token: &str,
values: &HashMap<String, i64>,
special_values: &mut HashMap<String, i64>
) -> Result<(), TokenizerError>
[src]
fn convert_tokens_to_ids(&self, tokens: &[&str]) -> Vec<i64>
[src]
Auto Trait Implementations
impl RefUnwindSafe for OpenAiGptVocab
impl Send for OpenAiGptVocab
impl Sync for OpenAiGptVocab
impl Unpin for OpenAiGptVocab
impl UnwindSafe for OpenAiGptVocab
Blanket Implementations
impl<T> Any for T where
T: 'static + ?Sized,
[src]
impl<T> Borrow<T> for T where
T: ?Sized,
[src]
impl<T> BorrowMut<T> for T where
T: ?Sized,
[src]
pub fn borrow_mut(&mut self) -> &mut T
[src]
impl<T> From<T> for T
[src]
impl<T, U> Into<U> for T where
U: From<T>,
[src]
impl<T> Pointable for T
pub const ALIGN: usize
type Init = T
The type for initializers.
pub unsafe fn init(init: <T as Pointable>::Init) -> usize
pub unsafe fn deref<'a>(ptr: usize) -> &'a T
pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T
pub unsafe fn drop(ptr: usize)
impl<T> ToOwned for T where
T: Clone,
[src]
type Owned = T
The resulting type after obtaining ownership.
pub fn to_owned(&self) -> T
[src]
pub fn clone_into(&self, target: &mut T)
[src]
impl<T, U> TryFrom<U> for T where
U: Into<T>,
[src]
type Error = Infallible
The type returned in the event of a conversion error.
pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
[src]
impl<T, U> TryInto<U> for T where
U: TryFrom<T>,
[src]