Struct rust_tokenizers::tokenizer::SentencePieceTokenizer
SentencePiece tokenizer performing:
- text cleaning
- NFKC decomposition
- (optional) lower casing
- SentencePiece decomposition
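For illustration, a minimal usage sketch of this pipeline (the model path is a placeholder for a pretrained SentencePiece protobuf file):

use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer};

// Placeholder path: any pretrained SentencePiece protobuf model.
let tokenizer = SentencePieceTokenizer::from_file("path/to/spiece.model", false).unwrap();
// The input is cleaned, NFKC-normalized, optionally lower-cased,
// and decomposed into sentence pieces.
let pieces: Vec<String> = tokenizer.tokenize("Hello, world!");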
Implementations
impl SentencePieceTokenizer
pub fn from_file(
    path: &str,
    lower_case: bool
) -> Result<SentencePieceTokenizer, TokenizerError>
Create a new instance of a SentencePieceTokenizer. Expects a SentencePiece protobuf file as an input.
Parameters
- path (&str): path to the SentencePiece model file
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
Example
use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer};

let lower_case = false;
let tokenizer = SentencePieceTokenizer::from_file("path/to/vocab/file", lower_case).unwrap();
pub fn from_existing_vocab_and_model(
    vocab: SentencePieceVocab,
    model: SentencePieceModel,
    lower_case: bool
) -> SentencePieceTokenizer
Create a new instance of a SentencePieceTokenizer from an existing vocabulary and model.
Parameters
- vocab (SentencePieceVocab): vocabulary
- model (SentencePieceModel): SentencePiece model
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
Example
use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer};
use rust_tokenizers::vocab::{SentencePieceModel, SentencePieceVocab, Vocab};

let lower_case = false;
let vocab = SentencePieceVocab::from_file("path/to/vocab/file").unwrap();
let model = SentencePieceModel::from_file("path/to/model/file").unwrap();
let tokenizer = SentencePieceTokenizer::from_existing_vocab_and_model(vocab, model, lower_case);
Trait Implementations
impl MultiThreadedTokenizer<SentencePieceVocab> for SentencePieceTokenizer
fn vocab(&self) -> &SentencePieceVocab
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str> + Sync,
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
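As a sketch of the batched methods above (model path and inputs are placeholders), encode_list processes several texts in parallel; the fully qualified call disambiguates from the identically named method on the single-threaded Tokenizer trait:

use rust_tokenizers::tokenizer::{MultiThreadedTokenizer, SentencePieceTokenizer, TruncationStrategy};

// Placeholder path to a pretrained SentencePiece model.
let tokenizer = SentencePieceTokenizer::from_file("path/to/spiece.model", false).unwrap();
let texts = ["First sentence.", "Second sentence."];
// Encode up to 128 tokens per sequence, truncating the longest sequence first, no stride overlap.
let encodings = MultiThreadedTokenizer::encode_list(&tokenizer, &texts, 128, &TruncationStrategy::LongestFirst, 0);
for input in encodings {
    println!("{:?}", input.token_ids);
}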
impl Tokenizer<SentencePieceVocab> for SentencePieceTokenizer
fn vocab(&self) -> &SentencePieceVocab
fn tokenize_to_tokens(&self, text: TokenRef<'_>) -> Vec<Token>
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn encode<S: AsRef<str>>(
    &self,
    text_1: S,
    text_2: Option<S>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str>,
fn decode_to_vec(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool
) -> Vec<String>
fn decode(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> String
fn clean_up_tokenization(&self, input_string: String) -> String
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
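To tie the methods above together, a minimal round-trip sketch of encode and decode (model path and input text are placeholders):

use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer, TruncationStrategy};

// Placeholder path to a pretrained SentencePiece model.
let tokenizer = SentencePieceTokenizer::from_file("path/to/spiece.model", false).unwrap();
// Encode a single sequence (no paired text, hence None).
let encoding = tokenizer.encode("Hello, world!", None, 512, &TruncationStrategy::LongestFirst, 0);
// Decode back to text, skipping special tokens and cleaning up tokenization spaces.
let decoded = tokenizer.decode(encoding.token_ids, true, true);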
Auto Trait Implementations
impl RefUnwindSafe for SentencePieceTokenizer
impl Send for SentencePieceTokenizer
impl Sync for SentencePieceTokenizer
impl Unpin for SentencePieceTokenizer
impl UnwindSafe for SentencePieceTokenizer
Blanket Implementations
impl<T> Any for T where
    T: 'static + ?Sized,
impl<T> Borrow<T> for T where
    T: ?Sized,
impl<T> BorrowMut<T> for T where
    T: ?Sized,
pub fn borrow_mut(&mut self) -> &mut T
impl<T> From<T> for T
impl<T, U> Into<U> for T where
    U: From<T>,
impl<T> Pointable for T
pub const ALIGN: usize
type Init = T
The type for initializers.
pub unsafe fn init(init: <T as Pointable>::Init) -> usize
pub unsafe fn deref<'a>(ptr: usize) -> &'a T
pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T
pub unsafe fn drop(ptr: usize)
impl<T, U> TryFrom<U> for T where
    U: Into<T>,
type Error = Infallible
The type returned in the event of a conversion error.
pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,