Struct rust_tokenizers::tokenizer::SentencePieceTokenizer

pub struct SentencePieceTokenizer { /* fields omitted */ }

SentencePiece tokenizer

SentencePiece tokenizer performing:

  • text cleaning
  • NFKC decomposition
  • (optional) lower casing
  • SentencePiece decomposition

Implementations

impl SentencePieceTokenizer[src]

pub fn from_file(
    path: &str,
    lower_case: bool
) -> Result<SentencePieceTokenizer, TokenizerError>
[src]

Create a new instance of a SentencePieceTokenizer. Expects a SentencePiece protobuf file as an input.

Parameters

  • path (&str): path to the SentencePiece model file
  • lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization

Example

use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer};
let lower_case = false;
let tokenizer = SentencePieceTokenizer::from_file("path/to/vocab/file", lower_case).unwrap();

pub fn from_existing_vocab_and_model(
    vocab: SentencePieceVocab,
    model: SentencePieceModel,
    lower_case: bool
) -> SentencePieceTokenizer
[src]

Create a new instance of a SentencePieceTokenizer from an existing vocabulary and model

Parameters

  • vocab (SentencePieceVocab): vocabulary
  • model (SentencePieceModel): SentencePiece model
  • lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization

Example

use rust_tokenizers::tokenizer::{SentencePieceTokenizer, Tokenizer};
use rust_tokenizers::vocab::{SentencePieceModel, SentencePieceVocab, Vocab};
let lower_case = false;
let vocab = SentencePieceVocab::from_file("path/to/vocab/file").unwrap();
let model = SentencePieceModel::from_file("path/to/model/file").unwrap();

let tokenizer = SentencePieceTokenizer::from_existing_vocab_and_model(vocab, model, lower_case);

Trait Implementations

impl MultiThreadedTokenizer<SentencePieceVocab> for SentencePieceTokenizer[src]

impl Tokenizer<SentencePieceVocab> for SentencePieceTokenizer[src]

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T> Pointable for T

type Init = T

The type for initializers.

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.