Struct rust_tokenizers::tokenizer::BaseTokenizer

pub struct BaseTokenizer<T: Vocab> { /* fields omitted */ }

Base tokenizer

Base tokenizer performing:

  • whitespace tokenization
  • splitting on special characters
  • splitting on punctuation
  • splitting on CJK characters
  • (optional) lower casing
  • (optional) accent stripping

This tokenizer is used as a pre-tokenizer step in the BERT and GPT tokenizers.

Implementations

impl<T: Vocab + Sync + Send> BaseTokenizer<T>[src]

pub fn from_file(
    path: &str,
    lower_case: bool,
    strip_accents: bool
) -> Result<BaseTokenizer<T>, TokenizerError>
[src]

Create a new instance of a BaseTokenizer. Expects a vocabulary flat-file as an input.

Parameters

  • path (&str): path to the vocabulary file (only used for special character splitting)
  • lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
  • strip_accents (bool): flag indicating if accents should be stripped from the text

Example

use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();

pub fn from_existing_vocab(
    vocab: T,
    lower_case: bool,
    strip_accents: bool
) -> BaseTokenizer<T>
[src]

Create a new instance of a BaseTokenizer from an existing vocabulary

Parameters

  • vocab (Vocab): a vocabulary, taken by value (ownership is moved into the tokenizer); the vocabulary type must be thread-safe (Sync + Send)
  • lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
  • strip_accents (bool): flag indicating if accents should be stripped from the text

Example

use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BaseVocab, Vocab};
let strip_accents = false;
let lower_case = false;
let base_vocab = BaseVocab::from_file("path/to/vocab/file").unwrap();

let tokenizer = BaseTokenizer::from_existing_vocab(base_vocab, lower_case, strip_accents);

Trait Implementations

impl<T: Vocab + Sync + Send> MultiThreadedTokenizer<T> for BaseTokenizer<T>[src]

impl<T: Vocab + Sync + Send> Tokenizer<T> for BaseTokenizer<T>[src]

Auto Trait Implementations

impl<T> RefUnwindSafe for BaseTokenizer<T> where
    T: RefUnwindSafe

impl<T> Send for BaseTokenizer<T> where
    T: Send

impl<T> Sync for BaseTokenizer<T> where
    T: Sync

impl<T> Unpin for BaseTokenizer<T> where
    T: Unpin

impl<T> UnwindSafe for BaseTokenizer<T> where
    T: UnwindSafe

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T> Pointable for T

type Init = T

The type for initializers.

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.