[][src]Struct rust_tokenizers::tokenizer::RobertaTokenizer

pub struct RobertaTokenizer { /* fields omitted */ }

RoBERTa tokenizer

RoBERTa tokenizer performing:

  • splitting on special characters
  • whitespace splitting
  • (optional) lower casing
  • BPE tokenization

Implementations

impl RobertaTokenizer[src]

pub fn from_file(
    vocab_path: &str,
    merges_path: &str,
    lower_case: bool,
    add_prefix_space: bool
) -> Result<RobertaTokenizer, TokenizerError>
[src]

Create a new instance of a RobertaTokenizer. Expects a vocabulary JSON file and a merges file as input.

Parameters

  • vocab_path (&str): path to the vocabulary file
  • merges_path (&str): path to the merges file (used as part of the BPE encoding process)
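  • lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
  • add_prefix_space (bool): flag indicating if a space should be prepended to the input text before tokenization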

Example

use rust_tokenizers::tokenizer::{RobertaTokenizer, Tokenizer};
let lower_case = false;
let add_prefix_space = true;
let tokenizer = RobertaTokenizer::from_file(
    "path/to/vocab/file",
    "path/to/merges/file",
    lower_case,
    add_prefix_space,
)
.unwrap();

pub fn from_existing_vocab_and_merges(
    vocab: RobertaVocab,
    merges: BpePairVocab,
    lower_case: bool,
    add_prefix_space: bool
) -> RobertaTokenizer
[src]

Create a new instance of a RobertaTokenizer from an existing vocabulary and merges

Parameters

  • vocab (RobertaVocab): GPT-like vocabulary
  • merges (BpePairVocab): BPE pairs vocabulary
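  • lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
  • add_prefix_space (bool): flag indicating if a space should be prepended to the input text before tokenization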

Example

use rust_tokenizers::tokenizer::{RobertaTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BpePairVocab, RobertaVocab, Vocab};
let lower_case = false;
let add_prefix_space = true;
let vocab = RobertaVocab::from_file("path/to/vocab/file").unwrap();
let merges = BpePairVocab::from_file("path/to/merges/file").unwrap();

let tokenizer = RobertaTokenizer::from_existing_vocab_and_merges(
    vocab,
    merges,
    lower_case,
    add_prefix_space,
);

Trait Implementations

impl MultiThreadedTokenizer<RobertaVocab> for RobertaTokenizer[src]

impl Tokenizer<RobertaVocab> for RobertaTokenizer[src]

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T> Pointable for T

type Init = T

The type for initializers.

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.