Struct rust_tokenizers::tokenizer::BaseTokenizer
Base tokenizer
Base tokenizer performing:
- whitespace tokenization
- splitting on special characters
- splitting on punctuation
- splitting on CJK characters
- (optional) lower casing
- (optional) accent stripping
This tokenizer is used as a pre-tokenizer step in the BERT and GPT tokenizers.
Implementations
impl<T: Vocab + Sync + Send> BaseTokenizer<T>
[src]
pub fn from_file(
    path: &str,
    lower_case: bool,
    strip_accents: bool
) -> Result<BaseTokenizer<T>, TokenizerError>
[src]
Create a new instance of a BaseTokenizer
Expects a vocabulary flat-file as an input.
Parameters
- path (&str): path to the vocabulary file (only used for special character splitting)
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::BaseVocab;
let strip_accents = false;
let lower_case = false;
let tokenizer: BaseTokenizer<BaseVocab> =
    BaseTokenizer::from_file("path/to/vocab/file", lower_case, strip_accents).unwrap();
pub fn from_existing_vocab(
    vocab: T,
    lower_case: bool,
    strip_accents: bool
) -> BaseTokenizer<T>
[src]
Create a new instance of a BaseTokenizer
from an existing vocabulary
Parameters
- vocab (Vocab): Thread-safe reference to a vocabulary
- lower_case (bool): flag indicating if the text should be lower-cased as part of the tokenization
- strip_accents (bool): flag indicating if accents should be stripped from the text
Example
use rust_tokenizers::tokenizer::{BaseTokenizer, Tokenizer};
use rust_tokenizers::vocab::{BaseVocab, Vocab};
let strip_accents = false;
let lower_case = false;
let base_vocab = BaseVocab::from_file("path/to/vocab/file").unwrap();
let tokenizer = BaseTokenizer::from_existing_vocab(base_vocab, lower_case, strip_accents);
Trait Implementations
impl<T: Vocab + Sync + Send> MultiThreadedTokenizer<T> for BaseTokenizer<T>
[src]
fn vocab(&self) -> &T
[src]
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
[src]
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str> + Sync,
[src]
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str> + Sync,
[src]
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
impl<T: Vocab + Sync + Send> Tokenizer<T> for BaseTokenizer<T>
[src]
fn vocab(&self) -> &T
[src]
fn tokenize_to_tokens(&self, initial_token: TokenRef<'_>) -> Vec<Token>
[src]
fn tokenize<S: AsRef<str>>(&self, text: S) -> Vec<String>
[src]
fn tokenize_with_offsets<S: AsRef<str>>(&self, text: S) -> TokensWithOffsets
[src]
fn tokenize_list<S, ST>(&self, text_list: S) -> Vec<Vec<String>> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn tokenize_list_with_offsets<S, ST>(
    &self,
    text_list: S
) -> Vec<TokensWithOffsets> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn convert_tokens_to_ids<S, ST>(&self, tokens: S) -> Vec<i64> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn encode<S: AsRef<str>>(
    &self,
    text_1: S,
    text_2: Option<S>,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> TokenizedInput
[src]
fn encode_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[ST]>,
    ST: AsRef<str>,
[src]
fn encode_pair_list<S, ST>(
    &self,
    text_list: S,
    max_len: usize,
    truncation_strategy: &TruncationStrategy,
    stride: usize
) -> Vec<TokenizedInput> where
    S: AsRef<[(ST, ST)]>,
    ST: AsRef<str>,
[src]
fn decode_to_vec(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool
) -> Vec<String>
[src]
fn decode(
    &self,
    token_ids: Vec<i64>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> String
[src]
fn convert_tokens_to_string(&self, tokens: Vec<String>) -> String
[src]
fn clean_up_tokenization(&self, input_string: String) -> String
[src]
fn decode_list(
    &self,
    token_ids_list: Vec<Vec<i64>>,
    skip_special_tokens: bool,
    clean_up_tokenization_spaces: bool
) -> Vec<String>
[src]
fn build_input_with_special_tokens(
    &self,
    tokens_ids_with_offsets_1: TokenIdsWithOffsets,
    tokens_ids_with_offsets_2: Option<TokenIdsWithOffsets>
) -> TokenIdsWithSpecialTokens
[src]
Auto Trait Implementations
impl<T> RefUnwindSafe for BaseTokenizer<T> where
    T: RefUnwindSafe,
impl<T> Send for BaseTokenizer<T> where
    T: Send,
impl<T> Sync for BaseTokenizer<T> where
    T: Sync,
impl<T> Unpin for BaseTokenizer<T> where
    T: Unpin,
impl<T> UnwindSafe for BaseTokenizer<T> where
    T: UnwindSafe,
Blanket Implementations
impl<T> Any for T where
    T: 'static + ?Sized,
[src]
impl<T> Borrow<T> for T where
    T: ?Sized,
[src]
impl<T> BorrowMut<T> for T where
    T: ?Sized,
[src]
pub fn borrow_mut(&mut self) -> &mut T
[src]
impl<T> From<T> for T
[src]
impl<T, U> Into<U> for T where
    U: From<T>,
[src]
impl<T> Pointable for T
pub const ALIGN: usize
type Init = T
The type for initializers.
pub unsafe fn init(init: <T as Pointable>::Init) -> usize
pub unsafe fn deref<'a>(ptr: usize) -> &'a T
pub unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T
pub unsafe fn drop(ptr: usize)
impl<T, U> TryFrom<U> for T where
    U: Into<T>,
[src]
type Error = Infallible
The type returned in the event of a conversion error.
pub fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
[src]
impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,
[src]