[−][src]Trait tokenizers::tokenizer::PreTokenizer
The PreTokenizer is in charge of doing the pre-segmentation step. It splits the given string
into multiple substrings, keeping track of the offsets of said substrings from the
NormalizedString. On some occasions, the PreTokenizer might need to modify the given
NormalizedString to ensure we can entirely keep track of the offsets and the mapping with
the original string.
Required methods
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
Implementors
impl PreTokenizer for BertPreTokenizer
[src]
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
[src]
impl PreTokenizer for ByteLevel
[src]
As a PreTokenizer, ByteLevel is in charge of transforming all the Unicode characters into
their byte-level counterparts. It also splits the input according to the configured regex.
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
[src]
impl PreTokenizer for CharDelimiterSplit
[src]
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
[src]
impl PreTokenizer for Metaspace
[src]
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
[src]
impl PreTokenizer for Whitespace
[src]
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
[src]
impl PreTokenizer for WhitespaceSplit
[src]
fn pre_tokenize(
    &self,
    normalized: &mut NormalizedString
) -> Result<Vec<(String, Offsets)>>
[src]