[][src]Struct tokenizers::tokenizer::NormalizedString

pub struct NormalizedString { /* fields omitted */ }

A NormalizedString takes care of processing an "original" string to modify it and obtain a "normalized" string. It keeps both versions of the string, the alignment information between them, and provides an interface to retrieve ranges of each string, using offsets from either of them.

It is possible to retrieve a part of the original string, by indexing it with offsets from the normalized one, and the other way around too. It is also possible to convert offsets from one referential to the other one easily.

Methods

impl NormalizedString[src]

pub fn from(s: &str) -> Self[src]

Create a NormalizedString from the given str

pub fn get(&self) -> &str[src]

Return the normalized string

pub fn get_original(&self) -> &str[src]

Return the original string

pub fn convert_offsets<T: RangeBounds<usize>>(
    &self,
    range: Range<T>
) -> Option<Range<usize>>
[src]

Convert the given offsets range from one referential to the other one: Original => Normalized or Normalized => Original

pub fn get_range<T: RangeBounds<usize>>(&self, range: Range<T>) -> Option<&str>[src]

Return a range of the normalized string (indexing on char not bytes)

pub fn get_range_original<T: RangeBounds<usize>>(
    &self,
    range: Range<T>
) -> Option<&str>
[src]

Return a range of the original string (indexing on char not bytes)

pub fn transform<I: Iterator<Item = (char, isize)>>(
    &mut self,
    dest: I,
    initial_offset: usize
)
[src]

Applies transformations to the current normalized version, updating the current alignments with the new ones. This method expects an Iterator yielding each char of the new normalized string, together with a change (an isize) equal to:

  • 1 if this is a new char
  • -N if the char is right before N removed chars
  • 0 if this char represents the old one (even if changed)

Since it is possible that the normalized string doesn't include some of the characters at the beginning of the original one, we need an initial_offset which represents the number of removed chars at the very beginning.

change should never be more than 1. If multiple chars are added, each of them has a change of 1; a larger value doesn't make any sense. Any value above 1 is treated as 1.

pub fn nfd(&mut self) -> &mut Self[src]

Applies NFD normalization

pub fn nfkd(&mut self) -> &mut Self[src]

Applies NFKD normalization

pub fn nfc(&mut self) -> &mut Self[src]

Applies NFC normalization

pub fn nfkc(&mut self) -> &mut Self[src]

Applies NFKC normalization

pub fn filter<F: Fn(&char) -> bool>(&mut self, filter: F) -> &mut Self[src]

Applies filtering over our characters

pub fn prepend(&mut self, s: &str) -> &mut Self[src]

Prepend the given string to ourself

pub fn append(&mut self, s: &str) -> &mut Self[src]

Append the given string to ourself

pub fn map<F: Fn(char) -> char>(&mut self, map: F) -> &mut Self[src]

Map our characters

pub fn for_each<F: FnMut(char)>(&mut self, foreach: F) -> &mut Self[src]

Calls the given function for each character

pub fn lowercase(&mut self) -> &mut Self[src]

Lowercase

pub fn uppercase(&mut self) -> &mut Self[src]

Uppercase

pub fn split_off(&mut self, at: usize) -> Self[src]

Split off ourselves, returning a new Self that contains the range [at, len). self will then contain the range [0, at). The provided at indexes on char not bytes.

pub fn merge_with(&mut self, other: &NormalizedString)[src]

Merge with the given NormalizedString by appending it to self

pub fn lstrip(&mut self) -> &mut Self[src]

Remove any leading space(s) of the normalized string

pub fn rstrip(&mut self) -> &mut Self[src]

Remove any trailing space(s) of the normalized string

pub fn strip(&mut self) -> &mut Self[src]

Remove any leading and trailing space(s) of the normalized string

pub fn len(&self) -> usize[src]

Returns the length of the normalized string (counting chars not bytes)

pub fn len_original(&self) -> usize[src]

Returns the length of the original string (counting chars not bytes)

pub fn is_empty(&self) -> bool[src]

Whether empty

Trait Implementations

impl Clone for NormalizedString[src]

impl Debug for NormalizedString[src]

impl Default for NormalizedString[src]

impl PartialEq<NormalizedString> for NormalizedString[src]

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T> ToOwned for T where
    T: Clone
[src]

type Owned = T

The resulting type after obtaining ownership.

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

impl<V, T> VZip<V> for T where
    V: MultiLane<T>,