Struct rust_tokenizers::vocab::SentencePieceModel[][src]

pub struct SentencePieceModel {
    pub root: TrieNode,
}

SentencePiece Model

Model for the SentencePiece tokenizer. This model performs the SentencePiece unigram decomposition. As such, it contains a Trie data structure for efficient common prefix search.

Expects a SentencePiece protobuf file when created from file.

Fields

root: TrieNode

Trie data structure containing the vocabulary elements and their unigram log-probabilities

Implementations

impl SentencePieceModel[src]

pub fn from_file(path: &str) -> Result<SentencePieceModel, TokenizerError>[src]

Creates a SentencePiece Model from a protobuf file.

Example

use rust_tokenizers::vocab::SentencePieceModel;
let path = "path/to/spiece.model";

let sentence_piece_model = SentencePieceModel::from_file(path);

pub fn common_prefix_search<'a>(&'a self, text: &'a str) -> Vec<&'a TrieNode>[src]

Performs a common prefix search for a given query on the model Trie structure

Arguments

  • text (&str): query to find common prefixes from

Returns

  • Vec<&TrieNode> containing references to the Trie nodes with a common (character based) prefix with the query

Example

use rust_tokenizers::vocab::SentencePieceModel;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();

let query = "hello";
let common_prefixes = sentence_piece_model.common_prefix_search(query);

pub fn decode_forward_token_ref<'a>(
    &'a self,
    token: TokenRef<'a>
) -> Vec<Option<Node<'a>>>
[src]

Decodes a TokenRef to a lattice of potential subtokens. This step is usually followed by a backward step to find the most likely sequence.

Arguments

  • token (TokenRef<'a>): token to decompose in sub-tokens

Returns

  • Vec<Option<Node<'a>>> vector of lattice nodes. The string for the nodes references back to the original token.

Example

use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();

let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);

pub fn decode_backward<'a>(
    &'a self,
    nodes: &'a [Option<Node<'a>>]
) -> Vec<&'a Node<'_>>
[src]

Backward pass through an array of nodes (generated as a result of the forward pass), returning the most likely sequence of nodes. These are usually converted back to tokens in a last step

Arguments

  • nodes (&'a [Option<Node<'a>>]): possible nodes generated from the forward step

Returns

  • Vec<&'a Node> sequence of most likely nodes

Example

use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();

let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
let best_nodes_sequence = sentence_piece_model.decode_backward(&lattice_nodes);

pub fn parse_nodes_to_tokens(&self, nodes: Vec<&Node<'_>>) -> Vec<Token>[src]

Convert the most likely node sequences to a vector of tokens that can be further processed by the tokenizer.

Arguments

  • nodes (Vec<&Node>): sequence of most likely nodes

Returns

  • Vec<Token> sequence of most likely sub-tokens

Example

use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();

let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
let best_nodes_sequence = sentence_piece_model.decode_backward(&lattice_nodes);
let sub_tokens = sentence_piece_model.parse_nodes_to_tokens(best_nodes_sequence);

pub fn populate_masks(&self, tokens: &mut [Token], whitespace_token: char)[src]

Populates the mask field for a sequence of sub-tokens generated by a SentencePiece model. These masks are not generated as part of the standard unigram decomposition and must be added afterwards. Mutates the tokens in-place.

Arguments

  • tokens (&mut [Token]): tokens for which to populate the mask field (mutated in-place)
  • whitespace_token (char): whitespace character used to identify whether a token is a continuation token or not.

Example

use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();

let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
let best_nodes_sequence = sentence_piece_model.decode_backward(&lattice_nodes);
let mut sub_tokens = sentence_piece_model.parse_nodes_to_tokens(best_nodes_sequence);
sentence_piece_model.populate_masks(&mut sub_tokens, ' ');

Trait Implementations

impl Clone for SentencePieceModel[src]

impl Debug for SentencePieceModel[src]

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized
[src]

impl<T> Borrow<T> for T where
    T: ?Sized
[src]

impl<T> BorrowMut<T> for T where
    T: ?Sized
[src]

impl<T> From<T> for T[src]

impl<T, U> Into<U> for T where
    U: From<T>, 
[src]

impl<T> Pointable for T

type Init = T

The type for initializers.

impl<T> ToOwned for T where
    T: Clone
[src]

type Owned = T

The resulting type after obtaining ownership.

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 
[src]

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 
[src]

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.