Struct rust_tokenizers::vocab::SentencePieceModel
pub struct SentencePieceModel { pub root: TrieNode, }
SentencePiece Model
Model for the SentencePiece tokenizer. This model performs the SentencePiece unigram decomposition and contains a Trie data structure for efficient common prefix search.
Expects a SentencePiece protobuf file when created from file.
Fields
root: TrieNode
Trie data structure containing the vocabulary elements and their unigram log-probabilities
Implementations
from_file
Creates a SentencePieceModel from a protobuf file.
Example
use rust_tokenizers::vocab::SentencePieceModel;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path);
common_prefix_search
Performs a common prefix search for a given query on the model Trie structure.
Arguments
- text (&str): query to find common prefixes from
Returns
Vec<&TrieNode> containing references to the Trie nodes with a common (character-based) prefix with the query
Example
use rust_tokenizers::vocab::SentencePieceModel;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();
let query = "hello";
let common_prefixes = sentence_piece_model.common_prefix_search(query);
decode_forward_token_ref
Decodes a TokenRef to a lattice of potential subtokens. This step is usually followed by a backward step to find the most likely sequence.
Arguments
- token (TokenRef<'a>): token to decompose into sub-tokens
Returns
Vec<Option<Node<'a>>> vector of lattice nodes. The strings in the nodes reference back to the original token.
Example
use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();
let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
decode_backward
Backward pass through an array of nodes (generated as a result of the forward pass), returning the most likely sequence of nodes. These are usually converted back to tokens in a final step.
Arguments
- nodes (&'a [Option<Node<'a>>]): possible nodes generated from the forward step
Returns
Vec<&'a Node> sequence of most likely nodes
Example
use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();
let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
let best_nodes_sequence = sentence_piece_model.decode_backward(&lattice_nodes);
parse_nodes_to_tokens
Converts the most likely node sequence to a vector of tokens that can be further processed by the tokenizer.
Arguments
- nodes (Vec<&Node>): sequence of most likely nodes
Returns
Vec<Token> sequence of most likely sub-tokens
Example
use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();
let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
let best_nodes_sequence = sentence_piece_model.decode_backward(&lattice_nodes);
let sub_tokens = sentence_piece_model.parse_nodes_to_tokens(best_nodes_sequence);
populate_masks
Populates the mask field for a sequence of sub-tokens generated by a SentencePiece model. These masks are not generated as part of the standard unigram decomposition and must be added afterwards. Mutates the tokens in place.
Arguments
- tokens (&mut [Token]): tokens for which to set the masks
- whitespace_char (char): whitespace character used to identify whether a token is a continuation token
Example
use rust_tokenizers::vocab::SentencePieceModel;
use rust_tokenizers::TokenRef;
let path = "path/to/spiece.model";
let sentence_piece_model = SentencePieceModel::from_file(path).unwrap();
let token = TokenRef::new("hello", &[0, 1, 2, 3]);
let lattice_nodes = sentence_piece_model.decode_forward_token_ref(token);
let best_nodes_sequence = sentence_piece_model.decode_backward(&lattice_nodes);
let mut sub_tokens = sentence_piece_model.parse_nodes_to_tokens(best_nodes_sequence);
sentence_piece_model.populate_masks(&mut sub_tokens, ' ');
Trait Implementations
Auto Trait Implementations
impl RefUnwindSafe for SentencePieceModel
impl Send for SentencePieceModel
impl Sync for SentencePieceModel
impl Unpin for SentencePieceModel
impl UnwindSafe for SentencePieceModel
Blanket Implementations
Mutably borrows from an owned value.