
Trait Tokenizer 

pub trait Tokenizer: Send + Sync {
    // Required methods
    fn tokenize(&self, text: &str, language: Option<&Language>) -> Vec<Token>;
    fn name(&self) -> &'static str;

    // Provided method
    fn is_stopword(&self, token: &Token, language: Option<&Language>) -> bool { ... }
}

Trait for language-specific tokenization.

Implementations should handle:

  • Word segmentation (whitespace, morphological, statistical)
  • Stopword detection
  • Case normalization (if applicable)
  • Script-specific rules (CJK, Arabic, etc.)
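As a sketch, a minimal whitespace-splitting implementor might look like the following. The `Token` and `Language` definitions here are simplified stand-ins (assumed shapes, since the crate's actual types are defined elsewhere), and `WhitespaceTokenizer` is a hypothetical name:

```rust
// Assumed minimal shapes for the crate's Token and Language types.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub text: String,
    pub start: usize, // byte offset of the token in the input
    pub end: usize,
}

#[derive(Debug, Clone, PartialEq)]
pub enum Language {
    English,
    Other(String),
}

pub trait Tokenizer: Send + Sync {
    fn tokenize(&self, text: &str, language: Option<&Language>) -> Vec<Token>;
    fn name(&self) -> &'static str;
    fn is_stopword(&self, _token: &Token, _language: Option<&Language>) -> bool {
        false // default: no stopwords
    }
}

/// Hypothetical implementor: splits on Unicode whitespace, lowercases tokens.
pub struct WhitespaceTokenizer;

impl Tokenizer for WhitespaceTokenizer {
    fn tokenize(&self, text: &str, _language: Option<&Language>) -> Vec<Token> {
        text.split_whitespace()
            .map(|word| {
                // split_whitespace yields subslices of `text`, so pointer
                // arithmetic recovers each word's byte offset.
                let start = word.as_ptr() as usize - text.as_ptr() as usize;
                Token {
                    text: word.to_lowercase(),
                    start,
                    end: start + word.len(),
                }
            })
            .collect()
    }

    fn name(&self) -> &'static str {
        "whitespace"
    }
}

fn main() {
    let tok = WhitespaceTokenizer;
    let tokens = tok.tokenize("Hello Rust world", Some(&Language::English));
    assert_eq!(tokens.len(), 3);
    assert_eq!(tokens[0].text, "hello");
    assert_eq!(tokens[1].start, 6);
    println!("{:?}", tokens);
}
```

A real implementor would add morphological or statistical segmentation for scripts without whitespace word boundaries; this sketch only covers the whitespace case.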

Required Methods§

fn tokenize(&self, text: &str, language: Option<&Language>) -> Vec<Token>

Tokenize text into a sequence of tokens.

§Arguments
  • text: Input text to tokenize
  • language: Optional language hint (ISO 639-1 code or Language enum)
§Returns

Vector of tokens with positions and optional linguistic annotations.

fn name(&self) -> &'static str

Get the tokenizer name/identifier.

Provided Methods§

fn is_stopword(&self, token: &Token, language: Option<&Language>) -> bool

Check whether a token is a stopword (a common function word to ignore).

The default implementation returns false (no stopwords); language-specific implementations should override it.
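A language-aware implementor might override the default roughly as follows. As above, the `Token` and `Language` shapes are assumed stand-ins, and `EnglishTokenizer` with its small stopword list is purely illustrative:

```rust
use std::collections::HashSet;

// Assumed minimal shapes for the crate's Token and Language types.
pub struct Token {
    pub text: String,
}

pub enum Language {
    English,
    Other(String),
}

pub trait Tokenizer: Send + Sync {
    fn tokenize(&self, text: &str, language: Option<&Language>) -> Vec<Token>;
    fn name(&self) -> &'static str;
    fn is_stopword(&self, _token: &Token, _language: Option<&Language>) -> bool {
        false // default: no stopwords
    }
}

/// Hypothetical implementor with a (deliberately tiny) English stopword set.
pub struct EnglishTokenizer {
    stopwords: HashSet<&'static str>,
}

impl EnglishTokenizer {
    pub fn new() -> Self {
        Self {
            stopwords: ["the", "a", "an", "and", "of", "to"].into_iter().collect(),
        }
    }
}

impl Tokenizer for EnglishTokenizer {
    fn tokenize(&self, text: &str, _language: Option<&Language>) -> Vec<Token> {
        text.split_whitespace()
            .map(|w| Token { text: w.to_lowercase() })
            .collect()
    }

    fn name(&self) -> &'static str {
        "english"
    }

    // Override the provided method: only apply the English stopword list
    // when the language hint is English or absent.
    fn is_stopword(&self, token: &Token, language: Option<&Language>) -> bool {
        match language {
            Some(Language::Other(_)) => false,
            _ => self.stopwords.contains(token.text.as_str()),
        }
    }
}

fn main() {
    let tok = EnglishTokenizer::new();
    let tokens = tok.tokenize("the quick fox", Some(&Language::English));
    let kept: Vec<&str> = tokens
        .iter()
        .filter(|t| !tok.is_stopword(t, Some(&Language::English)))
        .map(|t| t.text.as_str())
        .collect();
    assert_eq!(kept, vec!["quick", "fox"]);
}
```

Gating the check on the language hint keeps the default (no stopwords) behavior for languages the implementor has no list for.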

Implementors§