Struct tantivy::tokenizer::NgramTokenizer

pub struct NgramTokenizer { /* fields omitted */ }

Tokenize the text by splitting words into n-grams of the given size(s)

With this tokenizer, the position is always 0. Beware, however: in the presence of multiple values for the same field, the position will be POSITION_GAP * the index of the value.

Example 1: with (min_gram: 2, max_gram: 3, prefix_only: false), hello is tokenized as:

Term      he    hel   el    ell   ll    llo   lo
Position  0     0     0     0     0     0     0
Offsets   0,2   0,3   1,3   1,4   2,4   2,5   3,5

Example 2: with (min_gram: 2, max_gram: 5, prefix_only: true), hello is tokenized as:

Term      he    hel   hell   hello
Position  0     0     0      0
Offsets   0,2   0,3   0,4    0,5
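
With prefix_only, every emitted gram is anchored at the start of the input, so all offsets begin at 0. This can be verified with the prefix_only constructor documented under Methods below; a minimal sketch:

use tantivy::tokenizer::*;
let tokenizer = NgramTokenizer::prefix_only(2, 5);
let mut stream = tokenizer.token_stream("hello");
let mut terms = Vec::new();
while let Some(token) = stream.next() {
    // Every token starts at offset 0: only prefixes are emitted.
    assert_eq!(token.offset_from, 0);
    terms.push(token.text.clone());
}
assert_eq!(terms, vec!["he", "hel", "hell", "hello"]);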

Example 3: with (min_gram: 2, max_gram: 5, prefix_only: true), hεllo (non-ASCII) is tokenized as:

Term      hε    hεl   hεll   hεllo
Position  0     0     0      0
Offsets   0,3   0,4   0,5    0,6
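
The offsets above are byte offsets into the UTF-8 input: ε occupies two bytes, so the first 2-gram already ends at byte 3. A short sketch illustrating this:

use tantivy::tokenizer::*;
let tokenizer = NgramTokenizer::prefix_only(2, 5);
let mut stream = tokenizer.token_stream("hεllo");
let token = stream.next().unwrap();
assert_eq!(token.text, "hε");
assert_eq!(token.offset_from, 0);
// 3, not 2: offsets count bytes, and 'ε' is a two-byte character.
assert_eq!(token.offset_to, 3);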

Example

use tantivy::tokenizer::*;

// All 2-grams and 3-grams of "hello", grouped by start position,
// shortest gram first at each position.
let tokenizer = NgramTokenizer::new(2, 3, false);
let mut stream = tokenizer.token_stream("hello");
{
    let token = stream.next().unwrap();
    assert_eq!(token.text, "he");
    assert_eq!(token.offset_from, 0);
    assert_eq!(token.offset_to, 2);
}
{
    let token = stream.next().unwrap();
    assert_eq!(token.text, "hel");
    assert_eq!(token.offset_from, 0);
    assert_eq!(token.offset_to, 3);
}
{
    let token = stream.next().unwrap();
    assert_eq!(token.text, "el");
    assert_eq!(token.offset_from, 1);
    assert_eq!(token.offset_to, 3);
}
{
    let token = stream.next().unwrap();
    assert_eq!(token.text, "ell");
    assert_eq!(token.offset_from, 1);
    assert_eq!(token.offset_to, 4);
}
{
    let token = stream.next().unwrap();
    assert_eq!(token.text, "ll");
    assert_eq!(token.offset_from, 2);
    assert_eq!(token.offset_to, 4);
}
{
    let token = stream.next().unwrap();
    assert_eq!(token.text, "llo");
    assert_eq!(token.offset_from, 2);
    assert_eq!(token.offset_to, 5);
}
{
    // At position 3, only the 2-gram "lo" fits before the end of the input.
    let token = stream.next().unwrap();
    assert_eq!(token.text, "lo");
    assert_eq!(token.offset_from, 3);
    assert_eq!(token.offset_to, 5);
}
assert!(stream.next().is_none());
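
To use the tokenizer for indexing rather than calling it directly, register it on the index under a name that the field's indexing options refer to. A hedged sketch (the name "ngram23" and the schema are illustrative choices, not part of this API):

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;

let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
    TextFieldIndexing::default()
        .set_tokenizer("ngram23") // must match the name registered below
        .set_index_option(IndexRecordOption::WithFreqsAndPositions),
);
schema_builder.add_text_field("title", text_options);
let index = Index::create_in_ram(schema_builder.build());
index
    .tokenizers()
    .register("ngram23", NgramTokenizer::new(2, 3, false));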

Methods

impl NgramTokenizer

pub fn new(
    min_gram: usize,
    max_gram: usize,
    prefix_only: bool
) -> NgramTokenizer

Configures a new Ngram tokenizer.

pub fn all_ngrams(min_gram: usize, max_gram: usize) -> NgramTokenizer

Create a NgramTokenizer which generates tokens for all inner ngrams.

This is as opposed to only prefix ngrams.

pub fn prefix_only(min_gram: usize, max_gram: usize) -> NgramTokenizer

Create a NgramTokenizer which only generates tokens for the prefix ngrams.
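
Both convenience constructors appear to be shorthands for new with the prefix_only flag fixed; a sketch of the presumed equivalences:

use tantivy::tokenizer::NgramTokenizer;
// Presumably equivalent to NgramTokenizer::new(2, 3, false).
let _all_grams = NgramTokenizer::all_ngrams(2, 3);
// Presumably equivalent to NgramTokenizer::new(2, 5, true).
let _prefixes = NgramTokenizer::prefix_only(2, 5);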

Trait Implementations

impl<'a> Tokenizer<'a> for NgramTokenizer

type TokenStreamImpl = NgramTokenStream<'a>

Type associated to the resulting token stream.

impl Clone for NgramTokenizer

Auto Trait Implementations

Blanket Implementations

impl<T> Fruit for T where
    T: Send + Downcast

impl<T, U> Into<U> for T where
    U: From<T>, 

impl<T> From<T> for T

impl<T> ToOwned for T where
    T: Clone

type Owned = T

The resulting type after obtaining ownership.

impl<T, U> TryFrom<U> for T where
    U: Into<T>, 

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>, 

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

impl<T> Borrow<T> for T where
    T: ?Sized

impl<T> BorrowMut<T> for T where
    T: ?Sized

impl<T> Any for T where
    T: 'static + ?Sized

impl<T> Erased for T

impl<T> Downcast for T where
    T: Any

impl<T> DowncastSync for T where
    T: Send + Sync + Any