use super::{Token, TokenStream, Tokenizer};
use crate::fts::tokenizer::BoxTokenStream;
#[derive(Clone)]
pub(crate) struct NgramTokenizer {
min_gram: usize,
max_gram: usize,
prefix_only: bool,
}
impl NgramTokenizer {
pub(crate) fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
min_gram,
max_gram,
prefix_only,
}
}
}
pub(crate) struct NgramTokenStream<'a> {
ngram_charidx_iterator: StutteringIterator<CodepointFrontiers<'a>>,
prefix_only: bool,
text: &'a str,
token: Token,
}
impl Tokenizer for NgramTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
From::from(NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text),
self.min_gram,
self.max_gram,
),
prefix_only: self.prefix_only,
text,
token: Token::default(),
})
}
}
impl<'a> TokenStream for NgramTokenStream<'a> {
fn advance(&mut self) -> bool {
if let Some((offset_from, offset_to)) = self.ngram_charidx_iterator.next() {
if self.prefix_only && offset_from > 0 {
return false;
}
self.token.position = 0;
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.clear();
self.token.text.push_str(&self.text[offset_from..offset_to]);
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
struct StutteringIterator<T> {
underlying: T,
min_gram: usize,
max_gram: usize,
memory: Vec<usize>,
cursor: usize,
gram_len: usize,
}
impl<T> StutteringIterator<T>
where T: Iterator<Item = usize>
{
pub(crate) fn new(mut underlying: T, min_gram: usize, max_gram: usize) -> StutteringIterator<T> {
assert!(min_gram > 0);
let memory: Vec<usize> = (&mut underlying).take(max_gram + 1).collect();
if memory.len() <= min_gram {
StutteringIterator {
underlying,
min_gram: 1,
max_gram: 0,
memory,
cursor: 0,
gram_len: 0,
}
} else {
StutteringIterator {
underlying,
min_gram,
max_gram: memory.len() - 1,
memory,
cursor: 0,
gram_len: min_gram,
}
}
}
}
impl<T> Iterator for StutteringIterator<T>
where T: Iterator<Item = usize>
{
type Item = (usize, usize);
fn next(&mut self) -> Option<(usize, usize)> {
if self.gram_len > self.max_gram {
self.gram_len = self.min_gram;
if let Some(next_val) = self.underlying.next() {
self.memory[self.cursor] = next_val;
} else {
self.max_gram -= 1;
}
self.cursor += 1;
if self.cursor >= self.memory.len() {
self.cursor = 0;
}
}
if self.max_gram < self.min_gram {
return None;
}
let start = self.memory[self.cursor % self.memory.len()];
let stop = self.memory[(self.cursor + self.gram_len) % self.memory.len()];
self.gram_len += 1;
Some((start, stop))
}
}
struct CodepointFrontiers<'a> {
s: &'a str,
next_el: Option<usize>,
}
impl<'a> CodepointFrontiers<'a> {
fn for_str(s: &'a str) -> Self {
CodepointFrontiers {
s,
next_el: Some(0),
}
}
}
impl<'a> Iterator for CodepointFrontiers<'a> {
type Item = usize;
fn next(&mut self) -> Option<usize> {
self.next_el.inspect(|&offset| {
if self.s.is_empty() {
self.next_el = None;
} else {
let first_codepoint_width = utf8_codepoint_width(self.s.as_bytes()[0]);
self.s = &self.s[first_codepoint_width..];
self.next_el = Some(offset + first_codepoint_width);
}
})
}
}
const CODEPOINT_UTF8_WIDTH: [u8; 16] = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4];
fn utf8_codepoint_width(b: u8) -> usize {
let higher_4_bits = (b as usize) >> 4;
CODEPOINT_UTF8_WIDTH[higher_4_bits] as usize
}
#[cfg(test)]
mod tests {
use super::{utf8_codepoint_width, CodepointFrontiers, StutteringIterator};
#[test]
fn test_utf8_codepoint_width() {
for i in 0..128 {
assert_eq!(utf8_codepoint_width(i), 1);
}
for i in (128 | 64)..(128 | 64 | 32) {
assert_eq!(utf8_codepoint_width(i), 2);
}
for i in (128 | 64 | 32)..(128 | 64 | 32 | 16) {
assert_eq!(utf8_codepoint_width(i), 3);
}
for i in (128 | 64 | 32 | 16)..256 {
assert_eq!(utf8_codepoint_width(i as u8), 4);
}
}
#[test]
fn test_codepoint_frontiers() {
assert_eq!(CodepointFrontiers::for_str("").collect::<Vec<_>>(), vec![0]);
assert_eq!(
CodepointFrontiers::for_str("abcd").collect::<Vec<_>>(),
vec![0, 1, 2, 3, 4]
);
assert_eq!(
CodepointFrontiers::for_str("aあ").collect::<Vec<_>>(),
vec![0, 1, 4]
);
}
#[test]
fn test_stutterring_iterator_empty() {
let rg: Vec<usize> = vec![0];
let mut it = StutteringIterator::new(rg.into_iter(), 1, 2);
assert_eq!(it.next(), None);
}
#[test]
fn test_stutterring_iterator() {
let mut it = StutteringIterator::new(0..10, 1, 2);
assert_eq!(it.next(), Some((0, 1)));
assert_eq!(it.next(), Some((0, 2)));
assert_eq!(it.next(), Some((1, 2)));
assert_eq!(it.next(), Some((1, 3)));
assert_eq!(it.next(), Some((2, 3)));
assert_eq!(it.next(), Some((2, 4)));
assert_eq!(it.next(), Some((3, 4)));
assert_eq!(it.next(), Some((3, 5)));
assert_eq!(it.next(), Some((4, 5)));
assert_eq!(it.next(), Some((4, 6)));
assert_eq!(it.next(), Some((5, 6)));
assert_eq!(it.next(), Some((5, 7)));
assert_eq!(it.next(), Some((6, 7)));
assert_eq!(it.next(), Some((6, 8)));
assert_eq!(it.next(), Some((7, 8)));
assert_eq!(it.next(), Some((7, 9)));
assert_eq!(it.next(), Some((8, 9)));
assert_eq!(it.next(), None);
}
}