use alloc::string::String;
use alloc::vec::Vec;
/// Returns an iterator over every contiguous run of `n` characters in
/// `text`, each yielded as a sub-slice of the original string (no
/// allocation per n-gram).
///
/// Slicing happens on `char` boundaries, so multi-byte UTF-8 text
/// (e.g. Thai) is handled correctly. Yields nothing when `n == 0` or
/// when `text` contains fewer than `n` characters.
pub fn char_ngrams(text: &str, n: usize) -> impl Iterator<Item = &str> {
    CharNgramIter::new(text, n)
}
/// Returns an iterator over every contiguous run of `n` tokens, each
/// yielded as one `String` with the tokens concatenated directly
/// (no separator — suited to scripts like Thai that do not use spaces).
///
/// Yields nothing when `n == 0` or when there are fewer than `n` tokens.
pub fn token_ngrams<'a>(tokens: &'a [&'a str], n: usize) -> impl Iterator<Item = String> + 'a {
    TokenNgramIter { tokens, n, pos: 0 }
}
/// Iterator over the `n`-character substrings of a string slice.
///
/// All `char`-boundary byte offsets are collected up front, so each
/// n-gram is a cheap sub-slice of the original text rather than a
/// fresh allocation.
struct CharNgramIter<'a> {
    /// Source text the yielded slices borrow from.
    text: &'a str,
    /// Number of characters per n-gram.
    n: usize,
    /// Byte offset of every char boundary in `text`, including the
    /// trailing `text.len()`. Left empty when `n == 0`, since nothing
    /// can ever be yielded in that case.
    boundaries: Vec<usize>,
    /// Index into `boundaries` where the next n-gram starts.
    pos: usize,
}

impl<'a> CharNgramIter<'a> {
    /// Builds the iterator, precomputing the char-boundary table.
    fn new(text: &'a str, n: usize) -> Self {
        // n == 0 can never yield an n-gram, so skip the boundary scan
        // (and its allocation) entirely.
        let boundaries: Vec<usize> = if n == 0 {
            Vec::new()
        } else {
            text.char_indices()
                .map(|(i, _)| i)
                .chain(core::iter::once(text.len()))
                .collect()
        };
        CharNgramIter {
            text,
            n,
            boundaries,
            pos: 0,
        }
    }

    /// Number of n-grams not yet yielded.
    fn remaining(&self) -> usize {
        if self.n == 0 {
            0
        } else {
            // `boundaries` holds char_count + 1 entries; a gram starting
            // at `pos` additionally needs boundary `pos + n` to exist.
            self.boundaries.len().saturating_sub(self.pos + self.n)
        }
    }
}

impl<'a> Iterator for CharNgramIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.n == 0 {
            return None;
        }
        let end_idx = self.pos + self.n;
        // Both the start and end boundaries must be inside the table.
        if end_idx >= self.boundaries.len() {
            return None;
        }
        let start_byte = self.boundaries[self.pos];
        let end_byte = self.boundaries[end_idx];
        self.pos += 1;
        // Indices come from char_indices(), so the slice is always on
        // valid char boundaries and cannot panic.
        Some(&self.text[start_byte..end_byte])
    }

    /// Exact size is known from the boundary table; reporting it lets
    /// `collect` preallocate and makes `count` cheap.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let rem = self.remaining();
        (rem, Some(rem))
    }
}
/// Iterator over concatenated runs of `n` consecutive tokens.
struct TokenNgramIter<'a> {
    /// Token slice the n-grams are drawn from.
    tokens: &'a [&'a str],
    /// Number of tokens per n-gram.
    n: usize,
    /// Index of the first token of the next n-gram.
    pos: usize,
}

impl<'a> TokenNgramIter<'a> {
    /// Number of n-grams not yet yielded.
    fn remaining(&self) -> usize {
        if self.n == 0 {
            0
        } else {
            // A gram starting at `pos` consumes tokens `pos..pos + n`.
            (self.tokens.len() + 1).saturating_sub(self.pos + self.n)
        }
    }
}

impl<'a> Iterator for TokenNgramIter<'a> {
    type Item = String;

    fn next(&mut self) -> Option<Self::Item> {
        if self.n == 0 {
            return None;
        }
        // `get` returns None exactly when the window would run past the
        // end of the slice; `concat` joins with no separator.
        let gram = self.tokens.get(self.pos..self.pos + self.n)?.concat();
        self.pos += 1;
        Some(gram)
    }

    /// Exact size is computable from the slice length; reporting it
    /// lets `collect` preallocate and makes `count` cheap.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let rem = self.remaining();
        (rem, Some(rem))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn char_bigrams_ascii() {
        let grams: Vec<&str> = char_ngrams("abcd", 2).collect();
        assert_eq!(grams, &["ab", "bc", "cd"]);
    }

    #[test]
    fn char_trigrams_thai() {
        // Thai letters are 3 bytes each in UTF-8; slicing must stay on
        // char boundaries, not byte offsets.
        let grams: Vec<&str> = char_ngrams("กขค", 3).collect();
        assert_eq!(grams, &["กขค"]);
    }

    #[test]
    fn char_bigrams_thai_multibyte() {
        let grams: Vec<&str> = char_ngrams("กขคง", 2).collect();
        assert_eq!(grams, &["กข", "ขค", "คง"]);
        // Every bigram must hold exactly 2 chars regardless of byte width.
        for g in &grams {
            assert_eq!(g.chars().count(), 2);
        }
    }

    #[test]
    fn char_ngrams_n_larger_than_text_is_empty() {
        let grams: Vec<&str> = char_ngrams("กข", 5).collect();
        assert!(grams.is_empty());
    }

    #[test]
    fn char_ngrams_n_zero_is_empty() {
        // n == 0 is defined to yield nothing rather than empty strings.
        let grams: Vec<&str> = char_ngrams("กขค", 0).collect();
        assert!(grams.is_empty());
    }

    #[test]
    fn char_ngrams_empty_text_is_empty() {
        let grams: Vec<&str> = char_ngrams("", 2).collect();
        assert!(grams.is_empty());
    }

    #[test]
    fn char_ngrams_n_equals_len_yields_one() {
        // Boundary case: n == char count yields the whole string once.
        let grams: Vec<&str> = char_ngrams("กขค", 3).collect();
        assert_eq!(grams.len(), 1);
        assert_eq!(grams[0], "กขค");
    }

    #[test]
    fn char_ngrams_swasadee_bigrams() {
        // "สวัสดี" has 6 chars (combining vowel marks count as chars),
        // so there are 5 bigrams.
        let grams: Vec<&str> = char_ngrams("สวัสดี", 2).collect();
        assert_eq!(grams.len(), 5);
        assert!(grams.contains(&"สว"));
    }

    #[test]
    fn token_bigrams_basic() {
        // Token n-grams concatenate with no separator (Thai is unspaced).
        let tokens = &["กิน", "ข้าว", "กับ", "ปลา"];
        let bigrams: Vec<String> = token_ngrams(tokens, 2).collect();
        assert_eq!(bigrams, &["กินข้าว", "ข้าวกับ", "กับปลา"]);
    }

    #[test]
    fn token_trigrams_basic() {
        let tokens = &["กิน", "ข้าว", "กับ", "ปลา"];
        let trigrams: Vec<String> = token_ngrams(tokens, 3).collect();
        assert_eq!(trigrams, &["กินข้าวกับ", "ข้าวกับปลา"]);
    }

    #[test]
    fn token_ngrams_n_larger_than_count_is_empty() {
        let tokens = &["กิน", "ข้าว"];
        let grams: Vec<String> = token_ngrams(tokens, 5).collect();
        assert!(grams.is_empty());
    }

    #[test]
    fn token_ngrams_n_zero_is_empty() {
        let tokens = &["กิน", "ข้าว"];
        let grams: Vec<String> = token_ngrams(tokens, 0).collect();
        assert!(grams.is_empty());
    }

    #[test]
    fn token_ngrams_empty_tokens_is_empty() {
        let tokens: &[&str] = &[];
        let grams: Vec<String> = token_ngrams(tokens, 2).collect();
        assert!(grams.is_empty());
    }

    #[test]
    fn token_unigrams_yield_each_token() {
        // n == 1 degenerates to an owned copy of each token.
        let tokens = &["กิน", "ข้าว", "ปลา"];
        let unigrams: Vec<String> = token_ngrams(tokens, 1).collect();
        assert_eq!(
            unigrams,
            &[String::from("กิน"), String::from("ข้าว"), String::from("ปลา")]
        );
    }
}