use std::cmp;
use std::collections::{binary_heap, BinaryHeap};
use std::convert::TryInto;
use std::fmt;
use std::fs::File;
use std::io::{self, Write};
use std::path::Path;
use std::str::{self, FromStr};
use std::time::Instant;
use fnv::FnvHashMap;
use fst;
use memmap::Mmap;
use serde::{Deserialize, Serialize};
use serde_json;
use crate::error::{Error, Result};
use crate::index::writer::CursorWriter;
use crate::scored::{Scored, SearchResults};
use crate::util::{
fst_map_builder_file, fst_map_file, mmap_file, open_file, NiceDuration,
};
// File names of the individual on-disk components of the name index.
const CONFIG: &str = "names.config.json";
const NGRAM: &str = "names.ngram.fst";
const POSTINGS: &str = "names.postings.idx";
const IDMAP: &str = "names.idmap.idx";
const NORMS: &str = "names.norms.idx";

/// A caller-supplied identifier for an indexed name.
pub type NameID = u64;

/// An internal, densely allocated document id.
type DocID = u32;

/// Doc ids are packed into the low 28 bits of a posting entry (the high
/// 4 bits hold the term frequency), so they must fit in 28 bits.
const MAX_DOC_ID: DocID = (1 << 28) - 1;
/// A query against the name index.
///
/// Built with [`NameQuery::new`] and refined with the `with_*` builders.
#[derive(Clone, Debug)]
pub struct NameQuery {
    // The raw query string; normalized when the search runs.
    name: String,
    // Maximum number of results to return.
    size: usize,
    // Ranking function to use.
    scorer: NameScorer,
    // Threshold (document frequency / corpus size) at or above which a
    // term is treated as a high-frequency "stop word".
    stop_word_ratio: f64,
}
impl NameQuery {
    /// Creates a query for `name` with defaults: at most 30 results, the
    /// default scorer and a stop-word ratio of 1%.
    pub fn new(name: &str) -> NameQuery {
        NameQuery {
            name: name.to_string(),
            size: 30,
            scorer: NameScorer::default(),
            stop_word_ratio: 0.01,
        }
    }

    /// Returns this query configured to yield at most `size` results.
    pub fn with_size(self, size: usize) -> NameQuery {
        let mut query = self;
        query.size = size;
        query
    }

    /// Returns this query configured to rank results with `scorer`.
    pub fn with_scorer(self, scorer: NameScorer) -> NameQuery {
        let mut query = self;
        query.scorer = scorer;
        query
    }

    /// Returns this query with the given stop-word ratio. A term whose
    /// document frequency divided by the corpus size meets or exceeds
    /// this ratio is treated as a high-frequency term.
    pub fn with_stop_word_ratio(self, ratio: f64) -> NameQuery {
        let mut query = self;
        query.stop_word_ratio = ratio;
        query
    }
}
/// A searchable, memory-mapped name index, as written by `IndexWriter`.
#[derive(Debug)]
pub struct IndexReader {
    config: Config,
    // term -> byte offset of its posting list within `postings`.
    ngram: fst::Map<Mmap>,
    // All posting lists, each a u32 length prefix plus packed u32 entries.
    postings: Mmap,
    // docid -> NameID (flat array of little-endian u64).
    idmap: Mmap,
    // docid -> document length in ngrams (flat array of little-endian u16).
    norms: Mmap,
}
/// Index-wide settings and statistics, persisted as JSON.
#[derive(Debug, Deserialize, Serialize)]
struct Config {
    ngram_type: NgramType,
    ngram_size: usize,
    // Mean number of ngrams per indexed name (used by BM25 normalization).
    avg_document_len: f64,
    num_documents: u64,
}
impl IndexReader {
    /// Opens the name index rooted at directory `dir`.
    ///
    /// # Errors
    ///
    /// Returns an error if any component file is missing/unreadable or if
    /// the JSON config cannot be parsed.
    pub fn open<P: AsRef<Path>>(dir: P) -> Result<IndexReader> {
        let dir = dir.as_ref();
        // SAFETY: the memory maps are only sound if the underlying files
        // are not mutated or truncated while mapped; the unsafety contract
        // is inherited from the `fst_map_file`/`mmap_file` helpers.
        let ngram = unsafe { fst_map_file(dir.join(NGRAM))? };
        let postings = unsafe { mmap_file(dir.join(POSTINGS))? };
        let idmap = unsafe { mmap_file(dir.join(IDMAP))? };
        let norms = unsafe { mmap_file(dir.join(NORMS))? };
        let config_file = open_file(dir.join(CONFIG))?;
        let config: Config = serde_json::from_reader(config_file)
            .map_err(|e| Error::config(e.to_string()))?;
        Ok(IndexReader { config, ngram, postings, idmap, norms })
    }

    /// Executes `query` against this index and returns ranked results.
    pub fn search(&self, query: &NameQuery) -> SearchResults<NameID> {
        let start = Instant::now();
        let mut searcher = Searcher::new(self, query);
        let results = CollectTopK::new(query.size).collect(&mut searcher);
        log::debug!(
            "search for {:?} took {}",
            query,
            NiceDuration::since(start)
        );
        results
    }

    /// Maps an internal doc id to its caller-supplied name id. The idmap
    /// file is a flat array of little-endian u64s indexed by doc id.
    fn docid_to_nameid(&self, docid: DocID) -> NameID {
        let start = 8 * (docid as usize);
        let buf = self.idmap[start..start + 8].try_into().unwrap();
        u64::from_le_bytes(buf)
    }

    /// Returns the length (in ngrams) recorded for `docid`. The norms
    /// file is a flat array of little-endian u16s indexed by doc id.
    fn document_length(&self, docid: DocID) -> u64 {
        let start = 2 * (docid as usize);
        let buf = self.norms[start..start + 2].try_into().unwrap();
        u16::from_le_bytes(buf) as u64
    }
}
/// Accumulates the top `k` scored names produced by a search.
struct CollectTopK {
    k: usize,
    // Min-heap (via `Reverse`) whose root is the worst retained result.
    queue: BinaryHeap<cmp::Reverse<Scored<NameID>>>,
    // Names currently retained, with the best score seen for each; used
    // to deduplicate doc ids that map to the same name id.
    byid: FnvHashMap<NameID, f64>,
}
impl CollectTopK {
    /// Creates a collector that retains the best `k` results.
    fn new(k: usize) -> CollectTopK {
        CollectTopK {
            k,
            queue: BinaryHeap::with_capacity(k),
            byid: FnvHashMap::default(),
        }
    }

    /// Drains `searcher`, keeping the `k` best-scoring names.
    fn collect(mut self, searcher: &mut Searcher) -> SearchResults<NameID> {
        if self.k == 0 {
            return SearchResults::new();
        }
        let index = searcher.index();
        let (mut count, mut push_count) = (0, 0);
        for scored_with_docid in searcher {
            count += 1;
            // Translate the internal doc id into the caller's name id.
            let scored = scored_with_docid.map(|v| index.docid_to_nameid(v));
            if let Some(&score) = self.byid.get(scored.value()) {
                // This name is already retained. NOTE(review): a better
                // score here updates `byid` but not the copy already in
                // `queue`, so the final ranking keeps the first queued
                // score for this name — confirm this is intended.
                if scored.score() > score {
                    self.byid.insert(*scored.value(), scored.score());
                }
                continue;
            }
            // Push if there is room; otherwise evict the current worst
            // result when this one beats it.
            let mut dopush = self.queue.len() < self.k;
            if !dopush {
                let worst = self.queue.peek_mut().unwrap();
                if worst.0 < scored {
                    self.byid.remove(worst.0.value());
                    binary_heap::PeekMut::pop(worst);
                    dopush = true;
                }
            }
            if dopush {
                push_count += 1;
                self.byid.insert(*scored.value(), scored.score());
                self.queue.push(cmp::Reverse(scored));
            }
        }
        log::debug!(
            "collect count: {:?}, collect push count: {:?}",
            count,
            push_count
        );
        let mut results = SearchResults::from_min_heap(&mut self.queue);
        // Rescale scores for presentation (see `SearchResults::normalize`).
        results.normalize();
        results
    }
}
/// Streams scored doc ids for a query, in ascending doc id order.
struct Searcher<'i> {
    index: &'i IndexReader,
    // Disjunction over the low-frequency terms (or over all terms when
    // none are low frequency); it alone decides which documents match.
    primary: Disjunction<'i>,
    // Disjunction over high-frequency ("stop word") terms; consulted only
    // to boost documents the primary disjunction already matched.
    high: Disjunction<'i>,
}
impl<'i> Searcher<'i> {
    /// Prepares a search by splitting the query's ngrams into low and
    /// high frequency ("stop word") groups.
    ///
    /// Low-frequency terms drive the search (the `primary` disjunction);
    /// high-frequency terms only add to the score of documents the
    /// primary terms matched (see `Iterator for Searcher`). When every
    /// term is high frequency, the high group becomes the primary.
    fn new(idx: &'i IndexReader, query: &NameQuery) -> Searcher<'i> {
        let num_docs = idx.config.num_documents as f64;
        let (mut low, mut high) = (vec![], vec![]);
        let (mut low_terms, mut high_terms) = (vec![], vec![]);
        let name = normalize_query(&query.name);
        // Total number of ngrams in the query, duplicates included.
        let mut query_len = 0;
        // Multiset of query ngrams: term -> occurrence count.
        let mut multiset = FnvHashMap::default();
        idx.config.ngram_type.iter(idx.config.ngram_size, &name, |term| {
            *multiset.entry(term).or_insert(0) += 1;
            query_len += 1;
        });
        for (term, &count) in multiset.iter() {
            let postings = PostingIter::new(idx, query.scorer, count, term);
            // Fraction of all documents containing this term.
            let ratio = (postings.len() as f64) / num_docs;
            if ratio < query.stop_word_ratio {
                low.push(postings);
                low_terms.push(format!("{}:{}:{:0.6}", term, count, ratio));
            } else {
                high.push(postings);
                high_terms.push(format!("{}:{}:{:0.6}", term, count, ratio));
            }
        }
        log::debug!("starting search for: {:?}", name);
        log::debug!("{:?} low frequency terms: {:?}", low.len(), low_terms);
        log::debug!("{:?} high frequency terms: {:?}", high.len(), high_terms);
        if low.is_empty() {
            Searcher {
                index: idx,
                primary: Disjunction::new(idx, query_len, query.scorer, high),
                high: Disjunction::empty(idx, query.scorer),
            }
        } else {
            Searcher {
                index: idx,
                primary: Disjunction::new(idx, query_len, query.scorer, low),
                high: Disjunction::new(idx, query_len, query.scorer, high),
            }
        }
    }

    /// The index this searcher reads from.
    fn index(&self) -> &'i IndexReader {
        self.index
    }
}
impl<'i> Iterator for Searcher<'i> {
    type Item = Scored<DocID>;

    /// Yields the next matching document, driven by the primary
    /// disjunction. High-frequency terms contribute to the score only
    /// when they also contain the matched document.
    fn next(&mut self) -> Option<Scored<DocID>> {
        let scored = self.primary.next()?;
        match self.high.skip_to(*scored.value()) {
            None => Some(scored),
            Some(extra) => Some(scored.map_score(|s| s + extra.score())),
        }
    }
}
/// A document-at-a-time union ("OR") over several posting iterators.
struct Disjunction<'i> {
    index: &'i IndexReader,
    // Total query length in ngrams (used by Jaccard/QueryRatio scoring).
    query_len: f64,
    scorer: NameScorer,
    // Posting iterators keyed by current doc id; the smallest doc id is
    // on top because `PostingIter`'s `Ord` is reversed.
    queue: BinaryHeap<PostingIter<'i>>,
    is_done: bool,
}
impl<'i> Disjunction<'i> {
    /// Creates a disjunction over the given posting iterators.
    fn new(
        index: &'i IndexReader,
        query_len: usize,
        scorer: NameScorer,
        posting_iters: Vec<PostingIter<'i>>,
    ) -> Disjunction<'i> {
        let mut queue = BinaryHeap::new();
        for postings in posting_iters {
            queue.push(postings);
        }
        // With no terms at all, there is nothing to yield.
        let is_done = queue.is_empty();
        let query_len = query_len as f64;
        Disjunction { index, query_len, scorer, queue, is_done }
    }

    /// A disjunction that yields nothing.
    fn empty(index: &'i IndexReader, scorer: NameScorer) -> Disjunction<'i> {
        Disjunction {
            index,
            query_len: 0.0,
            scorer,
            queue: BinaryHeap::new(),
            is_done: true,
        }
    }

    /// Advances every underlying iterator to `target_docid` and, if any
    /// of them contains that document, returns its score; otherwise
    /// returns `None`. Consumes `target_docid` from this disjunction.
    fn skip_to(&mut self, target_docid: DocID) -> Option<Scored<DocID>> {
        if self.is_done {
            return None;
        }
        let mut found = false;
        loop {
            // `peek_mut` restores heap order when the guard drops, so each
            // pass sees the iterator with the smallest current doc id.
            let mut postings = self.queue.peek_mut().unwrap();
            // An exhausted iterator reports no doc id (treated as past the
            // target). Once the minimum is at or past the target, stop.
            if postings.docid().map_or(true, |x| x >= target_docid) {
                found = found || Some(target_docid) == postings.docid();
                break;
            }
            while postings.docid().map_or(false, |x| x < target_docid) {
                postings.next();
            }
            found = found || Some(target_docid) == postings.docid();
        }
        if !found {
            return None;
        }
        // All iterators are now at or past the target and at least one is
        // exactly on it, so `next` scores precisely `target_docid`.
        self.next()
    }
}
impl<'i> Iterator for Disjunction<'i> {
    type Item = Scored<DocID>;

    /// Scores and returns the smallest unconsumed doc id across all
    /// posting iterators, summing the contribution of every iterator
    /// positioned on that document.
    fn next(&mut self) -> Option<Scored<DocID>> {
        if self.is_done {
            return None;
        }
        // Score the front (minimum doc id) iterator and advance it. The
        // `PeekMut` guard re-establishes heap order when it drops at the
        // end of this block.
        let mut scored1 = {
            let mut postings = self.queue.peek_mut().unwrap();
            match postings.score() {
                None => {
                    // The minimum is exhausted, hence all iterators are.
                    self.is_done = true;
                    return None;
                }
                Some(scored) => {
                    postings.next();
                    scored
                }
            }
        };
        // Fold in every other iterator sitting on the same doc id.
        loop {
            let mut postings = self.queue.peek_mut().unwrap();
            match postings.score() {
                None => break,
                Some(scored2) => {
                    if scored1.value() != scored2.value() {
                        break;
                    }
                    scored1 = scored1.map_score(|s| s + scored2.score());
                    postings.next();
                }
            }
        }
        if let NameScorer::Jaccard = self.scorer {
            // Jaccard: |A ∩ B| / |A ∪ B|, where the accumulated score is
            // the intersection size and |A ∪ B| = |A| + |B| - |A ∩ B|.
            let doc_len = self.index.document_length(*scored1.value()) as f64;
            let union = self.query_len + doc_len - scored1.score();
            scored1 = scored1.map_score(|s| s / union);
        } else if let NameScorer::QueryRatio = self.scorer {
            // Fraction of the query's ngrams found in the document.
            scored1 = scored1.map_score(|s| s / self.query_len)
        }
        Some(scored1)
    }
}
/// A cursor over a single term's posting list, ordered by doc id.
#[derive(Clone)]
struct PostingIter<'i> {
    index: &'i IndexReader,
    scorer: NameScorer,
    // Number of times the term occurs in the query; scores are scaled
    // by this multiplicity.
    count: f64,
    // Remaining undecoded postings (4 bytes each).
    postings: &'i [u8],
    // Total number of postings for this term (its document frequency).
    len: usize,
    // The posting the cursor is currently positioned on, if any.
    posting: Option<Posting>,
    // Current doc id, used for heap ordering; MAX_DOC_ID + 1 once
    // exhausted so finished iterators sort last.
    docid: DocID,
    // Precomputed BM25 inverse document frequency for this term.
    okapi_idf: f64,
}
/// A single (document, term frequency) entry in a posting list.
#[derive(Clone, Copy, Debug)]
struct Posting {
    docid: DocID,
    // Term frequency, capped at 15 by the writer (stored in 4 bits).
    frequency: u32,
}
impl Posting {
    /// Decodes the next posting from `slice`, or returns `None` when the
    /// posting list is exhausted. Each posting is a little-endian `u32`
    /// with the doc id in the low 28 bits and the frequency in the high 4.
    fn read(slice: &[u8]) -> Option<Posting> {
        if slice.is_empty() {
            return None;
        }
        let v = read_le_u32(slice);
        Some(Posting { docid: v & MAX_DOC_ID, frequency: v >> 28 })
    }
}
impl<'i> PostingIter<'i> {
    /// Creates a posting iterator for `term`, where `count` is the number
    /// of times the term occurs in the query.
    ///
    /// A term absent from the ngram map yields an empty, already
    /// exhausted iterator.
    fn new(
        index: &'i IndexReader,
        scorer: NameScorer,
        count: usize,
        term: &str,
    ) -> PostingIter<'i> {
        let mut postings = &*index.postings;
        // The ngram FST maps each term to the byte offset of its posting
        // list within the postings file.
        let offset = match index.ngram.get(term.as_bytes()) {
            Some(offset) => offset as usize,
            None => {
                return PostingIter {
                    index,
                    scorer,
                    count: 0.0,
                    postings: &[],
                    len: 0,
                    posting: None,
                    // Sentinel past every valid doc id, matching the
                    // exhausted state set by `next`.
                    docid: MAX_DOC_ID + 1,
                    okapi_idf: 0.0,
                };
            }
        };
        postings = &postings[offset..];
        // Each posting list is a u32 length prefix followed by `len`
        // packed u32 postings.
        let len = read_le_u32(postings) as usize;
        postings = &postings[4..];
        let corpus_count = index.config.num_documents as f64;
        let df = len as f64;
        // BM25 inverse document frequency; the +1 inside the log keeps
        // the value non-negative even for very common terms.
        let okapi_idf = (1.0 + (corpus_count - df + 0.5) / (df + 0.5)).log2();
        let mut it = PostingIter {
            index,
            scorer,
            count: count as f64,
            // Restrict the slice to exactly this term's list so iteration
            // stops at its end.
            postings: &postings[..4 * len],
            len,
            posting: None,
            // Placeholder; immediately overwritten by the `next` below.
            docid: 0,
            okapi_idf,
        };
        it.next();
        it
    }

    /// The posting currently under the cursor, if any.
    fn posting(&self) -> Option<Posting> {
        self.posting
    }

    /// The term's document frequency (total postings in the list).
    fn len(&self) -> usize {
        self.len
    }

    /// The doc id currently under the cursor, if any.
    fn docid(&self) -> Option<DocID> {
        self.posting().map(|p| p.docid)
    }

    /// Scores the current posting with the configured scorer, scaled by
    /// the term's multiplicity in the query. `None` once exhausted.
    fn score(&self) -> Option<Scored<DocID>> {
        match self.scorer {
            NameScorer::OkapiBM25 => self.score_okapibm25(),
            NameScorer::TFIDF => self.score_tfidf(),
            NameScorer::Jaccard => self.score_jaccard(),
            NameScorer::QueryRatio => self.score_query_ratio(),
        }
        .map(|scored| scored.map_score(|s| s * self.count))
    }

    /// Okapi BM25 for the current posting (k1 = 1.2, b = 0.75), clamped
    /// at zero so a term can never subtract from a document's total.
    fn score_okapibm25(&self) -> Option<Scored<DocID>> {
        let post = match self.posting() {
            None => return None,
            Some(post) => post,
        };
        let k1 = 1.2;
        let b = 0.75;
        let doc_len = self.index.document_length(post.docid);
        // Document length relative to the corpus average.
        let norm = (doc_len as f64) / self.index.config.avg_document_len;
        let tf = post.frequency as f64;
        let num = tf * (k1 + 1.0);
        let den = tf + k1 * (1.0 - b + b * norm);
        let score = (num / den) * self.okapi_idf;
        let capped = if score < 0.0 { 0.0 } else { score };
        Some(Scored::new(post.docid).with_score(capped))
    }

    /// Classic tf-idf: tf * log2(N / (1 + df)).
    fn score_tfidf(&self) -> Option<Scored<DocID>> {
        let post = match self.posting() {
            None => return None,
            Some(post) => post,
        };
        let corpus_docs = self.index.config.num_documents as f64;
        let term_docs = self.len as f64;
        let tf = post.frequency as f64;
        let idf = (corpus_docs / (1.0 + term_docs)).log2();
        let score = tf * idf;
        Some(Scored::new(post.docid).with_score(score))
    }

    /// Each matching term contributes 1 (times its query multiplicity,
    /// applied in `score`); `Disjunction::next` turns the accumulated
    /// intersection size into a Jaccard ratio.
    fn score_jaccard(&self) -> Option<Scored<DocID>> {
        self.posting().map(|p| Scored::new(p.docid).with_score(1.0))
    }

    /// As with Jaccard, contributes 1 per match; `Disjunction::next`
    /// divides by the query length.
    fn score_query_ratio(&self) -> Option<Scored<DocID>> {
        self.posting().map(|p| Scored::new(p.docid).with_score(1.0))
    }
}
impl<'i> Iterator for PostingIter<'i> {
    type Item = Posting;

    /// Decodes the next posting, keeping `posting` and `docid` in sync.
    /// Once the byte slice is exhausted, `docid` is parked one past the
    /// maximum so this iterator sorts as finished in the heap.
    fn next(&mut self) -> Option<Posting> {
        if let Some(p) = Posting::read(self.postings) {
            self.postings = &self.postings[4..];
            self.docid = p.docid;
            self.posting = Some(p);
        } else {
            self.docid = MAX_DOC_ID + 1;
            self.posting = None;
        }
        self.posting
    }
}
// `PostingIter`s are compared by their *current* doc id only, with the
// ordering reversed so that `BinaryHeap` (a max-heap) behaves as a
// min-heap: the iterator positioned on the smallest doc id surfaces
// first. Equality deliberately matches `Ord` (doc id only).
impl<'i> Eq for PostingIter<'i> {}
impl<'i> PartialEq for PostingIter<'i> {
    fn eq(&self, other: &PostingIter<'i>) -> bool {
        self.docid == other.docid
    }
}
impl<'i> Ord for PostingIter<'i> {
    fn cmp(&self, other: &PostingIter<'i>) -> cmp::Ordering {
        // Reversed on purpose; see the comment above.
        self.docid.cmp(&other.docid).reverse()
    }
}
impl<'i> PartialOrd for PostingIter<'i> {
    fn partial_cmp(&self, other: &PostingIter<'i>) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}
/// Writes a new name index to disk; finalized by `finish`.
pub struct IndexWriter {
    // Builder for the term -> posting-list-offset FST map.
    ngram: fst::MapBuilder<io::BufWriter<File>>,
    ngram_type: NgramType,
    ngram_size: usize,
    postings: CursorWriter<io::BufWriter<File>>,
    // docid -> NameID mapping (flat array of little-endian u64).
    idmap: CursorWriter<io::BufWriter<File>>,
    // docid -> document length in ngrams (flat array of little-endian u16).
    norms: CursorWriter<io::BufWriter<File>>,
    config: CursorWriter<io::BufWriter<File>>,
    // In-memory posting lists keyed by ngram; written out in `finish`.
    terms: FnvHashMap<String, Postings>,
    next_docid: DocID,
    // Running mean of document lengths, in ngrams.
    avg_document_len: f64,
}
/// The in-memory posting list for a single term. Entries are appended in
/// insertion order, which is ascending doc id order because doc ids are
/// allocated sequentially by the writer.
#[derive(Clone, Debug, Default)]
struct Postings {
    list: Vec<Posting>,
}
impl IndexWriter {
    /// Creates a writer that builds a name index in directory `dir`,
    /// extracting ngrams with the given type and size.
    ///
    /// # Errors
    ///
    /// Returns an error if any of the index component files cannot be
    /// created.
    pub fn open<P: AsRef<Path>>(
        dir: P,
        ngram_type: NgramType,
        ngram_size: usize,
    ) -> Result<IndexWriter> {
        let dir = dir.as_ref();
        let ngram = fst_map_builder_file(dir.join(NGRAM))?;
        let postings = CursorWriter::from_path(dir.join(POSTINGS))?;
        let idmap = CursorWriter::from_path(dir.join(IDMAP))?;
        let norms = CursorWriter::from_path(dir.join(NORMS))?;
        let config = CursorWriter::from_path(dir.join(CONFIG))?;
        Ok(IndexWriter {
            ngram,
            ngram_type,
            ngram_size,
            postings,
            idmap,
            norms,
            config,
            terms: FnvHashMap::default(),
            next_docid: 0,
            avg_document_len: 0.0,
        })
    }

    /// Writes all buffered state to disk and finalizes the index.
    ///
    /// Terms are sorted first because FST construction requires keys in
    /// lexicographic order; each term maps to the byte offset of its
    /// posting list in the postings file.
    pub fn finish(mut self) -> Result<()> {
        let num_docs = self.num_docs();
        let mut ngram_to_postings: Vec<(String, Postings)> =
            self.terms.into_iter().collect();
        ngram_to_postings.sort_by(|t1, t2| t1.0.cmp(&t2.0));
        for (term, postings) in ngram_to_postings {
            let pos = self.postings.position() as u64;
            self.ngram.insert(term.as_bytes(), pos).map_err(Error::fst)?;
            // Each posting list is a u32 length prefix followed by one
            // packed u32 per posting.
            self.postings
                .write_u32(postings.list.len() as u32)
                .map_err(Error::io)?;
            for posting in postings.list {
                // Frequencies are saturated at 15 so they fit in the high
                // 4 bits; the doc id occupies the low 28 bits.
                let freq = cmp::min(15, posting.frequency);
                let v = (freq << 28) | posting.docid;
                self.postings.write_u32(v).map_err(Error::io)?;
            }
        }
        serde_json::to_writer_pretty(
            &mut self.config,
            &Config {
                ngram_type: self.ngram_type,
                ngram_size: self.ngram_size,
                avg_document_len: self.avg_document_len,
                num_documents: num_docs as u64,
            },
        )
        .map_err(|e| Error::config(e.to_string()))?;
        self.ngram.finish().map_err(Error::fst)?;
        self.idmap.flush().map_err(Error::io)?;
        self.postings.flush().map_err(Error::io)?;
        self.norms.flush().map_err(Error::io)?;
        self.config.flush().map_err(Error::io)?;
        Ok(())
    }

    /// Indexes `name` under the caller's identifier `name_id`.
    ///
    /// Ngrams are extracted from the normalized (lowercased) name, and
    /// the total ngram count is recorded as the document's length.
    pub fn insert(&mut self, name_id: NameID, name: &str) -> Result<()> {
        let docid = self.next_docid(name_id)?;
        let name = normalize_query(name);
        // Copy these `Copy` fields out of `self` so the closure below can
        // borrow `self` mutably while ngrams are generated.
        let (ngram_type, ngram_size) = (self.ngram_type, self.ngram_size);
        let mut count = 0u16;
        ngram_type.iter(ngram_size, &name, |ngram| {
            self.insert_term(docid, ngram);
            count = count.saturating_add(1);
        });
        // Incremental mean; `num_docs()` already includes this document
        // because `next_docid` bumped the counter above.
        self.avg_document_len +=
            (count as f64 - self.avg_document_len) / (self.num_docs() as f64);
        self.norms.write_u16(count).map_err(Error::io)?;
        Ok(())
    }

    /// Records one occurrence of `term` in document `docid`.
    fn insert_term(&mut self, docid: DocID, term: &str) {
        // Probe with the borrowed `&str` first so an owned `String` key is
        // only allocated the first time a term is seen.
        if let Some(posts) = self.terms.get_mut(term) {
            posts.posting(docid).frequency += 1;
            return;
        }
        let mut list = Postings::default();
        list.posting(docid).frequency = 1;
        self.terms.insert(term.to_string(), list);
    }

    /// Allocates the next doc id and records the `name_id` it maps to.
    fn next_docid(&mut self, name_id: NameID) -> Result<DocID> {
        let docid = self.next_docid;
        self.idmap.write_u64(name_id).map_err(Error::io)?;
        self.next_docid = match self.next_docid.checked_add(1) {
            None => bug!("exhausted doc ids"),
            Some(next_docid) => next_docid,
        };
        // NOTE(review): this rejects the insert that was assigned
        // MAX_DOC_ID itself (a representable id), i.e. it errors one
        // insert early — confirm whether that conservatism is intended.
        if self.next_docid > MAX_DOC_ID {
            let max = MAX_DOC_ID + 1;
            bug!("exceeded maximum number of names ({})", max);
        }
        Ok(docid)
    }

    /// Number of documents inserted so far.
    fn num_docs(&self) -> u32 {
        self.next_docid
    }
}
impl Postings {
    /// Returns a mutable handle to the posting for `docid`, appending a
    /// fresh zero-frequency posting when the list's last entry is for a
    /// different document (doc ids only ever grow, so checking the tail
    /// suffices).
    fn posting(&mut self, docid: DocID) -> &mut Posting {
        match self.list.last() {
            Some(last) if last.docid == docid => {}
            _ => self.list.push(Posting { docid, frequency: 0 }),
        }
        self.list.last_mut().unwrap()
    }
}
/// The ranking function used to score name matches.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum NameScorer {
    /// Okapi BM25 (the default): tf-idf with document-length saturation.
    OkapiBM25,
    /// Classic tf-idf weighting.
    TFIDF,
    /// Multiset Jaccard similarity between query and document ngrams.
    Jaccard,
    /// Fraction of the query's ngrams found in the document.
    QueryRatio,
}
impl NameScorer {
    /// All recognized scorer names, as accepted by `FromStr`.
    pub fn possible_names() -> &'static [&'static str] {
        &["okapibm25", "tfidf", "jaccard", "queryratio"]
    }

    /// The canonical string form of this scorer (inverse of `FromStr`).
    pub fn as_str(&self) -> &'static str {
        match *self {
            NameScorer::OkapiBM25 => "okapibm25",
            NameScorer::TFIDF => "tfidf",
            NameScorer::Jaccard => "jaccard",
            NameScorer::QueryRatio => "queryratio",
        }
    }
}
impl Default for NameScorer {
    /// Okapi BM25 is the default ranking function.
    fn default() -> NameScorer {
        NameScorer::OkapiBM25
    }
}
impl fmt::Display for NameScorer {
    // Displays the canonical name, e.g. "okapibm25".
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
impl FromStr for NameScorer {
    type Err = Error;

    /// Parses a scorer from one of the names in `possible_names`.
    fn from_str(s: &str) -> Result<NameScorer> {
        match s {
            "okapibm25" => Ok(NameScorer::OkapiBM25),
            "tfidf" => Ok(NameScorer::TFIDF),
            "jaccard" => Ok(NameScorer::Jaccard),
            "queryratio" => Ok(NameScorer::QueryRatio),
            unk => Err(Error::unknown_scorer(unk)),
        }
    }
}
/// The strategy used to extract ngrams from a (normalized) name.
#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub enum NgramType {
    /// A fixed-size sliding window over the entire name, spaces included.
    #[serde(rename = "window")]
    Window,
    /// Prefix ("edge") ngrams of each whitespace-separated word.
    #[serde(rename = "edge")]
    Edge,
}

/// The smallest prefix length emitted by edge ngrams.
const MIN_EDGE_NGRAM_SIZE: usize = 3;
impl NgramType {
    /// All recognized ngram-type names, as accepted by `FromStr`.
    pub fn possible_names() -> &'static [&'static str] {
        &["window", "edge"]
    }

    /// The canonical string form of this ngram type.
    pub fn as_str(&self) -> &'static str {
        match *self {
            NgramType::Window => "window",
            NgramType::Edge => "edge",
        }
    }

    /// Calls `f` with each ngram of `text` under this strategy. The
    /// ngrams are borrowed slices of `text` itself.
    fn iter<'t, F: FnMut(&'t str)>(&self, size: usize, text: &'t str, f: F) {
        match *self {
            NgramType::Window => NgramType::iter_window(size, text, f),
            NgramType::Edge => NgramType::iter_edge(size, text, f),
        }
    }

    /// Sliding window of `size` characters over all of `text`. If `text`
    /// has fewer than `size` characters, a single (shorter) ngram covering
    /// all of `text` is emitted. `size == 0` emits nothing.
    fn iter_window<'t, F: FnMut(&'t str)>(
        size: usize,
        text: &'t str,
        mut f: F,
    ) {
        if size == 0 {
            return;
        }
        // Char offset of the last char in the first window; saturates so
        // that text shorter than `size` still yields one window.
        let end_skip = text.chars().take(size).count().saturating_sub(1);
        let start = text.char_indices();
        let end = text.char_indices().skip(end_skip);
        // Pair each window's starting byte offset with its final char so
        // that slicing always lands on UTF-8 boundaries.
        for ((s, _), (e, c)) in start.zip(end) {
            f(&text[s..e + c.len_utf8()]);
        }
    }

    /// Prefix ngrams for each whitespace-separated word of `text`: every
    /// prefix from MIN_EDGE_NGRAM_SIZE up to `max_size` characters. A word
    /// shorter than the minimum emits the whole word once. Note that when
    /// `max_size < MIN_EDGE_NGRAM_SIZE`, the first (minimum-size) prefix
    /// is still emitted before the size check applies.
    fn iter_edge<'t, F: FnMut(&'t str)>(
        max_size: usize,
        text: &'t str,
        mut f: F,
    ) {
        if max_size == 0 {
            return;
        }
        for word in text.split_whitespace() {
            // Char offset of the last char of the shortest prefix.
            let end_skip = word
                .chars()
                .take(MIN_EDGE_NGRAM_SIZE)
                .count()
                .saturating_sub(1);
            // `size` is the char length of the prefix about to be emitted.
            let mut size = end_skip + 1;
            for (end, c) in word.char_indices().skip(end_skip) {
                f(&word[..end + c.len_utf8()]);
                size += 1;
                if size > max_size {
                    break;
                }
            }
        }
    }
}
impl Default for NgramType {
    /// Sliding-window ngrams are the default strategy.
    fn default() -> NgramType {
        NgramType::Window
    }
}
impl fmt::Display for NgramType {
    // Displays the canonical name, e.g. "window".
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
impl FromStr for NgramType {
    type Err = Error;

    /// Parses an ngram type from one of the names in `possible_names`.
    fn from_str(s: &str) -> Result<NgramType> {
        match s {
            "window" => Ok(NgramType::Window),
            "edge" => Ok(NgramType::Edge),
            unk => Err(Error::unknown_ngram_type(unk)),
        }
    }
}
/// Normalizes a name (at index time) and a query (at search time) into a
/// common form so the two sides agree. Currently this is just
/// Unicode-aware lowercasing.
fn normalize_query(s: &str) -> String {
    s.to_lowercase()
}
/// Decodes a little-endian `u32` from the first four bytes of `slice`.
///
/// Panics if `slice` holds fewer than four bytes; callers are responsible
/// for supplying a sufficiently long buffer.
fn read_le_u32(slice: &[u8]) -> u32 {
    let bytes: [u8; 4] = slice[..4].try_into().unwrap();
    u32::from_le_bytes(bytes)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::tests::TestContext;

    /// Builds a window-trigram index over `names` (document `i` gets name
    /// id `i`) and reopens it for reading.
    fn create_index(index_dir: &Path, names: &[&str]) -> IndexReader {
        let mut wtr =
            IndexWriter::open(index_dir, NgramType::Window, 3).unwrap();
        for (i, name) in names.iter().enumerate() {
            wtr.insert(i as u64, name).unwrap();
        }
        wtr.finish().unwrap();
        IndexReader::open(index_dir).unwrap()
    }

    /// A query with a stop-word ratio of 0, which classifies every term
    /// as "high frequency" and therefore searches all terms together in
    /// one disjunction.
    fn name_query(name: &str) -> NameQuery {
        NameQuery::new(name).with_stop_word_ratio(0.0)
    }

    /// Extracts the name ids from `results`, sorted ascending.
    fn ids(results: &[Scored<NameID>]) -> Vec<NameID> {
        let mut ids: Vec<_> = results.iter().map(|r| *r.value()).collect();
        ids.sort();
        ids
    }

    const BRUCES: &'static [&'static str] = &[
        "Bruce Springsteen",
        "Bruce Kulick",
        "Bruce Arians",
        "Bruce Smith",
        "Bruce Willis",
        "Bruce Wayne",
        "Bruce Banner",
    ];

    // Every name matches "bruce"; the two shortest names ("Bruce Smith"
    // and "Bruce Wayne", ids 3 and 5) tie for the best normalized score.
    #[test]
    fn names_bruces_1() {
        let ctx = TestContext::new("small");
        let idx = create_index(ctx.index_dir(), BRUCES);
        let query = name_query("bruce");
        let results = idx.search(&query).into_vec();
        assert_eq!(results.len(), 7);
        assert_eq!(results[0].score(), 1.0);
        assert_eq!(results[1].score(), 1.0);
        assert_eq!(ids(&results[0..2]), vec![3, 5]);
    }

    // The trigram "e w" occurs only in "Bruce Wayne" (id 5) and
    // "Bruce Willis" (id 4).
    #[test]
    fn names_bruces_2() {
        let ctx = TestContext::new("small");
        let idx = create_index(ctx.index_dir(), BRUCES);
        let query = name_query("e w");
        let results = idx.search(&query).into_vec();
        assert_eq!(results.len(), 2);
        assert_eq!(*results[0].value(), 5);
        assert_eq!(*results[1].value(), 4);
    }

    // A surname unique to a single indexed name matches exactly once.
    #[test]
    fn names_bruces_3() {
        let ctx = TestContext::new("small");
        let idx = create_index(ctx.index_dir(), BRUCES);
        let query = name_query("Springsteen");
        let results = idx.search(&query).into_vec();
        assert_eq!(results.len(), 1);
        assert_eq!(*results[0].value(), 0);
    }

    // A query containing every surname matches every indexed name.
    #[test]
    fn names_bruces_4() {
        let ctx = TestContext::new("small");
        let idx = create_index(ctx.index_dir(), BRUCES);
        let query =
            name_query("Springsteen Kulick Arians Smith Willis Wayne Banner");
        let results = idx.search(&query).into_vec();
        assert_eq!(results.len(), 7);
    }

    /// Collects the window ngrams of `text` into a vector.
    fn ngrams_window(n: usize, text: &str) -> Vec<&str> {
        let mut grams = vec![];
        NgramType::Window.iter(n, text, |gram| grams.push(gram));
        grams
    }

    /// Collects the edge ngrams of `text` into a vector.
    fn ngrams_edge(n: usize, text: &str) -> Vec<&str> {
        let mut grams = vec![];
        NgramType::Edge.iter(n, text, |gram| grams.push(gram));
        grams
    }

    // A window size of zero yields no ngrams at all, so this assertion
    // fails — which is exactly what `should_panic` demands.
    #[test]
    #[should_panic]
    fn ngrams_window_zero_banned() {
        assert_eq!(ngrams_window(0, "abc"), vec!["abc"]);
    }

    // Window sizes smaller than the usual trigram still behave sensibly,
    // including on text shorter than the window.
    #[test]
    fn ngrams_window_weird_sizes() {
        assert_eq!(
            ngrams_window(2, "abcdef"),
            vec!["ab", "bc", "cd", "de", "ef",]
        );
        assert_eq!(
            ngrams_window(1, "abcdef"),
            vec!["a", "b", "c", "d", "e", "f",]
        );
        assert_eq!(ngrams_window(2, "ab"), vec!["ab",]);
        assert_eq!(ngrams_window(1, "ab"), vec!["a", "b",]);
        assert_eq!(ngrams_window(1, "a"), vec!["a",]);
        assert_eq!(ngrams_window(1, ""), Vec::<&str>::new());
    }

    #[test]
    fn ngrams_window_ascii() {
        assert_eq!(
            ngrams_window(3, "abcdef"),
            vec!["abc", "bcd", "cde", "def",]
        );
        assert_eq!(ngrams_window(3, "abcde"), vec!["abc", "bcd", "cde",]);
        assert_eq!(ngrams_window(3, "abcd"), vec!["abc", "bcd",]);
        assert_eq!(ngrams_window(3, "abc"), vec!["abc",]);
        // Text shorter than the window emits a single short ngram.
        assert_eq!(ngrams_window(3, "ab"), vec!["ab",]);
        assert_eq!(ngrams_window(3, "a"), vec!["a",]);
        assert_eq!(ngrams_window(3, ""), Vec::<&str>::new());
    }

    // Multi-byte (Greek) input exercises UTF-8 boundary handling.
    #[test]
    fn ngrams_window_non_ascii() {
        assert_eq!(
            ngrams_window(3, "αβγφδε"),
            vec!["αβγ", "βγφ", "γφδ", "φδε",]
        );
        assert_eq!(ngrams_window(3, "αβγφδ"), vec!["αβγ", "βγφ", "γφδ",]);
        assert_eq!(ngrams_window(3, "αβγφ"), vec!["αβγ", "βγφ",]);
        assert_eq!(ngrams_window(3, "αβγ"), vec!["αβγ",]);
        assert_eq!(ngrams_window(3, "αβ"), vec!["αβ",]);
        assert_eq!(ngrams_window(3, "α"), vec!["α",]);
    }

    // Edge ngrams emit prefixes of each word, from MIN_EDGE_NGRAM_SIZE
    // up to the maximum; words shorter than the minimum emit the whole
    // word once.
    #[test]
    fn ngrams_edge_ascii() {
        assert_eq!(
            ngrams_edge(5, "homer simpson"),
            vec!["hom", "home", "homer", "sim", "simp", "simps",]
        );
        assert_eq!(ngrams_edge(5, "h"), vec!["h",]);
        assert_eq!(ngrams_edge(5, "ho"), vec!["ho",]);
        assert_eq!(ngrams_edge(5, "hom"), vec!["hom",]);
        assert_eq!(ngrams_edge(5, "home"), vec!["hom", "home",]);
    }

    #[test]
    fn ngrams_edge_non_ascii() {
        assert_eq!(
            ngrams_edge(5, "δεαβγφδε δε"),
            vec!["δεα", "δεαβ", "δεαβγ", "δε",]
        );
    }
}