use std::collections::HashSet;
use dyntext::index::TextIndex;
use hegel::generators as gs;
use hegel::TestCase;
fn arb_text(tc: &TestCase) -> Vec<u8> {
let len = tc.draw(gs::integers::<usize>().min_value(0).max_value(40));
let mut out = Vec::with_capacity(len);
for _ in 0..len {
let c = tc.draw(gs::integers::<u8>().min_value(b'a').max_value(b'd'));
out.push(c);
}
out
}
fn arb_corpus(tc: &TestCase) -> Vec<Vec<u8>> {
let n = tc.draw(gs::integers::<usize>().min_value(0).max_value(8));
let mut out = Vec::with_capacity(n);
for _ in 0..n {
out.push(arb_text(tc));
}
out
}
fn arb_query(tc: &TestCase) -> Vec<u8> {
let len = tc.draw(gs::integers::<usize>().min_value(0).max_value(8));
let mut out = Vec::with_capacity(len);
for _ in 0..len {
let c = tc.draw(gs::integers::<u8>().min_value(b'a').max_value(b'd'));
out.push(c);
}
out
}
#[hegel::test(test_cases = 256)]
fn substring_search_no_false_negatives_under_arbitrary_corpus(tc: TestCase) {
let corpus = arb_corpus(&tc);
let query = arb_query(&tc);
let mut idx = TextIndex::new();
let ids: Vec<u32> = corpus.iter().map(|t| idx.insert(t.clone())).collect();
let observed: HashSet<u32> = idx.search_substring(&query).into_iter().collect();
let expected: HashSet<u32> = corpus
.iter()
.enumerate()
.filter(|(_, t)| substring_match(t, &query))
.map(|(i, _)| ids[i])
.collect();
assert!(
expected.is_subset(&observed),
"true positives missed: expected={expected:?} observed={observed:?} \
corpus={corpus:?} query={query:?}",
);
}
#[hegel::test(test_cases = 256)]
fn substring_search_returns_only_true_positives(tc: TestCase) {
let corpus = arb_corpus(&tc);
let query = arb_query(&tc);
let mut idx = TextIndex::new();
let ids: Vec<u32> = corpus.iter().map(|t| idx.insert(t.clone())).collect();
for got in idx.search_substring(&query) {
let pos = ids.iter().position(|id| *id == got).expect("id known");
let doc = &corpus[pos];
assert!(
substring_match(doc, &query),
"false positive: doc {doc:?} returned for query {query:?}",
);
}
}
#[hegel::test(test_cases = 256)]
fn insert_remove_round_trip(tc: TestCase) {
let corpus = arb_corpus(&tc);
let query = arb_query(&tc);
if corpus.is_empty() {
return;
}
let drop_idx = tc.draw(
gs::integers::<usize>()
.min_value(0)
.max_value(corpus.len() - 1),
);
let mut without = TextIndex::new();
let mut ids_without = Vec::new();
for (i, t) in corpus.iter().enumerate() {
if i == drop_idx {
continue;
}
ids_without.push(without.insert(t.clone()));
}
let mut with = TextIndex::new();
let mut all_ids = Vec::new();
for t in &corpus {
all_ids.push(with.insert(t.clone()));
}
let dropped_id = all_ids[drop_idx];
with.remove(dropped_id);
let r1: HashSet<u32> = without.search_substring(&query).into_iter().collect();
let r2: HashSet<u32> = with.search_substring(&query).into_iter().collect();
let m1: HashSet<&[u8]> = r1
.iter()
.map(|id| {
let pos = ids_without.iter().position(|i| i == id).unwrap();
let mut k = 0;
for (i, t) in corpus.iter().enumerate() {
if i == drop_idx {
continue;
}
if k == pos {
return t.as_slice();
}
k += 1;
}
panic!("id mapping failed");
})
.collect();
let m2: HashSet<&[u8]> = r2
.iter()
.map(|id| {
let pos = all_ids.iter().position(|i| i == id).unwrap();
corpus[pos].as_slice()
})
.collect();
assert_eq!(m1, m2);
}
fn substring_match(haystack: &[u8], needle: &[u8]) -> bool {
if needle.is_empty() {
return true;
}
if needle.len() > haystack.len() {
return false;
}
haystack.windows(needle.len()).any(|w| w == needle)
}