#![cfg(all(feature = "proximity", feature = "git"))]
mod common;
use common::setup_repo_and_dataset;
use prollytree::git::versioned_store::FileNamespacedKvStore;
use prollytree::proximity::{HashEmbedder, LineChunker, TextIndexConfig};
/// Const generic argument supplied to `FileNamespacedKvStore::<N>` by every
/// test in this file.
// NOTE(review): what this parameter controls is defined by the store type,
// not visible here — confirm before changing the value.
const N: usize = 32;
/// Builds the text-index configuration shared by the line-chunking tests.
///
/// A `HashEmbedder` is created from `dim` and `seed`, wrapped in a
/// `TextIndexConfig`, and `LineChunker` is installed as its chunker.
fn line_cfg(dim: u16, seed: u64) -> TextIndexConfig<HashEmbedder> {
    let embedder = HashEmbedder::new(dim, seed);
    let base = TextIndexConfig::new(embedder);
    base.with_chunker(LineChunker)
}
/// A three-line document inserted through the line-chunking config must be
/// counted as one document and three chunks.
#[test]
fn line_chunker_produces_one_chunk_per_line() {
    // `_tmp` is bound to a name (not `_`) so it stays alive for the whole test.
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut ns = kv.namespace("personal");
    let mut index = ns.text_index("docs", line_cfg(8, 0)).unwrap();

    let body = "first line\nsecond line\nthird line";
    index.insert(b"doc:1", body).unwrap();

    assert_eq!(index.len(), 1, "one document inserted");
    assert_eq!(index.chunk_count(), 3, "three chunks (one per line)");
}
/// Deleting one of two documents must drop that document and all of its
/// chunks, leaving only the other document's chunks in the index.
#[test]
fn delete_removes_all_chunks_for_a_doc() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut ns = kv.namespace("personal");
    let mut index = ns.text_index("docs", line_cfg(8, 0)).unwrap();

    // Three lines plus two lines: five chunks across two documents.
    let three_lines = "alpha\nbeta\ngamma";
    let two_lines = "delta\nepsilon";
    index.insert(b"doc:1", three_lines).unwrap();
    index.insert(b"doc:2", two_lines).unwrap();
    assert_eq!(index.len(), 2);
    assert_eq!(index.chunk_count(), 5);

    // `delete` is asserted to return `true` for the key inserted above.
    assert!(index.delete(b"doc:1"));
    assert_eq!(index.len(), 1);
    assert_eq!(index.chunk_count(), 2, "only doc:2's chunks remain");
}
/// Inserting a second time under the same id must replace the earlier chunks
/// rather than add to them.
#[test]
fn re_insert_replaces_all_old_chunks() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut ns = kv.namespace("personal");
    let mut index = ns.text_index("docs", line_cfg(8, 0)).unwrap();

    // First version: four lines.
    let original_text = "one\ntwo\nthree\nfour";
    index.insert(b"doc:1", original_text).unwrap();
    assert_eq!(index.chunk_count(), 4);

    // Second version under the same id: two lines.
    let replacement_text = "shorter\ntext";
    index.insert(b"doc:1", replacement_text).unwrap();
    assert_eq!(index.len(), 1);
    assert_eq!(index.chunk_count(), 2, "old 4 chunks replaced by 2 new");
}
/// Searching with `k = 1` for a line that appears verbatim in one document
/// must return exactly that document, with a score below `1e-4`.
#[test]
fn search_returns_top_k_documents_dedupped() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut ns = kv.namespace("personal");
    let mut index = ns.text_index("docs", line_cfg(16, 0)).unwrap();

    // The middle line of this document is the exact query text used below.
    let matching_doc = "unrelated noise\nthe exact target line\nmore filler";
    index.insert(b"doc:hit", matching_doc).unwrap();

    let unrelated_doc = "completely different content\nnothing matching here";
    index.insert(b"doc:other", unrelated_doc).unwrap();

    let hits = index.search("the exact target line", 1).unwrap();
    assert_eq!(hits.len(), 1, "k=1 returns 1 document");

    let best = &hits[0];
    assert_eq!(
        best.id, b"doc:hit",
        "matching chunk's doc_id is the result"
    );
    // The assertion message treats `score` as a distance.
    assert!(
        best.score < 1e-4,
        "exact match should yield near-zero distance"
    );
}
/// A search whose `k` exceeds the number of documents must list each document
/// at most once.
///
/// Two multi-line documents are indexed through the line-chunking config, then
/// a search for `"the exact phrase"` with `k = 5` is issued. The test counts
/// how many returned hits carry the id `doc:hit` and requires exactly one.
#[test]
fn search_dedups_when_many_chunks_match() {
    let (_temp, dataset) = setup_repo_and_dataset();
    let mut store = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut personal = store.namespace("personal");
    let mut docs = personal.text_index("docs", line_cfg(8, 0)).unwrap();

    docs.insert(
        b"doc:hit",
        "near match here\nthe exact phrase\nyet more text",
    )
    .unwrap();
    docs.insert(b"doc:other", "completely different\nstill different")
        .unwrap();

    let hits = docs.search("the exact phrase", 5).unwrap();

    // Compare each hit's id against the byte literal directly: no intermediate
    // Vec of cloned ids and no per-iteration `to_vec()` allocation.
    let doc_hit_count = hits.iter().filter(|hit| hit.id == b"doc:hit").count();

    assert_eq!(
        doc_hit_count, 1,
        "doc:hit should appear once even though several chunks match"
    );
}
/// After `set_cascade` names the `docs` index for the `personal` namespace,
/// a primary insert followed by a commit must leave one document and three
/// chunks in that index.
#[test]
fn cascade_with_chunker_inserts_per_line() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();

    // Open the index once up front and discard the handle.
    // NOTE(review): presumably this registers "docs" before the cascade is
    // configured — confirm against the store implementation.
    {
        let mut ns = kv.namespace("personal");
        let _ = ns.text_index("docs", line_cfg(8, 0)).unwrap();
    }

    kv.set_cascade("personal", vec![String::from("docs")]);

    // Write to the namespace itself, not to the text index.
    {
        let key = b"doc:1".to_vec();
        let value = b"alpha\nbeta\ngamma".to_vec();
        let mut ns = kv.namespace("personal");
        ns.insert(key, value).unwrap();
    }
    kv.commit("cascade chunked").unwrap();

    // Re-open the index and inspect its counters.
    let mut ns = kv.namespace("personal");
    let index = ns.text_index("docs", line_cfg(8, 0)).unwrap();
    assert_eq!(index.len(), 1);
    assert_eq!(index.chunk_count(), 3, "cascade chunked the primary value");
}
/// With the `docs` index named in the cascade for `personal`, deleting a
/// primary key and committing must leave the index with zero chunks.
#[test]
fn cascade_delete_removes_all_chunks() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();

    // Open the index once up front and discard the handle.
    // NOTE(review): presumably this registers "docs" before the cascade is
    // configured — confirm against the store implementation.
    {
        let mut ns = kv.namespace("personal");
        let _ = ns.text_index("docs", line_cfg(8, 0)).unwrap();
    }

    kv.set_cascade("personal", vec![String::from("docs")]);

    // Step 1: insert a three-line value through the namespace and commit.
    {
        let key = b"doc:1".to_vec();
        let value = b"one\ntwo\nthree".to_vec();
        let mut ns = kv.namespace("personal");
        ns.insert(key, value).unwrap();
    }
    kv.commit("insert").unwrap();

    // Step 2: delete the same key through the namespace and commit again.
    // The namespace-level `delete` returns a `Result` whose `Ok` value is
    // asserted to be `true`.
    {
        let mut ns = kv.namespace("personal");
        assert!(ns.delete(b"doc:1").unwrap());
    }
    kv.commit("delete").unwrap();

    // Step 3: the re-opened index must report no chunks.
    let mut ns = kv.namespace("personal");
    let index = ns.text_index("docs", line_cfg(8, 0)).unwrap();
    assert_eq!(index.chunk_count(), 0, "cascade delete removed all chunks");
}
/// Inserting an empty string through the line-chunking config must succeed
/// and leave both the chunk count and the document count at zero.
#[test]
fn empty_document_under_line_chunker_indexes_nothing() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut ns = kv.namespace("personal");
    let mut index = ns.text_index("docs", line_cfg(8, 0)).unwrap();

    let empty_body = "";
    index.insert(b"doc:1", empty_body).unwrap();

    assert_eq!(index.chunk_count(), 0);
    assert_eq!(index.len(), 0);
}
/// A config built with `TextIndexConfig::new` alone (no `with_chunker` call)
/// must index a multi-line document as a single chunk.
///
/// The assertion message attributes this to `IdentityChunker`.
#[test]
fn identity_chunker_remains_default_back_compat() {
    let (_tmp, dataset) = setup_repo_and_dataset();
    let mut kv = FileNamespacedKvStore::<N>::init(&dataset).unwrap();
    let mut ns = kv.namespace("personal");

    // This test builds the config directly instead of using `line_cfg`,
    // so no chunker is set explicitly.
    let default_cfg = TextIndexConfig::new(HashEmbedder::new(8, 0));
    let mut index = ns.text_index("docs", default_cfg).unwrap();

    let body = "first line\nsecond line\nthird line";
    index.insert(b"doc:1", body).unwrap();

    assert_eq!(index.len(), 1);
    assert_eq!(
        index.chunk_count(),
        1,
        "IdentityChunker should produce 1 chunk regardless of newlines"
    );
}