import os
import subprocess
import tempfile
import pytest
# Skip the whole module when the `prollytree` extension cannot be imported.
prollytree = pytest.importorskip("prollytree")

# The embedder classes are looked up with a ``None`` fallback so that a wheel
# lacking them does not raise AttributeError while this module is imported.
HashEmbedder = getattr(prollytree, "HashEmbedder", None)
CallableEmbedder = getattr(prollytree, "CallableEmbedder", None)

if HashEmbedder is None:
    _SKIP_REASON = (
        "wheel built without the `proximity` feature — skipping text-index tests"
    )
    pytest.skip(_SKIP_REASON, allow_module_level=True)

NamespacedKvStore = prollytree.NamespacedKvStore


def _make_dataset(tmpdir):
    """Initialise a git repository in *tmpdir* and create a ``dataset`` subdirectory.

    Runs ``git init`` in *tmpdir*, sets ``user.name`` and ``user.email`` via
    ``git config``, then creates ``<tmpdir>/dataset``.

    Returns:
        The path of the newly created ``dataset`` directory.

    Raises:
        subprocess.CalledProcessError: if any git command exits non-zero
            (every call uses ``check=True``).
    """
    subprocess.run(["git", "init"], cwd=tmpdir, check=True, capture_output=True)

    identity = (
        ("user.name", "Test User"),
        ("user.email", "test@example.com"),
    )
    for key, value in identity:
        subprocess.run(["git", "config", key, value], cwd=tmpdir, check=True)

    dataset_dir = os.path.join(tmpdir, "dataset")
    os.makedirs(dataset_dir)
    return dataset_dir


def test_hash_embedder_basics():
    """HashEmbedder exposes its dimension and embeds text deterministically."""
    embedder = HashEmbedder(16, 0)
    assert embedder.dim == 16

    sample = "hello world"
    first = embedder.embed(sample)
    second = embedder.embed(sample)
    assert len(first) == 16
    assert first == second, "HashEmbedder must be deterministic"

    # A different input is expected to produce a different vector.
    other = embedder.embed("a different sentence")
    assert first != other


def test_callable_embedder_basics():
    """CallableEmbedder exposes id/version/dim and delegates ``embed`` to the callable.

    Skipped when the installed wheel does not expose ``CallableEmbedder``;
    calling the ``None`` fallback would otherwise fail with an opaque TypeError.
    """
    if CallableEmbedder is None:
        pytest.skip("prollytree wheel does not expose CallableEmbedder")

    def my_embed(text: str):
        # Accumulate scaled code points into 8 buckets, cycling by position.
        vec = [0.0] * 8
        for i, ch in enumerate(text):
            vec[i % 8] += float(ord(ch)) / 256.0
        return vec

    e = CallableEmbedder(id="user:char-sum", version="v1", dim=8, embed_fn=my_embed)
    assert e.id == "user:char-sum"
    assert e.version == "v1"
    assert e.dim == 8

    out = e.embed("hello")
    assert len(out) == 8
    assert out == my_embed("hello")


def test_callable_embedder_rejects_wrong_dim():
    """``embed`` raises ValueError when the callable returns the wrong length.

    The embedder is declared with ``dim=4`` but the callable returns three
    values. Skipped when the installed wheel does not expose
    ``CallableEmbedder``.
    """
    if CallableEmbedder is None:
        pytest.skip("prollytree wheel does not expose CallableEmbedder")

    e = CallableEmbedder(
        id="user:bad-dim", version="v1", dim=4, embed_fn=lambda _t: [0.0, 0.0, 0.0]
    )
    # ``match`` checks the message as part of the context manager, so the
    # check cannot silently become unreachable code after the raising call.
    with pytest.raises(ValueError, match="wrong dim"):
        e.embed("anything")


def test_callable_embedder_end_to_end_in_text_index():
    """A CallableEmbedder-backed text index returns the exact-match document first.

    Each document maps to its own one-hot vector, so searching with a
    document's exact text is expected to return that document as the top hit.
    Skipped when the installed wheel does not expose ``CallableEmbedder``.
    """
    if CallableEmbedder is None:
        pytest.skip("prollytree wheel does not expose CallableEmbedder")

    table = {
        "alpha document one": [1.0, 0.0, 0.0, 0.0],
        "beta document two": [0.0, 1.0, 0.0, 0.0],
        "gamma document three": [0.0, 0.0, 1.0, 0.0],
        "delta document four": [0.0, 0.0, 0.0, 1.0],
    }
    # Paired positionally with the insertion-ordered keys of ``table``.
    doc_ids = [b"alpha", b"beta", b"gamma", b"delta"]

    def embed(text):
        # Texts outside the table fall back to a uniform vector.
        return table.get(text, [0.25, 0.25, 0.25, 0.25])

    with tempfile.TemporaryDirectory() as tmpdir:
        dataset = _make_dataset(tmpdir)
        store = NamespacedKvStore(dataset)
        emb = CallableEmbedder(id="user:lookup", version="v1", dim=4, embed_fn=embed)
        store.text_index_open("personal", "docs", emb)

        for doc_id, text in zip(doc_ids, table):
            store.text_index_insert("personal", "docs", doc_id, text)

        hits = store.text_index_search("personal", "docs", "gamma document three", 1)
        assert len(hits) == 1
        assert hits[0][0] == b"gamma"


def test_text_index_open_insert_search():
    """Open a text index, insert two documents, and search for one of them."""
    ns, index = "personal", "docs"
    corpus = (
        (b"doc:1", "the quick brown fox"),
        (b"doc:2", "lazy dog asleep on the mat"),
    )

    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        store.text_index_open(ns, index, HashEmbedder(32, 0))
        for doc_id, text in corpus:
            store.text_index_insert(ns, index, doc_id, text)

        hits = store.text_index_search(ns, index, "the quick brown fox", 2)
        assert len(hits) >= 1

        # Each hit is indexed as (document id, score).
        top = hits[0]
        assert top[0] == b"doc:1"
        assert isinstance(top[1], float)

        assert store.text_index_len(ns, index) == 2
        assert store.text_index_chunk_count(ns, index) == 2


def test_text_index_delete_and_drop():
    """Deleting shrinks the index; inserting after a drop raises ValueError."""
    ns, index = "personal", "docs"

    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        store.text_index_open(ns, index, HashEmbedder(16, 0))
        for doc_id, text in ((b"id-a", "one"), (b"id-b", "two")):
            store.text_index_insert(ns, index, doc_id, text)

        deleted = store.text_index_delete(ns, index, b"id-a")
        assert deleted is True
        assert store.text_index_len(ns, index) == 1

        dropped = store.text_index_drop(ns, index)
        assert dropped is True

        # The index no longer exists, so further inserts must be rejected.
        with pytest.raises(ValueError):
            store.text_index_insert(ns, index, b"id-c", "three")


def test_text_index_line_chunker_multichunk():
    """With the ``"line"`` option, one three-line document yields three chunks."""
    ns, index = "personal", "lines"
    three_lines = "alpha\nbeta\ngamma"

    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        store.text_index_open(ns, index, HashEmbedder(16, 0), "line")
        store.text_index_insert(ns, index, b"doc:1", three_lines)

        # One document, but one chunk per line of its text.
        assert store.text_index_len(ns, index) == 1
        assert store.text_index_chunk_count(ns, index) == 3


def test_cascade_mirrors_primary_inserts():
    """A namespace insert plus commit becomes searchable in the cascaded index."""
    ns, index = "personal", "docs"
    body = "the cascading text"

    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        store.text_index_open(ns, index, HashEmbedder(16, 0))

        store.set_cascade(ns, [index])
        assert store.cascade_for_namespace(ns) == ["docs"]

        # Write through the primary namespace API, not the text-index API.
        store.ns_insert(ns, b"doc:1", b"the cascading text")
        store.commit("cascade insert")

        hits = store.text_index_search(ns, index, body, 1)
        assert len(hits) == 1
        assert hits[0][0] == b"doc:1"

        store.clear_cascade(ns)
        assert store.cascade_for_namespace(ns) is None


def test_audit_text_index_in_sync():
    """After cascaded inserts and a commit, the audit report shows no drift."""
    ns, index = "personal", "docs"

    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        store.text_index_open(ns, index, HashEmbedder(16, 0))
        store.set_cascade(ns, [index])

        for key, value in ((b"doc:1", b"first"), (b"doc:2", b"second")):
            store.ns_insert(ns, key, value)
        store.commit("two docs")

        report = store.audit_text_index(ns, index)
        assert report["is_in_sync"] is True
        assert report["orphans_in_index"] == []
        assert report["missing_from_index"] == []


def test_externalize_threshold_accessor_round_trip():
    """The externalize threshold starts unset, can be set, and can be cleared."""
    threshold = 64

    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))

        # A fresh store reports no threshold.
        assert store.externalize_threshold() is None

        store.set_externalize_threshold(threshold)
        assert store.externalize_threshold() == 64

        # Passing None clears the value again.
        store.set_externalize_threshold(None)
        assert store.externalize_threshold() is None


def test_gc_blobs_reports_empty_on_git_backend():
    """``gc_blobs`` on a freshly created store reports zero counts and no errors."""
    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        summary = store.gc_blobs()

        for counter in ("total", "referenced", "removed"):
            assert summary[counter] == 0
        assert summary["errors"] == []


def test_repr_unchanged_back_compat():
    """``repr(store)`` still begins with the ``NamespacedKvStore(`` prefix."""
    with tempfile.TemporaryDirectory() as workdir:
        store = NamespacedKvStore(_make_dataset(workdir))
        rendered = repr(store)
        assert rendered.startswith("NamespacedKvStore(")