use std::fs;
use std::sync::Arc;
use rusqlite::Connection;
use lantern::embed::{
DEFAULT_EMBED_MODEL, EmbedOptions, EmbeddingBackend, MockBackendFactory, MockEmbeddingBackend,
VEC_MIRROR_DIM, embed_missing_with, embedding_stats, f32s_to_blob,
};
use lantern::feedback;
use lantern::ingest::ingest_path;
use lantern::mcp::{EmbedArgs, FeedbackArgs, FeedbackVote, LanternServer, SearchArgs};
use lantern::search::{
SemanticOptions, hybrid_search_with, semantic_search_with, vec_semantic_search_with,
};
use lantern::store::Store;
use tempfile::TempDir;
/// Model name used by tests exercising the generic (non-default) embedding path.
const MOCK_MODEL: &str = "mock-embed-test";
/// Builds a temp workspace: initializes a `Store` under `<root>/store`, writes
/// the given `(filename, body)` pairs under `<root>/data`, and ingests that
/// directory. Returns the `TempDir` guard (keeps the files alive), the open
/// store, and the store directory path.
fn setup_with(files: &[(&str, &str)]) -> (TempDir, Store, std::path::PathBuf) {
    let workspace = tempfile::tempdir().unwrap();
    let store_path = workspace.path().join("store");
    let mut store = Store::initialize(&store_path).unwrap();

    let data_dir = workspace.path().join("data");
    fs::create_dir_all(&data_dir).unwrap();
    files
        .iter()
        .for_each(|(name, body)| fs::write(data_dir.join(name), body).unwrap());

    ingest_path(&mut store, &data_dir).unwrap();
    (workspace, store, store_path)
}
/// Embeds every chunk that still lacks a vector, using the shared mock-model
/// settings. Panics on failure; the embed report is intentionally discarded.
fn embed_all(store: &mut Store, backend: &MockEmbeddingBackend) {
    embed_missing_with(
        store,
        &EmbedOptions {
            model: MOCK_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        backend,
    )
    .unwrap();
}
/// Builds `SemanticOptions` for the mock model with no filters and the given
/// result limit.
fn sem_opts(limit: usize) -> SemanticOptions {
    let model = MOCK_MODEL.to_string();
    let ollama_url = "http://mock".to_string();
    SemanticOptions {
        limit,
        kind: None,
        path_contains: None,
        model,
        ollama_url,
        instruction: None,
    }
}
/// First embed pass writes one vector per chunk and reports accurate stats;
/// a second pass is idempotent (nothing re-embedded, both counted as had).
#[test]
fn embed_missing_writes_vectors_into_db() {
    let (_root, mut store, _) = setup_with(&[
        ("a.md", "rust systems programming language"),
        ("b.md", "baking sourdough bread at home"),
    ]);
    let backend = MockEmbeddingBackend::new(64);
    // Build options through a closure so both passes use identical settings
    // without requiring EmbedOptions to implement Clone.
    let opts = || EmbedOptions {
        model: MOCK_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        limit: None,
    };

    // First pass: every chunk is missing a vector, so both get embedded.
    let report = embed_missing_with(&mut store, &opts(), &backend).unwrap();
    assert_eq!(report.embedded, 2, "one chunk per short file");
    assert_eq!(report.failed, 0);
    assert_eq!(report.dim, Some(64));
    assert_eq!(report.model, MOCK_MODEL);

    // Stats should show a single model entry covering both chunks.
    let stats = embedding_stats(&store).unwrap();
    assert_eq!(stats.len(), 1);
    assert_eq!(stats[0].model, MOCK_MODEL);
    assert_eq!(stats[0].dim, 64);
    assert_eq!(stats[0].count, 2);

    // Second pass must be a no-op: everything already has a vector.
    let second = embed_missing_with(&mut store, &opts(), &backend).unwrap();
    assert_eq!(second.embedded, 0);
    assert_eq!(second.already_had, 2);
}
/// Embedding under the default model must dual-write: rows land in the
/// canonical `embeddings` table AND in the `chunks_vec_nomic_768` mirror,
/// with every mirror rowid referencing an existing chunk.
#[test]
fn embed_missing_dual_writes_into_vec_mirror_for_default_model() {
    let (_root, mut store, _) = setup_with(&[
        ("a.md", "rust systems programming language"),
        ("b.md", "baking sourdough bread at home"),
    ]);
    // 768 matches the mirror table's fixed dimension (name suffix `_768`).
    let backend = MockEmbeddingBackend::new(768);
    let report = embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();
    assert_eq!(report.embedded, 2);
    assert_eq!(report.dim, Some(768));
    // Canonical table: both chunks recorded under the default model.
    let canonical: i64 = store
        .conn()
        .query_row(
            "SELECT COUNT(*) FROM embeddings WHERE model = ?1",
            [DEFAULT_EMBED_MODEL],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(
        canonical, 2,
        "canonical embeddings table must have both rows"
    );
    // Mirror table: the same two vectors must also have been written.
    let mirror: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM chunks_vec_nomic_768", [], |row| {
            row.get(0)
        })
        .unwrap();
    assert_eq!(mirror, 2, "vec mirror must be dual-written");
    // Referential integrity: no mirror row may point at a missing chunk.
    let orphans: i64 = store
        .conn()
        .query_row(
            "SELECT COUNT(*) FROM chunks_vec_nomic_768 v
             WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.rowid = v.rowid)",
            [],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(orphans, 0, "mirror rowids must all reference chunks");
}
/// Embedding under a non-default model must leave the sqlite-vec mirror table
/// completely empty — only the default model participates in the mirror.
#[test]
fn embed_missing_skips_vec_mirror_for_non_default_model() {
    let (_root, mut store, _) = setup_with(&[("a.md", "anything at all")]);
    let backend = MockEmbeddingBackend::new(768);
    // Same settings as the explicit EmbedOptions the original spelled out:
    // MOCK_MODEL against the mock URL with no limit.
    embed_all(&mut store, &backend);

    let mirror_rows: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM chunks_vec_nomic_768", [], |row| {
            row.get(0)
        })
        .unwrap();
    assert_eq!(mirror_rows, 0);
}
/// A topical query should rank the on-topic document first, with scores in
/// non-increasing order and a strictly positive top score.
#[test]
fn semantic_search_ranks_topically_related_chunks_first() {
    let corpus: &[(&str, &str)] = &[
        (
            "rust.md",
            "rust is a systems programming language with memory safety",
        ),
        (
            "bread.md",
            "sourdough bread requires flour water salt and patience",
        ),
        (
            "cars.md",
            "electric cars have large batteries and regenerative braking",
        ),
    ];
    let (_root, mut store, _) = setup_with(corpus);
    let backend = MockEmbeddingBackend::new(128);
    embed_all(&mut store, &backend);

    let hits =
        semantic_search_with(&store, "systems programming memory", &sem_opts(3), &backend).unwrap();

    assert_eq!(hits.len(), 3, "all three candidates should be returned");
    assert!(
        hits[0].uri.ends_with("/rust.md"),
        "rust doc must rank first; got {}",
        hits[0].uri
    );
    // Equivalent to the pairwise hits[0] >= hits[1] >= hits[2] checks.
    for pair in hits.windows(2) {
        assert!(pair[0].score >= pair[1].score);
    }
    assert!(hits[0].score > 0.0);
}
/// The `limit` option caps the number of hits even when more chunks match.
#[test]
fn semantic_search_respects_limit() {
    let docs = [
        ("a.md", "needle one"),
        ("b.md", "needle two"),
        ("c.md", "needle three"),
    ];
    let (_root, mut store, _) = setup_with(&docs);
    let backend = MockEmbeddingBackend::new(64);
    embed_all(&mut store, &backend);

    let hits = semantic_search_with(&store, "needle", &sem_opts(2), &backend).unwrap();
    assert_eq!(hits.len(), 2);
}
/// A whitespace-only query must produce zero hits rather than an error.
#[test]
fn semantic_search_empty_query_returns_nothing() {
    let (_root, mut store, _) = setup_with(&[("a.md", "anything")]);
    let backend = MockEmbeddingBackend::new(64);
    embed_all(&mut store, &backend);

    let results = semantic_search_with(&store, " ", &sem_opts(10), &backend).unwrap();
    assert!(results.is_empty(), "blank query should yield no hits");
}
/// Hybrid ranking should put the document containing the literal query tokens
/// on top, and present blended scores sorted from best to worst.
#[test]
fn hybrid_search_blends_keyword_and_semantic_signals() {
    let (_root, mut store, _) = setup_with(&[
        (
            "rust.md",
            "rust is a systems programming language with lifetimes",
        ),
        (
            "programming.md",
            "systems programming languages balance safety and performance",
        ),
        (
            "trees.md",
            "redwood trees grow tall along the northern coast",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(128);
    embed_all(&mut store, &backend);

    let hits = hybrid_search_with(&store, "rust programming", &sem_opts(3), &backend).unwrap();
    assert!(!hits.is_empty());

    let top = &hits[0];
    assert!(
        top.uri.ends_with("/rust.md"),
        "literal-token hit should lead hybrid ranking; got {}",
        top.uri
    );
    for adjacent in hits.windows(2) {
        assert!(adjacent[0].score >= adjacent[1].score);
    }
}
/// Rank-fusion property: a chunk that appears in BOTH the keyword list and the
/// semantic list must outrank chunks that appear in only one, and the
/// semantic-only chunk must still be present in the fused results.
#[test]
fn hybrid_search_rrf_rewards_cross_list_matches() {
    let (_root, mut store, _) = setup_with(&[
        // Matches the query literally ("rust") AND topically.
        (
            "both.md",
            "rust systems programming language with memory safety",
        ),
        // Keyword-only: contains "rust" but is otherwise unrelated filler.
        (
            "literal.md",
            "rust unrelated gibberish lorem ipsum dolor sit amet",
        ),
        // Semantic-only: topically related but lacks the literal token "rust".
        (
            "topical.md",
            "systems programming language tradeoffs at runtime",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(128);
    embed_all(&mut store, &backend);
    let hits = hybrid_search_with(&store, "rust programming", &sem_opts(10), &backend).unwrap();
    let uri_of = |idx: usize| hits[idx].uri.clone();
    // Position of the hit whose URI ends with `needle`; panics with the full
    // hit list when the expected document is absent, for easier diagnosis.
    let rank_of = |needle: &str| {
        hits.iter()
            .position(|h| h.uri.ends_with(needle))
            .unwrap_or_else(|| {
                panic!(
                    "expected {needle} in hybrid hits; got {:?}",
                    hits.iter().map(|h| h.uri.clone()).collect::<Vec<_>>()
                )
            })
    };
    let both = rank_of("/both.md");
    let literal = rank_of("/literal.md");
    let topical = rank_of("/topical.md");
    // Lower index = better rank: the cross-list chunk must beat both others.
    assert!(
        both < literal && both < topical,
        "chunk matching both lists should outrank one-sided hits; got order: {}, {}, {}",
        uri_of(0),
        uri_of(1),
        uri_of(2),
    );
    assert!(
        hits.iter().any(|h| h.uri.ends_with("/topical.md")),
        "semantic-only hit must still appear in hybrid results"
    );
    assert!(hits[0].score > 0.0);
    // Fused scores must be presented in non-increasing order.
    for w in hits.windows(2) {
        assert!(w[0].score >= w[1].score);
    }
}
/// For the default model, the vec-mirror-backed search must return exactly the
/// same hits, in the same order, as the brute-force scan over `embeddings`.
#[test]
fn vec_semantic_search_matches_brute_force_order_for_default_model() {
    let (_root, mut store, _) = setup_with(&[
        (
            "rust.md",
            "rust is a systems programming language with memory safety",
        ),
        ("prog.md", "other systems programming concepts"),
        ("mem.md", "memory considerations at runtime only"),
    ]);
    // Backend dimension must match the mirror table's fixed dimension.
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    // Embed under the default model so vectors are dual-written to the mirror.
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();
    let opts = SemanticOptions {
        limit: 3,
        kind: None,
        path_contains: None,
        model: DEFAULT_EMBED_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let query = "systems programming memory safety";
    // Run the same query through both code paths.
    let brute = semantic_search_with(&store, query, &opts, &backend).unwrap();
    let via_vec = vec_semantic_search_with(&store, query, &opts, &backend).unwrap();
    assert!(!brute.is_empty(), "brute-force should rank something");
    assert_eq!(
        brute.len(),
        via_vec.len(),
        "vec helper must return the same number of hits"
    );
    // Compare by chunk id so the check is ordering-sensitive but score-agnostic.
    let brute_ids: Vec<&str> = brute.iter().map(|h| h.chunk_id.as_str()).collect();
    let vec_ids: Vec<&str> = via_vec.iter().map(|h| h.chunk_id.as_str()).collect();
    assert_eq!(
        brute_ids, vec_ids,
        "vec-backed ordering must match brute-force for the default model"
    );
    assert!(
        via_vec[0].uri.ends_with("/rust.md"),
        "rust doc should rank first; got {}",
        via_vec[0].uri
    );
}
/// The vec-mirror search path only supports the default model; querying with
/// any other model must fail with an error that names the supported model.
#[test]
fn vec_semantic_search_rejects_non_default_model() {
    let (_root, mut store, _) = setup_with(&[("a.md", "anything")]);
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    // Embed under MOCK_MODEL — identical settings to the EmbedOptions the
    // original spelled out inline, now via the shared helper.
    embed_all(&mut store, &backend);

    // sem_opts targets MOCK_MODEL (limit 3, no filters) — exactly the options
    // the original constructed by hand, and the model the vec path must refuse.
    let err = vec_semantic_search_with(&store, "anything", &sem_opts(3), &backend).unwrap_err();
    let msg = format!("{err}");
    assert!(
        msg.contains(DEFAULT_EMBED_MODEL),
        "error should name the supported model; got {msg}"
    );
}
/// Searching under a model with no stored vectors must fail with a message
/// naming the missing model AND listing the models that are available.
#[test]
fn semantic_search_reports_missing_model_with_available_models() {
    let (_root, mut store, _) = setup_with(&[("a.md", "anything")]);
    let backend = MockEmbeddingBackend::new(64);
    // Store vectors only under MOCK_MODEL — identical settings to the
    // EmbedOptions the original spelled out inline, now via the shared helper.
    embed_all(&mut store, &backend);

    // Query under the default model, which has no stored embeddings here.
    let opts = SemanticOptions {
        limit: 3,
        kind: None,
        path_contains: None,
        model: DEFAULT_EMBED_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let err = semantic_search_with(&store, "anything", &opts, &backend).unwrap_err();
    let msg = format!("{err}");
    assert!(
        msg.contains("no stored embeddings for model 'nomic-embed-text'"),
        "error should name the missing model; got {msg}"
    );
    assert!(
        msg.contains("available models: mock-embed-test"),
        "error should list available models; got {msg}"
    );
}
/// Kind filters must still be applied when searching under the default model
/// (the code path that can be served from the vec mirror — see the dual-write
/// tests above): no non-markdown hits may leak through.
#[test]
fn semantic_search_with_default_model_still_honors_filters() {
    let (_root, mut store, _) = setup_with(&[
        ("rust.md", "rust systems programming language with safety"),
        // Same content, different kind — must be excluded by the filter.
        (
            "notes.json",
            "{\"note\":\"rust systems programming language with safety\"}",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();
    // Restrict results to markdown only.
    let opts = SemanticOptions {
        limit: 5,
        kind: Some("text/markdown".to_string()),
        path_contains: None,
        model: DEFAULT_EMBED_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let hits = semantic_search_with(&store, "rust systems programming", &opts, &backend).unwrap();
    assert!(!hits.is_empty());
    assert!(
        hits.iter().all(|hit| hit.kind == "text/markdown"),
        "filtered semantic search must not leak non-markdown hits: {:?}",
        hits.iter().map(|hit| &hit.kind).collect::<Vec<_>>()
    );
    assert!(
        hits.iter().any(|hit| hit.uri.ends_with("/rust.md")),
        "markdown doc should remain in the result set"
    );
}
/// Opening a legacy database (hand-built schema at `user_version = 3`, which
/// predates the vec mirror table) must migrate it and backfill the mirror from
/// the existing `embeddings` rows, after which semantic search works.
#[test]
fn legacy_store_migrates_and_backfills_vec_mirror() {
    let root = tempfile::tempdir().unwrap();
    let store_dir = root.path().join("store");
    fs::create_dir_all(&store_dir).unwrap();
    // Build the legacy database by hand, bypassing Store::initialize.
    let db_path = store_dir.join(lantern::store::DB_FILENAME);
    let conn = Connection::open(&db_path).unwrap();
    // Legacy schema: sources/chunks/embeddings only — no vec mirror table.
    conn.execute_batch(
        "CREATE TABLE sources (
            id TEXT PRIMARY KEY,
            uri TEXT NOT NULL,
            path TEXT,
            kind TEXT NOT NULL,
            bytes INTEGER NOT NULL,
            content_sha256 TEXT NOT NULL,
            mtime_unix INTEGER,
            ingested_at INTEGER NOT NULL
        );
        CREATE TABLE chunks (
            id TEXT PRIMARY KEY,
            source_id TEXT NOT NULL,
            ordinal INTEGER NOT NULL,
            byte_start INTEGER NOT NULL,
            byte_end INTEGER NOT NULL,
            char_count INTEGER NOT NULL,
            text TEXT NOT NULL,
            sha256 TEXT NOT NULL,
            created_at INTEGER NOT NULL,
            UNIQUE(source_id, ordinal)
        );
        CREATE TABLE embeddings (
            chunk_id TEXT PRIMARY KEY,
            model TEXT NOT NULL,
            dim INTEGER NOT NULL,
            embedding BLOB NOT NULL,
            created_at INTEGER NOT NULL
        );
        PRAGMA user_version = 3;",
    )
    .unwrap();
    // One source row referencing /tmp/rust.md.
    conn.execute(
        "INSERT INTO sources (id, uri, path, kind, bytes, content_sha256, mtime_unix, ingested_at)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
        rusqlite::params![
            "s1",
            "file:///tmp/rust.md",
            Some("/tmp/rust.md"),
            "text/markdown",
            31_i64,
            "sha",
            Option::<i64>::None,
            1_i64,
        ],
    )
    .unwrap();
    // One chunk covering the whole (31-byte) source text.
    conn.execute(
        "INSERT INTO chunks (id, source_id, ordinal, byte_start, byte_end, char_count, text, sha256, created_at)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
        rusqlite::params![
            "c1",
            "s1",
            0_i64,
            0_i64,
            31_i64,
            31_i64,
            "rust systems programming language",
            "sha",
            1_i64,
        ],
    )
    .unwrap();
    // A real mock embedding for that chunk, stored under the default model so
    // the backfill considers it mirror-eligible.
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    let embedding = backend.embed("rust systems programming language").unwrap();
    conn.execute(
        "INSERT INTO embeddings (chunk_id, model, dim, embedding, created_at)
         VALUES (?1, ?2, ?3, ?4, ?5)",
        rusqlite::params![
            "c1",
            DEFAULT_EMBED_MODEL,
            VEC_MIRROR_DIM as i64,
            f32s_to_blob(&embedding),
            1_i64,
        ],
    )
    .unwrap();
    // Close the raw connection before Store::open takes over the file.
    drop(conn);
    // Opening the store should run migrations and backfill the mirror.
    let store = Store::open(&store_dir).unwrap();
    let mirror_count: i64 = store
        .conn()
        .query_row(
            &format!("SELECT count(*) FROM {}", lantern::store::VEC_MIRROR_TABLE),
            [],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(mirror_count, 1);
    // End-to-end check: the migrated store serves semantic search.
    let hits = semantic_search_with(
        &store,
        "rust systems programming language",
        &SemanticOptions {
            limit: 5,
            kind: None,
            path_contains: None,
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            instruction: None,
        },
        &backend,
    )
    .unwrap();
    assert!(!hits.is_empty());
    assert!(hits[0].uri.ends_with("/rust.md"));
}
/// End-to-end through the MCP server: embed via `embed_sync`, then run
/// semantic and hybrid searches via `search_sync`, all using an injected mock
/// backend factory (no real Ollama involved).
#[test]
fn mcp_server_drives_semantic_search_through_injected_factory() {
    let (_root, store, store_dir) = setup_with(&[
        ("a.md", "rust programming with borrow checker and traits"),
        ("b.md", "baking sourdough at home on the weekend"),
    ]);
    // Release the store handle so the server can open the same database.
    drop(store);
    let factory = Arc::new(MockBackendFactory::new(128));
    let server = LanternServer::with_factory(store_dir, factory);
    // Embed both chunks through the server-side entry point.
    let report = server
        .embed_sync(EmbedArgs {
            model: Some(MOCK_MODEL.to_string()),
            ollama_url: Some("http://mock".to_string()),
            limit: None,
        })
        .unwrap();
    assert_eq!(report.embedded, 2);
    assert_eq!(report.dim, Some(128));
    // Semantic mode: the rust doc should come back first.
    let resp = server
        .search_sync(SearchArgs {
            query: "rust programming".to_string(),
            limit: Some(5),
            kind: None,
            path: None,
            mode: Some("semantic".to_string()),
            model: Some(MOCK_MODEL.to_string()),
            ollama_url: Some("http://mock".to_string()),
            instruction: None,
        })
        .unwrap();
    // The response is JSON; pull out the "results" array.
    let results = resp
        .get("results")
        .and_then(|v| v.as_array())
        .expect("results array");
    assert!(!results.is_empty(), "semantic search returned no hits");
    let top_uri = results[0]
        .get("uri")
        .and_then(|v| v.as_str())
        .unwrap_or_default();
    assert!(
        top_uri.ends_with("/a.md"),
        "rust doc should rank first; got {top_uri}"
    );
    // Hybrid mode: only assert that it produces a non-empty result set.
    let hybrid = server
        .search_sync(SearchArgs {
            query: "rust programming".to_string(),
            limit: Some(5),
            kind: None,
            path: None,
            mode: Some("hybrid".to_string()),
            model: Some(MOCK_MODEL.to_string()),
            ollama_url: Some("http://mock".to_string()),
            instruction: None,
        })
        .unwrap();
    assert!(
        hybrid
            .get("results")
            .and_then(|v| v.as_array())
            .is_some_and(|arr| !arr.is_empty())
    );
}
/// An upvote recorded through the MCP server must be visible on the same
/// per-chunk score that the `feedback` module reads back from the store.
#[test]
fn mcp_feedback_sync_updates_the_same_chunk_score_as_cli_feedback() {
    let (_root, store, store_dir) = setup_with(&[("a.md", "Lanterns glow in the dark forest.")]);
    let chunk_id: String = store
        .conn()
        .query_row("SELECT id FROM chunks LIMIT 1", [], |row| row.get(0))
        .unwrap();
    // Release the store handle so the server can open the same database.
    drop(store);

    let factory = Arc::new(MockBackendFactory::new(64));
    let server = LanternServer::with_factory(store_dir.clone(), factory);
    let args = FeedbackArgs {
        chunk_id: chunk_id.clone(),
        vote: FeedbackVote::Up,
    };
    let report = server.feedback_sync(args).unwrap();

    assert_eq!(report.chunk_id, chunk_id);
    assert_eq!(report.delta, 1);
    assert_eq!(report.score, 1);

    // Re-open independently and confirm the score via the feedback API.
    let reopened = Store::open(&store_dir).unwrap();
    assert_eq!(
        feedback::get_feedback_score(&reopened, &chunk_id).unwrap(),
        Some(1)
    );
}