use anyhow::Result;
use ignore::WalkBuilder;
use sha2::{Digest, Sha256};
use std::path::Path;
use std::time::SystemTime;
use crate::librarian::catalog::artifact::ArtifactRow;
use crate::librarian::catalog::{artifact, Catalog};
use crate::librarian::classify::{classify, CompiledRule};
use crate::librarian::frontmatter;
/// Tally of what one indexing pass did to the catalog.
///
/// `index_repo_sync` fills every counter except `embedded`, which is only
/// advanced by `index_repo` as embeddings are persisted.
#[derive(Debug, Default)]
pub struct IndexReport {
/// Files that were upserted and had no existing catalog row.
pub added: usize,
/// Files that were upserted over an existing catalog row.
pub updated: usize,
/// Files skipped because both content hash and derived metadata matched the catalog.
pub unchanged: usize,
/// Catalog rows deleted because no file under the root produced their id in this pass.
pub removed: usize,
/// Number of embeddings persisted during this pass.
pub embedded: usize,
/// Ids of upserted rows whose kind resolved to `"unknown"` (skipped-unchanged files are not listed).
pub unknown_ids: Vec<String>,
}
/// One pending embedding job: `(artifact id, optional title, text to embed)`.
pub type EmbedQueueItem = (String, Option<String>, String);
/// Returns the text of the first non-empty level-1 heading in `body`.
///
/// The markdown is parsed with `pulldown_cmark`, so both ATX (`# Title`) and
/// setext (`Title\n=====`) headings are recognised, and a `#` line inside a
/// fenced code block is not treated as a heading. Plain text and inline code
/// inside the heading are concatenated; a soft or hard line break inside the
/// heading contributes a single space so that a heading spanning two source
/// lines does not have its words glued together. The result is trimmed.
///
/// Returns `None` when `body` has no H1 with non-blank text.
pub fn first_h1(body: &str) -> Option<String> {
    use pulldown_cmark::{Event, HeadingLevel, Parser, Tag, TagEnd};

    let mut in_h1 = false;
    let mut title = String::new();
    for event in Parser::new(body) {
        match event {
            Event::Start(Tag::Heading {
                level: HeadingLevel::H1,
                ..
            }) => in_h1 = true,
            Event::End(TagEnd::Heading(HeadingLevel::H1)) => {
                let trimmed = title.trim();
                if !trimmed.is_empty() {
                    return Some(trimmed.to_string());
                }
                // Empty H1 (e.g. a bare `#`): keep looking for a later one.
                in_h1 = false;
                title.clear();
            }
            Event::Text(t) | Event::Code(t) if in_h1 => title.push_str(&t),
            // A line break inside the heading separates words; without this
            // "Foo\nBar\n===" would come out as "FooBar".
            Event::SoftBreak | Event::HardBreak if in_h1 => title.push(' '),
            _ => {}
        }
    }
    None
}
/// Builds a SQL `LIKE` pattern that matches exactly the strings starting with `prefix`.
///
/// `%`, `_` and `\` in `prefix` are escaped with a backslash, so the pattern must be
/// used together with `ESCAPE '\'`. A trailing `%` wildcard is appended.
fn like_prefix_pattern(prefix: &str) -> String {
    let mut pattern = String::with_capacity(prefix.len() + 1);
    for ch in prefix.chars() {
        if matches!(ch, '\\' | '%' | '_') {
            pattern.push('\\');
        }
        pattern.push(ch);
    }
    pattern.push('%');
    pattern
}

/// Deletes catalog rows whose `abs_path` lies under `abs_root` and whose id is not in
/// `seen_ids`; returns the number of rows deleted.
///
/// Candidate ids are selected first and diffed in Rust, then deleted one by one. This
/// keeps the number of bound SQL variables constant regardless of how many files were
/// seen, instead of binding one variable per seen id in a single `NOT IN (...)` list.
fn prune_unseen_artifacts(cat: &Catalog, abs_root: &Path, seen_ids: &[String]) -> Result<usize> {
    // The prefix is passed as a bound parameter, so it needs LIKE-escaping only —
    // SQL-quoting it (doubling `'`) would make the pattern miss real paths.
    let root_prefix = format!("{}/", crate::util::fs::RepoPath::from(abs_root).as_str());
    let pattern = like_prefix_pattern(&root_prefix);
    let seen: std::collections::HashSet<&str> = seen_ids.iter().map(String::as_str).collect();

    let stale: Vec<String> = {
        let mut select = cat
            .conn
            .prepare("SELECT id FROM artifact WHERE abs_path LIKE ?1 ESCAPE '\\'")?;
        let rows = select.query_map(rusqlite::params![pattern], |r| r.get::<_, String>(0))?;
        let mut stale = Vec::new();
        for id in rows {
            let id = id?;
            if !seen.contains(id.as_str()) {
                stale.push(id);
            }
        }
        stale
    };

    let mut delete = cat.conn.prepare("DELETE FROM artifact WHERE id = ?1")?;
    let mut removed = 0;
    for id in &stale {
        removed += delete.execute(rusqlite::params![id])?;
    }
    Ok(removed)
}

/// Walks `abs_root` for `.md` files, upserts one catalog row per file, and prunes rows
/// for files that no longer exist.
///
/// # Parameters
/// - `cat`: catalog to read existing rows from and write to.
/// - `rules`: classification rules matched against each file's root-relative path;
///   frontmatter values take precedence over rule values.
/// - `abs_root`: directory to walk. If it is a linked git worktree, nothing is indexed
///   and an empty report is returned.
/// - `ignore`: globs matched against the normalized root-relative path; matches are skipped.
/// - `want_embeddings`: when `true`, every new or content-changed file contributes one
///   entry (its first 512-sized markdown chunk) to the returned embed queue.
/// - `force_rewalk`: when `true`, files are re-upserted even if content hash and
///   derived metadata are unchanged. Embeddings are still only queued on content change.
///
/// # Returns
/// The report (with `embedded` left at 0) and the embed queue.
///
/// # Errors
/// Propagates file read / metadata errors, a failed `strip_prefix`, and catalog errors.
/// Directory-walk errors for individual entries are skipped, not reported.
pub fn index_repo_sync(
    cat: &Catalog,
    rules: &[CompiledRule],
    abs_root: &Path,
    ignore: &globset::GlobSet,
    want_embeddings: bool,
    force_rewalk: bool,
) -> Result<(IndexReport, Vec<EmbedQueueItem>)> {
    let mut report = IndexReport::default();
    if crate::librarian::current_project::is_linked_worktree(abs_root) {
        tracing::warn!(
            "skipping index of linked git worktree {} — index its main worktree instead",
            abs_root.display()
        );
        return Ok((report, Vec::new()));
    }

    let mut seen_ids: Vec<String> = Vec::new();
    let mut embed_queue: Vec<EmbedQueueItem> = Vec::new();
    let walker = WalkBuilder::new(abs_root).standard_filters(true).build();
    for entry in walker.flatten() {
        let path = entry.path();
        if !path.is_file() || path.extension().and_then(|s| s.to_str()) != Some("md") {
            continue;
        }
        let rel = crate::librarian::util::normalize_rel_path(
            &path.strip_prefix(abs_root)?.to_string_lossy(),
        );
        if ignore.is_match(&rel) {
            continue;
        }

        let id = crate::librarian::ids::artifact_id_from_abs(path);
        let bytes = std::fs::read(path)?;
        let content = String::from_utf8_lossy(&bytes);
        // Hash the raw bytes, not the lossy string, so the digest reflects the file as stored.
        let sha = {
            let mut h = Sha256::new();
            h.update(&bytes);
            format!("{:x}", h.finalize())
        };
        let mtime = path
            .metadata()?
            .modified()?
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as i64;
        let existing = artifact::get(cat, &id)?;

        // NOTE(review): when frontmatter parsing fails the body falls back to "", so the
        // H1-derived title and the embed text are both empty — confirm this is intended.
        let (fm, body) = frontmatter::parse(&content).unwrap_or((None, ""));
        let rule_match = classify(rules, &rel);

        // Precedence for each field: frontmatter, then matching rule, then a default.
        let kind = fm
            .as_ref()
            .and_then(|f| f.kind.clone())
            .or_else(|| rule_match.as_ref().map(|r| r.kind.clone()))
            .unwrap_or_else(|| "unknown".into());
        let status = fm
            .as_ref()
            .and_then(|f| f.status.clone())
            .or_else(|| rule_match.as_ref().and_then(|r| r.status.clone()))
            .unwrap_or_else(|| {
                if kind == "unknown" {
                    "unknown".into()
                } else {
                    "draft".into()
                }
            });
        let time_scope = fm
            .as_ref()
            .and_then(|f| f.time_scope.clone())
            .or_else(|| rule_match.as_ref().and_then(|r| r.time_scope.clone()));
        // Full confidence only when the file itself declares its kind.
        let confidence = if fm.as_ref().and_then(|f| f.kind.as_ref()).is_some() {
            1.0
        } else {
            0.5
        };
        let title = fm
            .as_ref()
            .and_then(|f| f.title.clone())
            .or_else(|| first_h1(body));
        let owners = fm.as_ref().map(|f| f.owners.clone()).unwrap_or_default();
        // Frontmatter tags come first; rule tags are appended if not already present.
        let mut tags = fm.as_ref().map(|f| f.tags.clone()).unwrap_or_default();
        if let Some(rm) = rule_match.as_ref() {
            for t in &rm.tags {
                if !tags.contains(t) {
                    tags.push(t.clone());
                }
            }
        }
        let topic = fm.as_ref().and_then(|f| f.topic.clone());

        let content_unchanged = existing
            .as_ref()
            .map(|ex| ex.file_sha256 == sha)
            .unwrap_or(false);
        // Metadata is compared separately so that a rule change reclassifies a file
        // whose bytes did not change.
        let meta_unchanged = existing
            .as_ref()
            .map(|ex| {
                ex.kind == kind
                    && ex.status == status
                    && ex.time_scope == time_scope
                    && ex.title == title
                    && ex.owners == owners
                    && ex.tags == tags
                    && ex.topic == topic
                    && (ex.confidence - confidence).abs() < f32::EPSILON as f64
            })
            .unwrap_or(false);
        if !force_rewalk && content_unchanged && meta_unchanged {
            seen_ids.push(id);
            report.unchanged += 1;
            continue;
        }

        let now = chrono::Utc::now().timestamp_millis();
        let row = ArtifactRow {
            id: id.clone(),
            abs_path: path.to_path_buf(),
            kind: kind.clone(),
            status,
            title: title.clone(),
            owners,
            tags,
            topic,
            time_scope,
            source: Some("repo".into()),
            created_at: existing.as_ref().map(|ex| ex.created_at).unwrap_or(now),
            updated_at: now,
            file_mtime: mtime,
            file_sha256: sha,
            confidence,
        };
        artifact::upsert(cat, &row)?;

        if want_embeddings && !content_unchanged {
            let chunks = codescout_embed::chunk_markdown(body, 512);
            let first_chunk = chunks
                .into_iter()
                .next()
                .unwrap_or_else(|| body.to_string());
            embed_queue.push((id.clone(), title, first_chunk));
        }
        seen_ids.push(id.clone());
        if existing.is_some() {
            report.updated += 1;
        } else {
            report.added += 1;
        }
        if kind == "unknown" {
            report.unknown_ids.push(id);
        }
    }

    report.removed = prune_unseen_artifacts(cat, abs_root, &seen_ids)?;
    Ok((report, embed_queue))
}
/// Replaces the `artifact_vec` row of every `(id, vector)` pair; no validation is done here.
///
/// Each vector is stored as the concatenation of its `f32` values in little-endian order.
fn replace_embedding_rows(cat: &Catalog, embeddings: &[(String, Vec<f32>)]) -> Result<()> {
    let mut delete = cat
        .conn
        .prepare("DELETE FROM artifact_vec WHERE id = ?1")?;
    let mut insert = cat
        .conn
        .prepare("INSERT INTO artifact_vec (id, embedding) VALUES (?1, ?2)")?;
    for (id, vec) in embeddings {
        let blob: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
        delete.execute(rusqlite::params![id])?;
        insert.execute(rusqlite::params![id, blob])?;
    }
    Ok(())
}

/// Persists a batch of embeddings into `artifact_vec`, replacing any existing row per id.
///
/// Before writing, the batch is validated:
/// - an empty batch is a no-op;
/// - the first vector must be non-empty;
/// - every vector must have the same length as the first;
/// - if `artifact_vec` already holds a row, the batch dimension must equal that row's
///   dimension (its blob length divided by 4 bytes per `f32`).
///
/// The delete-then-insert writes run inside a SQLite savepoint, so either the whole
/// batch is applied or — on any write error — none of it is, and a failed insert can
/// no longer leave an id with its previous embedding deleted. A savepoint is used
/// rather than `BEGIN` so the function also works when the caller already has a
/// transaction open.
///
/// # Errors
/// Returns an error describing the mismatch when validation fails, or the underlying
/// database error when a read or write fails.
pub fn write_embeddings(cat: &Catalog, embeddings: &[(String, Vec<f32>)]) -> Result<()> {
    use rusqlite::OptionalExtension;

    let Some((_, first)) = embeddings.first() else {
        return Ok(());
    };
    let batch_dim = first.len();
    if batch_dim == 0 {
        anyhow::bail!(
            "embedding dim is 0 — embedder produced an empty vector. \
             Likely an embedder misconfiguration or error sentinel returned by \
             the backend. Inspect the embedder service before retrying."
        );
    }
    for (id, vec) in embeddings {
        if vec.len() != batch_dim {
            anyhow::bail!(
                "embedding dim mismatch within batch: id={} expected {} got {}. \
                 Inspect the embedder service — all embeddings in one batch must share \
                 the same dimensionality.",
                id,
                batch_dim,
                vec.len()
            );
        }
    }

    let existing_blob_len: Option<i64> = cat
        .conn
        .query_row(
            "SELECT length(embedding) FROM artifact_vec LIMIT 1",
            [],
            |r| r.get(0),
        )
        .optional()?;
    if let Some(blob_len) = existing_blob_len {
        // Blobs hold raw f32 values, 4 bytes each.
        let existing_dim = (blob_len / 4) as usize;
        if batch_dim != existing_dim {
            anyhow::bail!(
                "embedding dim mismatch vs catalog: batch={}, existing={}. \
                 Likely causes: (1) embedder is misconfigured and returns error \
                 sentinels with wrong dim (the F-6b case — vec.len()=1), (2) the \
                 configured embedder model changed without a full re-embed pipeline. \
                 To rebuild with a new model, drop `artifact_vec` rows explicitly \
                 first; do NOT use `reindex(force=true)` (bug-tracker #6/#7).",
                batch_dim,
                existing_dim
            );
        }
    }

    cat.conn.execute_batch("SAVEPOINT write_embeddings")?;
    let written = replace_embedding_rows(cat, embeddings);
    if written.is_ok() {
        cat.conn.execute_batch("RELEASE write_embeddings")?;
    } else {
        // Best effort: the write error is what the caller needs to see, so a
        // failure to roll back is not allowed to mask it.
        let _ = cat
            .conn
            .execute_batch("ROLLBACK TO write_embeddings; RELEASE write_embeddings");
    }
    written
}
use futures::stream::{self, StreamExt};
const EMBED_CONCURRENCY: usize = 8;
pub async fn index_repo(
cat: &Catalog,
rules: &[CompiledRule],
abs_root: &Path,
ignore: &globset::GlobSet,
embedding: Option<&crate::librarian::embedding::EmbeddingService>,
store: Option<&dyn crate::librarian::artifact_store::ArtifactVectorStore>,
project_id: &str,
) -> Result<IndexReport> {
let want = embedding.is_some();
let (mut report, embed_queue) = index_repo_sync(cat, rules, abs_root, ignore, want, false)?;
if let Some(svc) = embedding {
let futures_iter = embed_queue
.into_iter()
.map(|(id, title, chunk_text)| async move {
let vec = svc.embed_artifact(title.as_deref(), &chunk_text).await?;
anyhow::Ok((id, vec))
});
let mut stream = stream::iter(futures_iter).buffer_unordered(EMBED_CONCURRENCY);
let mut batch: Vec<(String, Vec<f32>)> = Vec::with_capacity(100);
while let Some(res) = stream.next().await {
batch.push(res?);
if batch.len() >= 100 {
if let Some(s) = store {
for (id, vec) in &batch {
s.upsert(project_id, id, vec).await?;
}
} else {
write_embeddings(cat, &batch)?;
}
report.embedded += batch.len();
batch.clear();
}
}
if !batch.is_empty() {
report.embedded += batch.len();
if let Some(s) = store {
for (id, vec) in &batch {
s.upsert(project_id, id, vec).await?;
}
} else {
write_embeddings(cat, &batch)?;
}
}
}
Ok(report)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::librarian::classify::load_rules;
    use crate::librarian::embedding::EmbeddingService;
    use crate::librarian::ids::artifact_id_from_abs;
    use async_trait::async_trait;
    use codescout_embed::{Embedder, Embedding};
    use std::path::PathBuf;
    use std::sync::Arc;

    /// Rule classifying everything directly under a `docs/specs/` directory as a spec.
    const SPEC_RULE: &str = "[[rule]]\nglob = \"**/docs/specs/*.md\"\nkind = \"spec\"\n";
    /// Rule classifying every markdown file as a doc.
    const DOC_RULE: &str = "[[rule]]\nglob = \"**/*.md\"\nkind = \"doc\"\n";

    /// Embedder stub that returns a constant 768-dimensional vector for every input.
    struct MockEmbedder;

    #[async_trait]
    impl Embedder for MockEmbedder {
        fn dimensions(&self) -> usize {
            768
        }

        async fn embed(&self, texts: &[&str]) -> anyhow::Result<Vec<Embedding>> {
            Ok(texts.iter().map(|_| vec![0.1f32; 768]).collect())
        }
    }

    /// Runs `index_repo_sync` without `force_rewalk` and unwraps the result.
    fn run_sync(
        cat: &Catalog,
        rules: &[CompiledRule],
        root: &Path,
        ignore: &globset::GlobSet,
        want_embeddings: bool,
    ) -> (IndexReport, Vec<EmbedQueueItem>) {
        index_repo_sync(cat, rules, root, ignore, want_embeddings, false).unwrap()
    }

    /// Writes `contents` to `root/rel`, creating parent directories as needed.
    fn write_file(root: &Path, rel: &str, contents: &str) {
        let target = root.join(rel);
        std::fs::create_dir_all(target.parent().unwrap()).unwrap();
        std::fs::write(target, contents).unwrap();
    }

    /// Runs a parameterless single-integer query (e.g. a row count).
    fn count(cat: &Catalog, sql: &str) -> i64 {
        cat.conn.query_row(sql, [], |row| row.get(0)).unwrap()
    }

    #[test]
    fn indexes_fixture_repo_with_mixed_classifications() {
        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(
            r#"
[[rule]]
glob = "**/docs/superpowers/specs/*.md"
kind = "spec"
status = "active"
[[rule]]
glob = "**/docs/research/*.md"
kind = "memory"
"#,
        )
        .unwrap();
        let no_ignores = globset::GlobSet::empty();
        let fixture_root =
            PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/librarian/fixtures/repo_a");

        let (first_pass, _) = run_sync(&cat, &rules, &fixture_root, &no_ignores, false);
        assert_eq!(first_pass.added, 3, "should index 3 .md files");
        assert_eq!(first_pass.unknown_ids.len(), 1, "README.md is unknown");

        let (second_pass, _) = run_sync(&cat, &rules, &fixture_root, &no_ignores, false);
        assert_eq!(second_pass.unchanged, 3);
        assert_eq!(second_pass.added, 0);
    }

    #[test]
    fn index_repo_sync_skips_linked_worktree() {
        let tmp = tempfile::TempDir::new().unwrap();
        let worktree = tmp.path().join("wt");
        // A `.git` *file* pointing into another repo's `worktrees/` dir marks a linked worktree.
        let gitdir_pointer = format!(
            "gitdir: {}/main/.git/worktrees/feat\n",
            tmp.path().display()
        );
        write_file(&worktree, ".git", &gitdir_pointer);
        write_file(&worktree, "docs/a.md", "# a\n");

        let cat = Catalog::open_in_memory().unwrap();
        let no_rules: Vec<CompiledRule> = Vec::new();
        let no_ignores = globset::GlobSet::empty();
        let (report, queue) = run_sync(&cat, &no_rules, &worktree, &no_ignores, false);

        assert_eq!(report.added, 0, "a linked worktree must not be indexed");
        assert!(queue.is_empty());
        let rows = count(&cat, "SELECT COUNT(*) FROM artifact");
        assert_eq!(rows, 0, "no artifact rows created for the worktree");
    }

    #[test]
    fn index_removes_deleted_files() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        write_file(root, "docs/specs/a.md", "# a\n");
        write_file(root, "docs/specs/b.md", "# b\n");

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(SPEC_RULE).unwrap();
        let no_ignores = globset::GlobSet::empty();

        let (initial, _) = run_sync(&cat, &rules, root, &no_ignores, false);
        assert_eq!(initial.added, 2);

        std::fs::remove_file(root.join("docs/specs/b.md")).unwrap();
        let (after_delete, _) = run_sync(&cat, &rules, root, &no_ignores, false);
        assert_eq!(after_delete.removed, 1);
    }

    #[test]
    fn reindex_refreshes_stale_metadata() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        let doc = root.join("docs/specs/a.md");
        write_file(root, "docs/specs/a.md", "---\ntitle: Original\n---\nbody\n");

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(SPEC_RULE).unwrap();
        let no_ignores = globset::GlobSet::empty();
        run_sync(&cat, &rules, root, &no_ignores, false);

        let id = artifact_id_from_abs(&doc);
        let indexed = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(indexed.title.as_deref(), Some("Original"));

        // Changing the file alone must not touch the catalog.
        std::fs::write(&doc, "---\ntitle: Updated\n---\nbody\n").unwrap();
        let not_yet_reindexed = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(
            not_yet_reindexed.title.as_deref(),
            Some("Original"),
            "must be stale before reindex"
        );

        run_sync(&cat, &rules, root, &no_ignores, false);
        let reindexed = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(reindexed.title.as_deref(), Some("Updated"));
    }

    #[tokio::test]
    async fn embeds_artifact_into_vec_table() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        write_file(
            root,
            "docs/specs/a.md",
            "---\ntitle: Test\n---\n# Body\n\nSome content.\n",
        );

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(SPEC_RULE).unwrap();
        let no_ignores = globset::GlobSet::empty();
        let svc = EmbeddingService::new(Arc::new(MockEmbedder));

        let (report, queue) = run_sync(&cat, &rules, root, &no_ignores, true);
        assert_eq!(report.added, 1);

        let mut vectors: Vec<(String, Vec<f32>)> = Vec::new();
        for (id, title, chunk_text) in &queue {
            let embedded = svc
                .embed_artifact(title.as_deref(), chunk_text)
                .await
                .unwrap();
            vectors.push((id.clone(), embedded));
        }
        write_embeddings(&cat, &vectors).unwrap();

        let stored = count(&cat, "SELECT count(*) FROM artifact_vec");
        assert_eq!(stored, 1, "embedding should be written to artifact_vec");
    }

    #[test]
    fn rule_change_reclassifies_existing_rows_without_content_change() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        let tracker = root.join("docs/trackers/foo.md");
        write_file(root, "docs/trackers/foo.md", "# Foo\nbody\n");

        let cat = Catalog::open_in_memory().unwrap();
        let no_ignores = globset::GlobSet::empty();
        let id = artifact_id_from_abs(&tracker);

        // First pass: no rule matches, so the file is unclassified.
        let empty_rules = load_rules("").unwrap();
        run_sync(&cat, &empty_rules, root, &no_ignores, false);
        let unclassified = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(unclassified.kind, "unknown");
        assert_eq!(unclassified.status, "unknown");
        let still_unclassified = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(
            still_unclassified.kind, "unknown",
            "must be stale before reindex"
        );

        // Second pass: same bytes on disk, but a rule now matches the path.
        let tracker_rules = load_rules(
            "[[rule]]\nglob = \"**/docs/trackers/*.md\"\nkind = \"tracker\"\nstatus = \"active\"\n",
        )
        .unwrap();
        run_sync(&cat, &tracker_rules, root, &no_ignores, false);
        let reclassified = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(reclassified.kind, "tracker");
        assert_eq!(reclassified.status, "active");
    }

    #[test]
    fn write_embeddings_is_idempotent_on_same_id() {
        let cat = Catalog::open_in_memory().unwrap();
        let id = "r:docs/a.md".to_string();
        let stamp = chrono::Utc::now().timestamp_millis();
        artifact::upsert(
            &cat,
            &ArtifactRow {
                id: "r:docs/a.md".into(),
                abs_path: PathBuf::from("/test/r/docs/a.md"),
                kind: "spec".into(),
                status: "draft".into(),
                title: None,
                owners: vec![],
                tags: vec![],
                topic: None,
                time_scope: None,
                source: None,
                created_at: stamp,
                updated_at: stamp,
                file_mtime: stamp,
                file_sha256: "deadbeef".into(),
                confidence: 1.0,
            },
        )
        .unwrap();

        for fill in [0.1f32, 0.2f32] {
            write_embeddings(&cat, &[(id.clone(), vec![fill; 768])]).unwrap();
        }

        let stored = count(&cat, "SELECT count(*) FROM artifact_vec");
        assert_eq!(stored, 1, "second write must replace, not duplicate");
    }

    #[test]
    fn ignore_globs_skip_matching_files() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        write_file(root, "docs/specs/a.md", "# a\n");
        write_file(root, "tests/fixtures/b.md", "# fixture\n");

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(DOC_RULE).unwrap();
        let ignore =
            crate::librarian::workspace::compile_ignore(&["**/tests/fixtures/**".to_string()])
                .unwrap();

        let (report, _) = run_sync(&cat, &rules, root, &ignore, false);
        assert_eq!(report.added, 1, "fixture file must be skipped by ignore glob");
    }

    #[test]
    fn first_h1_extracts_title() {
        let title = first_h1("# Hello\n\nbody text");
        assert_eq!(title, Some("Hello".to_string()));
    }

    #[test]
    fn first_h1_skips_blank_and_code_fences() {
        let title = first_h1("\n```\n# not a header\n```\n\n# Real\n\nbody");
        assert_eq!(title, Some("Real".to_string()));
    }

    #[test]
    fn first_h1_none_when_missing() {
        for body in ["## Only H2\n\nno h1 here", ""] {
            assert_eq!(first_h1(body), None);
        }
    }

    #[test]
    fn first_h1_extracts_setext_heading() {
        let title = first_h1("Setext Title\n===========\n\nbody");
        assert_eq!(title, Some("Setext Title".into()));
    }

    #[test]
    fn first_h1_ignores_h1_inside_code_fence() {
        let title = first_h1("```\n# not a heading\n```\n\n# Real Heading\n");
        assert_eq!(title, Some("Real Heading".into()));
    }

    #[test]
    fn index_derives_title_from_h1_when_no_frontmatter() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        let page = root.join("docs/page.md");
        write_file(root, "docs/page.md", "# Title X\n\nSome body text.\n");

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(DOC_RULE).unwrap();
        let no_ignores = globset::GlobSet::empty();

        let (report, _) = run_sync(&cat, &rules, root, &no_ignores, false);
        assert_eq!(report.added, 1);

        let id = artifact_id_from_abs(&page);
        let row = artifact::get(&cat, &id).unwrap().unwrap();
        assert_eq!(row.title.as_deref(), Some("Title X"));
    }

    #[test]
    fn index_unions_rule_tags_with_frontmatter_tags() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        let plain = root.join("src/tools/render_prompt.md");
        let with_frontmatter = root.join("src/tools/with_fm.md");
        write_file(
            root,
            "src/tools/render_prompt.md",
            "# Render Prompt\n\nbody\n",
        );
        write_file(
            root,
            "src/tools/with_fm.md",
            "---\nkind: doc\ntags:\n - manual\n---\n\n# With FM\n",
        );

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(
            "[[rule]]\nglob = \"src/**/*.md\"\nkind = \"doc\"\ntags = [\"codescout\"]\n",
        )
        .unwrap();
        let no_ignores = globset::GlobSet::empty();

        let (report, _) = run_sync(&cat, &rules, root, &no_ignores, false);
        assert_eq!(report.added, 2);

        let plain_id = artifact_id_from_abs(&plain);
        let plain_row = artifact::get(&cat, &plain_id).unwrap().unwrap();
        assert_eq!(
            plain_row.kind, "doc",
            "rule rescues the file from kind=unknown"
        );
        assert_eq!(plain_row.tags, vec!["codescout".to_string()]);

        let fm_id = artifact_id_from_abs(&with_frontmatter);
        let fm_row = artifact::get(&cat, &fm_id).unwrap().unwrap();
        assert_eq!(
            fm_row.tags,
            vec!["manual".to_string(), "codescout".to_string()]
        );
    }

    #[test]
    fn removed_file_also_removes_embedding_row() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        let kept = root.join("docs/specs/a.md");
        let doomed = root.join("docs/specs/b.md");
        write_file(root, "docs/specs/a.md", "# a\n");
        write_file(root, "docs/specs/b.md", "# b\n");

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(SPEC_RULE).unwrap();
        let no_ignores = globset::GlobSet::empty();
        run_sync(&cat, &rules, root, &no_ignores, false);

        let kept_id = artifact_id_from_abs(&kept);
        let doomed_id = artifact_id_from_abs(&doomed);

        // Seed one all-zero 768-dim embedding per artifact directly in the vec table.
        let zero_blob: Vec<u8> = std::iter::repeat_n(0f32, 768)
            .flat_map(|f: f32| f.to_le_bytes())
            .collect();
        let seed_embedding = |id: &String| {
            cat.conn
                .execute(
                    "INSERT INTO artifact_vec (id, embedding) VALUES (?, ?)",
                    rusqlite::params![id, zero_blob],
                )
                .unwrap();
        };
        seed_embedding(&kept_id);
        seed_embedding(&doomed_id);

        std::fs::remove_file(&doomed).unwrap();
        run_sync(&cat, &rules, root, &no_ignores, false);

        let embeddings_for = |id: &String| -> i64 {
            cat.conn
                .query_row(
                    "SELECT count(*) FROM artifact_vec WHERE id = ?",
                    rusqlite::params![id],
                    |r| r.get(0),
                )
                .unwrap()
        };
        assert_eq!(
            embeddings_for(&doomed_id),
            0,
            "trigger must cascade to artifact_vec"
        );
        assert_eq!(
            embeddings_for(&kept_id),
            1,
            "surviving file keeps embedding"
        );
    }

    #[tokio::test]
    async fn concurrent_embed_queue_completes_all() {
        let tmp = tempfile::TempDir::new().unwrap();
        let root = tmp.path();
        for i in 0..16u32 {
            write_file(
                root,
                &format!("docs/specs/{i}.md"),
                &format!("---\ntitle: File {i}\n---\n# File {i}\n\nContent {i}.\n"),
            );
        }

        let cat = Catalog::open_in_memory().unwrap();
        let rules = load_rules(SPEC_RULE).unwrap();
        let no_ignores = globset::GlobSet::empty();
        let svc = EmbeddingService::new(Arc::new(MockEmbedder));

        let report = index_repo(&cat, &rules, root, &no_ignores, Some(&svc), None, "")
            .await
            .unwrap();
        assert_eq!(report.added, 16);
        assert_eq!(report.embedded, 16);

        let stored = count(&cat, "SELECT count(*) FROM artifact_vec");
        assert_eq!(
            stored, 16,
            "all 16 embeddings must be written via buffer_unordered"
        );
    }
}