use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::time::UNIX_EPOCH;
use soma_studio_core::{
AppConfig, NotebookEmbeddingItem, NotebookEmbeddingResponse, NotebookRetrievalResponse,
NotebookRetrievalResult, SearchProfile, SearchQuery, SourceRootSummary, lexical_score,
search_snippet,
};
/// One regular file discovered while scanning a source root, together with its
/// indexing state.
#[derive(Debug, Clone)]
pub struct ScannedSourceFile {
/// Identifier of the source root the file was found under.
pub source_root_id: String,
/// Path relative to the source root, lossily converted to UTF-8 with `\` replaced by `/`.
pub relative_path: String,
/// Full path of the file as produced by the directory walk (lossy UTF-8).
pub absolute_path: String,
/// Change-detection key formatted as `"{size_bytes}:{modified_ms}"`.
pub fingerprint: String,
/// File length in bytes as reported by the file metadata.
pub size_bytes: i64,
/// Modification time in milliseconds since the Unix epoch, as a decimal string
/// (`"0"` when the timestamp is unavailable).
pub modified_at: String,
/// Extension-based category: `"text"`, `"document"`, `"image"`, `"other:<ext>"` or `"other"`.
pub file_type: String,
/// Indexing state: `"queued"` after a scan; `materialize_text_artifacts` sets
/// `"indexed"` or `"error"`.
pub status: String,
/// Message describing the most recent failure, if any.
pub last_error: Option<String>,
}
/// Walks the directory behind `root` and returns every regular file found in it,
/// ordered by relative path.
///
/// # Errors
///
/// Returns an error when `root.path` is not a directory or when any directory
/// entry below it cannot be read.
pub fn scan_source_root(root: &SourceRootSummary) -> Result<Vec<ScannedSourceFile>, String> {
    let base = Path::new(&root.path);
    if !base.is_dir() {
        return Err(format!("source root must be a directory: {}", root.path));
    }

    let root_id = root.id.to_string();
    let mut found = Vec::new();
    collect_files(&root_id, base, base, &mut found)?;

    // Sort so callers see a deterministic order regardless of directory iteration order.
    found.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
    Ok(found)
}
/// Writes the normalized-text and chunk artifacts for every `"text"` file in `files`
/// and returns a copy of `files` with `status` / `last_error` updated.
///
/// The derived text and chunk directories of the *first* file's source root are
/// removed up front, so artifacts of files that disappeared from that root do not
/// linger. Files whose `file_type` is not `"text"` are returned untouched.
///
/// A file whose source cannot be read is marked `"error"` (with `last_error` set)
/// and skipped; successfully processed files are marked `"indexed"`.
///
/// # Errors
///
/// Returns an error when the derived directories cannot be cleared or created, or
/// when an artifact cannot be encoded or written.
pub fn materialize_text_artifacts(
    config: &AppConfig,
    files: &[ScannedSourceFile],
) -> Result<Vec<ScannedSourceFile>, String> {
    let mut updated = files.to_vec();

    if let Some(first) = files.first() {
        for root_dir in [
            source_root_text_dir(config, &first.source_root_id),
            source_root_chunk_dir(config, &first.source_root_id),
        ] {
            if root_dir.exists() {
                std::fs::remove_dir_all(&root_dir).map_err(|error| {
                    format!("failed to clear source root derived artifacts: {error}")
                })?;
            }
        }
    }

    for file in &mut updated {
        if file.file_type != "text" {
            continue;
        }
        let content = match std::fs::read_to_string(&file.absolute_path) {
            Ok(content) => content,
            Err(error) => {
                // An unreadable source only fails this file, not the whole run.
                file.status = "error".to_string();
                file.last_error = Some(format!("failed to read source text: {error}"));
                continue;
            }
        };
        let normalized = normalize_source_text(&file.absolute_path, &content);
        let relative = Path::new(&file.relative_path);

        let artifact = source_root_text_artifact_path(config, &file.source_root_id, relative);
        if let Some(parent) = artifact.parent() {
            std::fs::create_dir_all(parent)
                .map_err(|error| format!("failed to create source root text directory: {error}"))?;
        }
        std::fs::write(&artifact, &normalized)
            .map_err(|error| format!("failed to write source root text artifact: {error}"))?;

        // Chunk the text we already hold instead of reading the artifact back from
        // disk: the bytes are identical, and this avoids an extra read plus a
        // failure path that would abort the whole run.
        let chunks = chunk_text(&normalized);

        let chunk_artifact =
            source_root_chunk_artifact_path(config, &file.source_root_id, relative);
        if let Some(parent) = chunk_artifact.parent() {
            std::fs::create_dir_all(parent).map_err(|error| {
                format!("failed to create source root chunk directory: {error}")
            })?;
        }
        let records = chunks
            .iter()
            .enumerate()
            .map(|(index, text)| serde_json::json!({ "index": index, "text": text }))
            .collect::<Vec<_>>();
        let encoded = serde_json::to_string_pretty(&records)
            .map_err(|error| format!("failed to encode source root chunks: {error}"))?;
        std::fs::write(&chunk_artifact, encoded)
            .map_err(|error| format!("failed to write source root chunk artifact: {error}"))?;

        file.status = "indexed".to_string();
        file.last_error = None;
    }
    Ok(updated)
}
/// Recursively appends every regular file below `current` to `files`.
///
/// Symbolic links are skipped (neither followed nor recorded), as are entries that
/// are neither directories nor regular files. `root` is the directory that
/// relative paths are computed against; `root_id` is copied into each record.
///
/// # Errors
///
/// Returns an error when a directory or entry cannot be read, when metadata is
/// unavailable, or when a path cannot be made relative to `root`.
fn collect_files(
    root_id: &str,
    root: &Path,
    current: &Path,
    files: &mut Vec<ScannedSourceFile>,
) -> Result<(), String> {
    let entries = std::fs::read_dir(current)
        .map_err(|error| format!("failed to read source root: {error}"))?;

    for entry in entries {
        let entry = entry.map_err(|error| format!("failed to read source entry: {error}"))?;
        let kind = entry
            .file_type()
            .map_err(|error| format!("failed to inspect source entry: {error}"))?;
        if kind.is_symlink() {
            continue;
        }

        let path = entry.path();
        if kind.is_dir() {
            collect_files(root_id, root, &path, files)?;
        } else if kind.is_file() {
            let metadata = entry
                .metadata()
                .map_err(|error| format!("failed to read source metadata: {error}"))?;

            let stripped = path
                .strip_prefix(root)
                .map_err(|error| format!("failed to relativize source file: {error}"))?;
            // Normalize Windows separators so relative paths look the same everywhere.
            let relative_path = stripped.to_string_lossy().replace('\\', "/");

            let size_bytes = metadata.len() as i64;
            // A missing or pre-epoch timestamp is recorded as 0.
            let modified_ms = match metadata.modified() {
                Ok(time) => time
                    .duration_since(UNIX_EPOCH)
                    .map_or(0, |elapsed| elapsed.as_millis()),
                Err(_) => 0,
            };

            files.push(ScannedSourceFile {
                source_root_id: root_id.to_string(),
                relative_path,
                absolute_path: path.to_string_lossy().to_string(),
                fingerprint: format!("{size_bytes}:{modified_ms}"),
                size_bytes,
                modified_at: modified_ms.to_string(),
                file_type: classify_file_type(&path),
                status: "queued".to_string(),
                last_error: None,
            });
        }
    }
    Ok(())
}
/// Maps a path to a coarse file category based on its lower-cased extension.
///
/// Returns `"text"`, `"document"` or `"image"` for the known extensions,
/// `"other:<ext>"` for any other extension, and `"other"` when the path has no
/// extension or the extension is not valid UTF-8.
fn classify_file_type(path: &Path) -> String {
    let Some(raw) = path.extension().and_then(|value| value.to_str()) else {
        return "other".to_string();
    };
    let ext = raw.to_lowercase();
    match ext.as_str() {
        "md" | "txt" | "typ" => "text".to_string(),
        "pdf" | "docx" => "document".to_string(),
        "png" | "jpg" | "jpeg" | "webp" => "image".to_string(),
        _ => format!("other:{ext}"),
    }
}
/// Directory under `derived_dir` that holds the normalized-text artifacts of one source root.
fn source_root_text_dir(config: &AppConfig, source_root_id: &str) -> PathBuf {
    let mut dir = config.derived_dir.join("source-root-text");
    dir.push(source_root_id);
    dir
}
/// Directory under `derived_dir` that holds the chunk artifacts of one source root.
fn source_root_chunk_dir(config: &AppConfig, source_root_id: &str) -> PathBuf {
    let mut dir = config.derived_dir.join("source-root-chunks");
    dir.push(source_root_id);
    dir
}
/// Location of the normalized-text artifact for the source file at `relative`.
///
/// The source extension is replaced by `txt`, so two sources that differ only in
/// their extension map to the same artifact path.
fn source_root_text_artifact_path(
    config: &AppConfig,
    source_root_id: &str,
    relative: &Path,
) -> PathBuf {
    source_root_text_dir(config, source_root_id)
        .join(relative)
        .with_extension("txt")
}
/// Location of the JSON chunk artifact for the source file at `relative`.
///
/// The source extension is replaced by `json`, so two sources that differ only in
/// their extension map to the same artifact path.
fn source_root_chunk_artifact_path(
    config: &AppConfig,
    source_root_id: &str,
    relative: &Path,
) -> PathBuf {
    source_root_chunk_dir(config, source_root_id)
        .join(relative)
        .with_extension("json")
}
/// Directory under `derived_dir` that holds the embedding artifacts of one source root.
fn source_root_embedding_dir(config: &AppConfig, source_root_id: &str) -> PathBuf {
    let mut dir = config.derived_dir.join("source-root-embeddings");
    dir.push(source_root_id);
    dir
}
/// Location of the JSON embedding artifact for the source file at `relative`.
///
/// The source extension is replaced by `json`, so two sources that differ only in
/// their extension map to the same artifact path.
fn source_root_embedding_artifact_path(
    config: &AppConfig,
    source_root_id: &str,
    relative: &Path,
) -> PathBuf {
    source_root_embedding_dir(config, source_root_id)
        .join(relative)
        .with_extension("json")
}
/// Lexical retrieval over `files` using the `SearchProfile::RagContext` profile.
///
/// Convenience wrapper around [`retrieve_source_root_text_with_profile`]; errors
/// are passed through unchanged.
pub fn retrieve_source_root_text(
    config: &AppConfig,
    files: &[crate::storage::IndexedSourceFileRow],
    query: &str,
) -> Result<NotebookRetrievalResponse, String> {
    let profile = SearchProfile::RagContext;
    retrieve_source_root_text_with_profile(config, files, query, profile)
}
pub fn retrieve_source_root_text_with_profile(
config: &AppConfig,
files: &[crate::storage::IndexedSourceFileRow],
query: &str,
profile: SearchProfile,
) -> Result<NotebookRetrievalResponse, String> {
let query = SearchQuery::new(query, profile);
if query.is_empty() {
return Ok(NotebookRetrievalResponse {
query: String::new(),
strategy: "none".to_string(),
results: Vec::new(),
});
}
let mut results = Vec::new();
for file in files {
let Some((chunk_artifact, chunks)) = load_or_build_source_root_chunks(config, file)? else {
continue;
};
for (chunk_index, chunk) in chunks.into_iter().enumerate() {
let score = lexical_score(&chunk, &query);
if score == 0 {
continue;
}
results.push(NotebookRetrievalResult {
path: format!("source-root/{}/{}", file.source_root_id, file.relative_path),
format: source_text_format(&file.relative_path),
chunk_path: chunk_artifact
.strip_prefix(&config.derived_dir)
.map(display_path)
.unwrap_or_else(|_| chunk_artifact.to_string_lossy().to_string()),
chunk_index,
score,
snippet: search_snippet(&chunk, 240),
provenance: format!(
"source=source-root/{}/{}; extractor=source-root-text-v1",
file.source_root_id, file.relative_path
),
});
}
}
results.sort_by(|left, right| {
right
.score
.cmp(&left.score)
.then_with(|| left.path.cmp(&right.path))
.then_with(|| left.chunk_index.cmp(&right.chunk_index))
});
results.truncate(query.limit);
Ok(NotebookRetrievalResponse {
query: query.raw,
strategy: "lexical".to_string(),
results,
})
}
/// The chunk texts of one source file, as returned by
/// `collect_source_root_embedding_inputs`.
#[derive(Debug, Clone)]
pub struct SourceRootEmbeddingInput {
/// Identifier of the source root the file belongs to.
pub source_root_id: String,
/// Path of the file relative to its source root.
pub relative_path: String,
/// Chunk texts in chunk order; never empty when produced by
/// `collect_source_root_embedding_inputs`.
pub chunks: Vec<String>,
}
/// Gathers the chunk texts of every file in `files`.
///
/// Files without loadable or buildable chunks, and files whose chunk list is
/// empty, are left out of the result.
///
/// # Errors
///
/// Propagates any error from loading or rebuilding a file's chunks.
pub fn collect_source_root_embedding_inputs(
    config: &AppConfig,
    files: &[crate::storage::IndexedSourceFileRow],
) -> Result<Vec<SourceRootEmbeddingInput>, String> {
    let mut inputs = Vec::new();
    for file in files {
        match load_or_build_source_root_chunks(config, file)? {
            Some((_, chunks)) if !chunks.is_empty() => inputs.push(SourceRootEmbeddingInput {
                source_root_id: file.source_root_id.clone(),
                relative_path: file.relative_path.clone(),
                chunks,
            }),
            _ => {}
        }
    }
    Ok(inputs)
}
pub fn write_source_root_embeddings(
config: &AppConfig,
source_root_id: &str,
relative_path: &str,
provider: &str,
model_id: &str,
vectors: &[Vec<f32>],
) -> Result<NotebookEmbeddingItem, String> {
std::fs::create_dir_all(&config.derived_dir)
.map_err(|error| format!("failed to create derived workspace: {error}"))?;
let relative = Path::new(relative_path);
let artifact = source_root_embedding_artifact_path(config, source_root_id, relative);
if let Some(parent) = artifact.parent() {
std::fs::create_dir_all(parent).map_err(|error| {
format!("failed to create source root embedding directory: {error}")
})?;
}
let payload = serde_json::json!({
"provider": provider,
"model_id": model_id,
"vectors": vectors,
});
std::fs::write(
&artifact,
serde_json::to_vec_pretty(&payload)
.map_err(|error| format!("failed to encode source root embeddings: {error}"))?,
)
.map_err(|error| format!("failed to write source root embeddings: {error}"))?;
Ok(NotebookEmbeddingItem {
path: format!("source-root/{source_root_id}/{relative_path}"),
format: source_text_format(relative_path),
embedding_path: artifact
.strip_prefix(&config.derived_dir)
.map(display_path)
.unwrap_or_else(|_| artifact.to_string_lossy().to_string()),
provider: provider.to_string(),
model_id: model_id.to_string(),
chunks: vectors.len(),
dimensions: vectors
.first()
.map(|vector| vector.len())
.unwrap_or_default(),
provenance: format!(
"source=source-root/{source_root_id}/{relative_path}; embedder={provider}/{model_id}; extractor=source-root-embedding-v1"
),
})
}
pub fn source_root_embedding_status(
config: &AppConfig,
files: &[crate::storage::IndexedSourceFileRow],
provider: &str,
model_id: &str,
) -> Result<NotebookEmbeddingResponse, String> {
let mut items = Vec::new();
for file in files {
let relative = Path::new(&file.relative_path);
let Some((chunk_artifact, _)) = load_current_source_root_chunks(config, file) else {
continue;
};
let artifact = source_root_embedding_artifact_path(config, &file.source_root_id, relative);
if !file_is_at_least_as_new(&artifact, &chunk_artifact) {
continue;
}
let Ok(payload) = std::fs::read_to_string(&artifact) else {
continue;
};
let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&payload) else {
continue;
};
if parsed.get("provider").and_then(|value| value.as_str()) != Some(provider)
|| parsed.get("model_id").and_then(|value| value.as_str()) != Some(model_id)
{
continue;
}
let vectors = parsed
.get("vectors")
.and_then(|value| value.as_array())
.cloned()
.unwrap_or_default();
let dimensions = vectors
.first()
.and_then(|value| value.as_array())
.map(|vector| vector.len())
.unwrap_or_default();
items.push(NotebookEmbeddingItem {
path: format!("source-root/{}/{}", file.source_root_id, file.relative_path),
format: source_text_format(&file.relative_path),
embedding_path: artifact
.strip_prefix(&config.derived_dir)
.map(display_path)
.unwrap_or_else(|_| artifact.to_string_lossy().to_string()),
provider: provider.to_string(),
model_id: model_id.to_string(),
chunks: vectors.len(),
dimensions,
provenance: format!(
"source=source-root/{}/{}; embedder={}/{}; extractor=source-root-embedding-v1",
file.source_root_id, file.relative_path, provider, model_id
),
});
}
Ok(NotebookEmbeddingResponse {
embedded: items.len(),
items,
})
}
pub fn retrieve_source_root_with_query_vector(
config: &AppConfig,
files: &[crate::storage::IndexedSourceFileRow],
provider: &str,
model_id: &str,
query_vector: &[f32],
) -> Result<NotebookRetrievalResponse, String> {
let mut results = Vec::new();
for file in files {
let relative = Path::new(&file.relative_path);
let Some((chunk_artifact, chunks)) = load_or_build_source_root_chunks(config, file)? else {
continue;
};
let embedding_artifact =
source_root_embedding_artifact_path(config, &file.source_root_id, relative);
if !file_is_at_least_as_new(&embedding_artifact, &chunk_artifact) {
continue;
}
let Ok(payload) = std::fs::read_to_string(&embedding_artifact) else {
continue;
};
let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&payload) else {
continue;
};
if parsed.get("provider").and_then(|value| value.as_str()) != Some(provider)
|| parsed.get("model_id").and_then(|value| value.as_str()) != Some(model_id)
{
continue;
}
let vectors = parsed
.get("vectors")
.and_then(|value| value.as_array())
.cloned()
.unwrap_or_default()
.into_iter()
.map(|vector| {
vector
.as_array()
.cloned()
.unwrap_or_default()
.into_iter()
.filter_map(|value| value.as_f64().map(|value| value as f32))
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
for (chunk_index, (chunk, vector)) in chunks.into_iter().zip(vectors).enumerate() {
let similarity = crate::notebook::cosine_similarity(query_vector, &vector);
if similarity <= 0.0 {
continue;
}
results.push(NotebookRetrievalResult {
path: format!("source-root/{}/{}", file.source_root_id, file.relative_path),
format: source_text_format(&file.relative_path),
chunk_path: chunk_artifact
.strip_prefix(&config.derived_dir)
.map(display_path)
.unwrap_or_else(|_| chunk_artifact.to_string_lossy().to_string()),
chunk_index,
score: (similarity * 1000.0).round().clamp(0.0, 1000.0) as usize,
snippet: chunk.chars().take(240).collect(),
provenance: format!(
"source=source-root/{}/{}; embedder={}/{}; extractor=source-root-embedding-v1",
file.source_root_id, file.relative_path, provider, model_id
),
});
}
}
results.sort_by(|left, right| {
right
.score
.cmp(&left.score)
.then_with(|| left.path.cmp(&right.path))
.then_with(|| left.chunk_index.cmp(&right.chunk_index))
});
results.truncate(8);
Ok(NotebookRetrievalResponse {
query: String::new(),
strategy: "semantic".to_string(),
results,
})
}
/// Picks the text normalizer for `content` from the lower-cased extension of `path`.
///
/// `md` goes through the Markdown normalizer, `typ` through the Typst normalizer,
/// and everything else is returned with surrounding whitespace trimmed.
fn normalize_source_text(path: &str, content: &str) -> String {
    let extension = Path::new(path)
        .extension()
        .and_then(OsStr::to_str)
        .map(str::to_lowercase);
    match extension.as_deref() {
        Some("md") => normalize_markdown_text(content),
        Some("typ") => normalize_typst_text(content),
        _ => content.trim().to_string(),
    }
}
/// Returns the chunk artifact path and chunk texts for `file`, rebuilding stale or
/// unreadable artifacts from the source.
///
/// Resolution order:
/// 1. If the chunk artifact is at least as new as the source and decodes, use it.
/// 2. If the source file no longer exists, return `Ok(None)`.
/// 3. Otherwise take the text artifact when it is at least as new as the source,
///    or re-normalize the source and rewrite the text artifact; then re-chunk and
///    rewrite the chunk artifact.
///
/// # Errors
///
/// Returns an error when the source or text artifact cannot be read, or when an
/// artifact or its directory cannot be written.
fn load_or_build_source_root_chunks(
    config: &AppConfig,
    file: &crate::storage::IndexedSourceFileRow,
) -> Result<Option<(PathBuf, Vec<String>)>, String> {
    let relative = Path::new(&file.relative_path);
    let source_path = Path::new(&file.absolute_path);
    let chunk_artifact = source_root_chunk_artifact_path(config, &file.source_root_id, relative);

    // Fast path: a fresh, decodable chunk artifact. A corrupt one falls through to a rebuild.
    let cached = if file_is_at_least_as_new(&chunk_artifact, source_path) {
        load_chunk_texts(&chunk_artifact).ok()
    } else {
        None
    };
    if let Some(chunks) = cached {
        return Ok(Some((chunk_artifact, chunks)));
    }

    if !source_path.exists() {
        return Ok(None);
    }

    let text_artifact = source_root_text_artifact_path(config, &file.source_root_id, relative);
    let text = if file_is_at_least_as_new(&text_artifact, source_path) {
        std::fs::read_to_string(&text_artifact)
            .map_err(|error| format!("failed to read source root text artifact: {error}"))?
    } else {
        let source = std::fs::read_to_string(source_path)
            .map_err(|error| format!("failed to read source text: {error}"))?;
        let normalized = normalize_source_text(&file.absolute_path, &source);
        if let Some(parent) = text_artifact.parent() {
            std::fs::create_dir_all(parent)
                .map_err(|error| format!("failed to create source root text directory: {error}"))?;
        }
        std::fs::write(&text_artifact, &normalized)
            .map_err(|error| format!("failed to write source root text artifact: {error}"))?;
        normalized
    };

    let chunks = chunk_text(&text);
    if let Some(parent) = chunk_artifact.parent() {
        std::fs::create_dir_all(parent)
            .map_err(|error| format!("failed to create source root chunk directory: {error}"))?;
    }
    let records = chunks
        .iter()
        .enumerate()
        .map(|(index, text)| serde_json::json!({ "index": index, "text": text }))
        .collect::<Vec<_>>();
    let encoded = serde_json::to_string_pretty(&records)
        .map_err(|error| format!("failed to encode source root chunks: {error}"))?;
    std::fs::write(&chunk_artifact, encoded)
        .map_err(|error| format!("failed to write source root chunk artifact: {error}"))?;

    Ok(Some((chunk_artifact, chunks)))
}
/// Returns the chunk artifact path and chunk texts for `file` only when the
/// artifact is at least as new as the source and decodes successfully.
///
/// Unlike `load_or_build_source_root_chunks`, this never rebuilds anything; a
/// stale, missing or corrupt artifact yields `None`.
fn load_current_source_root_chunks(
    config: &AppConfig,
    file: &crate::storage::IndexedSourceFileRow,
) -> Option<(PathBuf, Vec<String>)> {
    let chunk_artifact = source_root_chunk_artifact_path(
        config,
        &file.source_root_id,
        Path::new(&file.relative_path),
    );
    if !file_is_at_least_as_new(&chunk_artifact, Path::new(&file.absolute_path)) {
        return None;
    }
    let chunks = load_chunk_texts(&chunk_artifact).ok()?;
    Some((chunk_artifact, chunks))
}
/// Reduces Markdown to plain text lines suitable for indexing.
///
/// * Fenced code blocks are removed, including the fence lines. Both backtick
///   (```` ``` ````) and tilde (`~~~`) fences are recognised; a fence is closed only
///   by a line starting with the same marker that opened it, so a ```` ``` ```` line
///   inside a `~~~` block (or vice versa) is treated as code. Fence length and
///   info strings are not compared.
/// * Blank lines are dropped.
/// * Leading `#` characters, then leading `-`, `*` and `>` characters, are
///   stripped from each remaining line, and the line is trimmed.
///
/// The surviving lines are joined with a single `\n`, so the result never
/// contains blank lines.
fn normalize_markdown_text(content: &str) -> String {
    // `Some(marker)` while inside a fenced block opened with that marker character.
    let mut open_fence: Option<char> = None;
    let mut kept: Vec<&str> = Vec::new();

    for line in content.lines() {
        let trimmed = line.trim();
        let marker = if trimmed.starts_with("```") {
            Some('`')
        } else if trimmed.starts_with("~~~") {
            Some('~')
        } else {
            None
        };

        match (open_fence, marker) {
            // Opening fence line.
            (None, Some(fence)) => {
                open_fence = Some(fence);
                continue;
            }
            // Closing fence line: must use the marker that opened the block.
            (Some(open), Some(fence)) if open == fence => {
                open_fence = None;
                continue;
            }
            // Anything else inside a fence is code and is dropped.
            (Some(_), _) => continue,
            (None, None) => {}
        }

        let text = trimmed
            .trim_start_matches('#')
            .trim_start_matches(['-', '*', '>'])
            .trim();
        if !text.is_empty() {
            kept.push(text);
        }
    }
    kept.join("\n")
}
/// Reduces Typst markup to plain text lines suitable for indexing.
///
/// Blank lines and lines starting with `#set`, `#import` or `#include` are
/// dropped. From every other line the leading `=` characters and all `#`, `[`,
/// `]`, `*` and `_` characters are removed; lines that end up empty are dropped
/// as well. The surviving lines are joined with a single `\n`.
fn normalize_typst_text(content: &str) -> String {
    const DIRECTIVES: [&str; 3] = ["#set", "#import", "#include"];

    let mut kept: Vec<String> = Vec::new();
    for line in content.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() || DIRECTIVES.iter().any(|prefix| trimmed.starts_with(*prefix)) {
            continue;
        }
        let stripped: String = trimmed
            .trim_start_matches('=')
            .chars()
            .filter(|ch| !matches!(ch, '#' | '[' | ']' | '*' | '_'))
            .collect();
        let cleaned = stripped.trim();
        if !cleaned.is_empty() {
            kept.push(cleaned.to_string());
        }
    }
    kept.join("\n")
}
/// Splits `content` into chunks of at most `TARGET_CHARS` characters.
///
/// Paragraphs (separated by a blank line, i.e. `"\n\n"`) are packed greedily:
/// consecutive paragraphs are joined with `"\n\n"` until adding the next one would
/// exceed the target, at which point a new chunk is started.
///
/// A paragraph that is itself longer than the target is broken up further, first
/// on line boundaries (lines rejoined with `"\n"`) and, for a single line that is
/// still too long, at a fixed character count. Without this, text that contains no
/// blank lines — which is what the Markdown and Typst normalizers in this file
/// produce — would always end up as one chunk of unbounded size.
///
/// Whitespace-only paragraphs and lines are skipped, every chunk is trimmed, and
/// empty input yields an empty vector. Input whose paragraphs all fit within the
/// target is chunked exactly as it would be without the extra splitting.
fn chunk_text(content: &str) -> Vec<String> {
    const TARGET_CHARS: usize = 800;

    // Step 1: cut the content into units of at most TARGET_CHARS characters, each
    // paired with the separator that joins it to the preceding unit.
    let mut units: Vec<(&str, &str)> = Vec::new();
    for paragraph in content.split("\n\n") {
        let paragraph = paragraph.trim();
        if paragraph.is_empty() {
            continue;
        }
        if paragraph.chars().count() <= TARGET_CHARS {
            units.push(("\n\n", paragraph));
            continue;
        }

        // Oversized paragraph: fall back to lines, then to fixed-size pieces.
        let mut separator = "\n\n";
        for line in paragraph.split('\n') {
            let mut rest = line;
            while !rest.is_empty() {
                // Byte offset of the character just past the target (char-boundary safe).
                let cut = rest
                    .char_indices()
                    .nth(TARGET_CHARS)
                    .map_or(rest.len(), |(offset, _)| offset);
                let (piece, tail) = rest.split_at(cut);
                if !piece.trim().is_empty() {
                    units.push((separator, piece));
                    separator = "\n";
                }
                rest = tail;
            }
        }
    }

    // Step 2: pack units greedily into chunks.
    let mut chunks = Vec::new();
    let mut current = String::new();
    let mut current_chars = 0usize;
    for (separator, unit) in units {
        let unit_chars = unit.chars().count();
        // Separators are ASCII, so their byte length equals their char count.
        if !current.is_empty() && current_chars + separator.len() + unit_chars > TARGET_CHARS {
            chunks.push(current.trim().to_string());
            current.clear();
            current_chars = 0;
        }
        if !current.is_empty() {
            current.push_str(separator);
            current_chars += separator.len();
        }
        current.push_str(unit);
        current_chars += unit_chars;
    }
    if !current.trim().is_empty() {
        chunks.push(current.trim().to_string());
    }
    chunks
}
/// Reads a chunk artifact and returns the `"text"` value of each entry, in order.
///
/// The artifact must be a JSON array. Entries without a string `"text"` field are
/// skipped rather than treated as an error.
///
/// # Errors
///
/// Returns an error when the file cannot be read or is not a JSON array.
fn load_chunk_texts(path: &Path) -> Result<Vec<String>, String> {
    let raw = std::fs::read_to_string(path)
        .map_err(|error| format!("failed to read source root chunks: {error}"))?;
    let entries: Vec<serde_json::Value> = serde_json::from_str(&raw)
        .map_err(|error| format!("failed to decode source root chunks: {error}"))?;

    let mut texts = Vec::new();
    for entry in &entries {
        if let Some(text) = entry.get("text").and_then(|value| value.as_str()) {
            texts.push(text.to_string());
        }
    }
    Ok(texts)
}
/// Names the text format implied by the lower-cased extension of `path`:
/// `"markdown"` for `md`, `"typst"` for `typ`, and `"plaintext"` for anything else.
fn source_text_format(path: &str) -> String {
    let extension = Path::new(path)
        .extension()
        .and_then(OsStr::to_str)
        .map(str::to_lowercase);
    let label = match extension.as_deref() {
        Some("md") => "markdown",
        Some("typ") => "typst",
        _ => "plaintext",
    };
    label.to_string()
}
/// Renders `path` with `/` separators, keeping only its normal name components.
///
/// Root, prefix, `.` and `..` components are dropped, as are segments that are not
/// valid UTF-8.
fn display_path(path: &Path) -> String {
    let mut segments: Vec<&str> = Vec::new();
    for component in path.components() {
        if let std::path::Component::Normal(segment) = component {
            if let Some(text) = segment.to_str() {
                segments.push(text);
            }
        }
    }
    segments.join("/")
}
/// Reports whether `artifact` was modified at or after `source`.
///
/// Returns `false` when the modification time of either path cannot be read
/// (for example because the file does not exist). `source` is not inspected when
/// `artifact` is already unavailable.
fn file_is_at_least_as_new(artifact: &Path, source: &Path) -> bool {
    let modified_at = |path: &Path| path.metadata().and_then(|metadata| metadata.modified()).ok();
    modified_at(artifact)
        .and_then(|artifact_time| {
            modified_at(source).map(|source_time| artifact_time >= source_time)
        })
        .unwrap_or(false)
}
// Filesystem-backed tests for source-root materialization, freshness checks and
// retrieval. Each test works in its own uniquely named directory under the system
// temp dir and removes it at the end (cleanup is skipped if an assertion panics).
#[cfg(test)]
mod tests {
use super::{
collect_source_root_embedding_inputs, materialize_text_artifacts,
retrieve_source_root_text, retrieve_source_root_with_query_vector, scan_source_root,
source_root_embedding_status, write_source_root_embeddings,
};
use crate::storage::IndexedSourceFileRow;
use soma_studio_core::{AppConfig, SourceRootSummary};
use std::path::PathBuf;
use uuid::Uuid;
// Two files get orthogonal embeddings; the query vector is closest to alpha's, so
// alpha must be ranked first.
#[test]
fn source_root_semantic_retrieval_prefers_closest_embedding() {
let temp_dir = std::env::temp_dir().join(format!("soma-studio-ingest-{}", Uuid::new_v4()));
let source_dir = temp_dir.join("source");
std::fs::create_dir_all(&source_dir).expect("source dir");
std::fs::write(source_dir.join("alpha.md"), "# Alpha\n\nAlpha context").expect("alpha");
std::fs::write(source_dir.join("beta.md"), "# Beta\n\nBeta context").expect("beta");
let config = test_config(&temp_dir);
let root = test_root(&source_dir);
let indexed = indexed_rows(
&materialize_text_artifacts(&config, &scan_source_root(&root).expect("scan"))
.expect("materialize"),
);
write_source_root_embeddings(
&config,
&root.id.to_string(),
"alpha.md",
"ollama",
"embed-model",
&[vec![1.0, 0.0]],
)
.expect("embed alpha");
write_source_root_embeddings(
&config,
&root.id.to_string(),
"beta.md",
"ollama",
"embed-model",
&[vec![0.0, 1.0]],
)
.expect("embed beta");
let retrieval = retrieve_source_root_with_query_vector(
&config,
&indexed,
"ollama",
"embed-model",
&[0.9, 0.1],
)
.expect("retrieve");
assert_eq!(retrieval.strategy, "semantic");
assert_eq!(
retrieval.results[0].path,
format!("source-root/{}/alpha.md", root.id)
);
let _ = std::fs::remove_dir_all(temp_dir);
}
// After the source is rewritten, the old chunks and embeddings must be treated as
// stale: status reports nothing, semantic retrieval returns nothing, and the
// embedding inputs are rebuilt from the new text. Re-embedding makes them current again.
#[test]
fn source_root_embeddings_ignore_stale_chunks_and_rebuild_inputs_from_fresh_source() {
let temp_dir = std::env::temp_dir().join(format!("soma-studio-ingest-{}", Uuid::new_v4()));
let source_dir = temp_dir.join("source");
std::fs::create_dir_all(&source_dir).expect("source dir");
let source_file = source_dir.join("topic.md");
std::fs::write(&source_file, "# Topic\n\nold text").expect("seed source");
let config = test_config(&temp_dir);
let root = test_root(&source_dir);
let indexed = indexed_rows(
&materialize_text_artifacts(&config, &scan_source_root(&root).expect("scan"))
.expect("materialize"),
);
write_source_root_embeddings(
&config,
&root.id.to_string(),
"topic.md",
"ollama",
"embed-model",
&[vec![1.0, 0.0]],
)
.expect("embed");
// Presumably gives the rewritten source a strictly newer mtime than the artifacts.
std::thread::sleep(std::time::Duration::from_millis(20));
std::fs::write(&source_file, "# Topic\n\nfresh source text").expect("update source");
let status = source_root_embedding_status(&config, &indexed, "ollama", "embed-model")
.expect("status");
let retrieval = retrieve_source_root_with_query_vector(
&config,
&indexed,
"ollama",
"embed-model",
&[1.0, 0.0],
)
.expect("retrieve");
let inputs =
collect_source_root_embedding_inputs(&config, &indexed).expect("embedding inputs");
assert_eq!(status.embedded, 0);
assert!(retrieval.results.is_empty());
assert!(inputs[0].chunks.join("\n").contains("fresh source text"));
write_source_root_embeddings(
&config,
&root.id.to_string(),
"topic.md",
"ollama",
"embed-model",
&[vec![1.0, 0.0]],
)
.expect("re-embed");
let rebuilt_status =
source_root_embedding_status(&config, &indexed, "ollama", "embed-model")
.expect("rebuilt status");
let rebuilt_retrieval = retrieve_source_root_with_query_vector(
&config,
&indexed,
"ollama",
"embed-model",
&[1.0, 0.0],
)
.expect("rebuilt retrieve");
assert_eq!(rebuilt_status.embedded, 1);
assert!(
rebuilt_retrieval.results[0]
.snippet
.contains("fresh source text")
);
let _ = std::fs::remove_dir_all(temp_dir);
}
// Once the source file is deleted, its indexed row must yield neither retrieval
// results nor embedding inputs, even though derived artifacts still exist.
#[test]
fn deleted_indexed_source_files_are_skipped_until_rescan() {
let temp_dir = std::env::temp_dir().join(format!("soma-studio-ingest-{}", Uuid::new_v4()));
let source_dir = temp_dir.join("source");
std::fs::create_dir_all(&source_dir).expect("source dir");
let source_file = source_dir.join("topic.md");
std::fs::write(&source_file, "# Topic\n\nsearchable text").expect("seed source");
let config = test_config(&temp_dir);
let root = test_root(&source_dir);
let indexed = indexed_rows(
&materialize_text_artifacts(&config, &scan_source_root(&root).expect("scan"))
.expect("materialize"),
);
std::fs::remove_file(&source_file).expect("delete source");
let retrieval =
retrieve_source_root_text(&config, &indexed, "searchable").expect("retrieve");
let inputs =
collect_source_root_embedding_inputs(&config, &indexed).expect("embedding inputs");
assert!(retrieval.results.is_empty());
assert!(inputs.is_empty());
let _ = std::fs::remove_dir_all(temp_dir);
}
// A chunk artifact overwritten with invalid JSON must be rebuilt transparently,
// so lexical retrieval and embedding inputs still see the source text.
#[test]
fn corrupt_source_root_chunks_are_rebuilt_from_source() {
let temp_dir = std::env::temp_dir().join(format!("soma-studio-ingest-{}", Uuid::new_v4()));
let source_dir = temp_dir.join("source");
std::fs::create_dir_all(&source_dir).expect("source dir");
std::fs::write(source_dir.join("topic.md"), "# Topic\n\nrecoverable text").expect("source");
let config = test_config(&temp_dir);
let root = test_root(&source_dir);
let indexed = indexed_rows(
&materialize_text_artifacts(&config, &scan_source_root(&root).expect("scan"))
.expect("materialize"),
);
// Presumably ensures the corrupt artifact is newer than the source, so it passes
// the freshness check and the decode failure path is what gets exercised.
std::thread::sleep(std::time::Duration::from_millis(20));
let mut corrupt_chunk = config
.derived_dir
.join("source-root-chunks")
.join(root.id.to_string())
.join("topic.md");
corrupt_chunk.set_extension("json");
std::fs::write(&corrupt_chunk, "{not json").expect("corrupt chunk");
let retrieval =
retrieve_source_root_text(&config, &indexed, "recoverable").expect("retrieve");
let inputs =
collect_source_root_embedding_inputs(&config, &indexed).expect("embedding inputs");
assert_eq!(retrieval.results.len(), 1);
assert!(retrieval.results[0].snippet.contains("recoverable text"));
assert!(inputs[0].chunks.join("\n").contains("recoverable text"));
let _ = std::fs::remove_dir_all(temp_dir);
}
// An embedding artifact overwritten with invalid JSON must be ignored: status
// counts nothing as embedded and semantic retrieval returns no results.
#[test]
fn corrupt_source_root_embeddings_are_ignored_for_status_and_retrieval() {
let temp_dir = std::env::temp_dir().join(format!("soma-studio-ingest-{}", Uuid::new_v4()));
let source_dir = temp_dir.join("source");
std::fs::create_dir_all(&source_dir).expect("source dir");
std::fs::write(source_dir.join("topic.md"), "# Topic\n\nsemantic text").expect("source");
let config = test_config(&temp_dir);
let root = test_root(&source_dir);
let indexed = indexed_rows(
&materialize_text_artifacts(&config, &scan_source_root(&root).expect("scan"))
.expect("materialize"),
);
write_source_root_embeddings(
&config,
&root.id.to_string(),
"topic.md",
"ollama",
"embed-model",
&[vec![1.0, 0.0]],
)
.expect("embed");
let mut corrupt_embedding = config
.derived_dir
.join("source-root-embeddings")
.join(root.id.to_string())
.join("topic.md");
corrupt_embedding.set_extension("json");
std::fs::write(corrupt_embedding, "{not json").expect("corrupt embedding");
let status = source_root_embedding_status(&config, &indexed, "ollama", "embed-model")
.expect("status");
let retrieval = retrieve_source_root_with_query_vector(
&config,
&indexed,
"ollama",
"embed-model",
&[1.0, 0.0],
)
.expect("retrieve");
assert_eq!(status.embedded, 0);
assert!(retrieval.results.is_empty());
let _ = std::fs::remove_dir_all(temp_dir);
}
// Builds an `AppConfig` whose directories all live under `temp_dir`; the web
// build paths are placeholders.
fn test_config(temp_dir: &std::path::Path) -> AppConfig {
AppConfig {
app_name: "Soma Studio".to_string(),
bind_addr: "127.0.0.1:0".to_string(),
project_root: temp_dir.to_path_buf(),
data_dir: temp_dir.to_path_buf(),
derived_dir: temp_dir.join("derived"),
notebook_dir: temp_dir.join("notebook"),
user_assets_dir: temp_dir.join("assets"),
db_path: temp_dir.join("test.db"),
web_build_dir: PathBuf::from("unused"),
web_shell_file: PathBuf::from("unused/spa.html"),
}
}
// Builds a read-only source root with a fresh random id pointing at `source_dir`.
fn test_root(source_dir: &std::path::Path) -> SourceRootSummary {
SourceRootSummary {
id: Uuid::new_v4(),
path: source_dir.to_string_lossy().to_string(),
read_only: true,
}
}
// Converts the scanned files whose status is "indexed" into storage rows.
fn indexed_rows(files: &[super::ScannedSourceFile]) -> Vec<IndexedSourceFileRow> {
files
.iter()
.filter(|file| file.status == "indexed")
.map(|file| IndexedSourceFileRow {
source_root_id: file.source_root_id.clone(),
relative_path: file.relative_path.clone(),
absolute_path: file.absolute_path.clone(),
})
.collect()
}
}