use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
use serde::{Deserialize, Serialize};
use soma_studio_core::{
AppConfig, NotebookNoteFormat, SearchDocumentStatus, SearchFieldScope,
SearchIndexRebuildResponse, SearchIndexStatusResponse, SearchProfile, SearchQuery,
SearchResponse, SearchResult, SearchSort, SearchSourceType, lexical_score, search_highlights,
search_snippet,
};
use tantivy::collector::TopDocs;
use tantivy::query::{AllQuery, QueryParser};
use tantivy::schema::{FAST, Field, STORED, STRING, Schema, TEXT, Value};
use tantivy::{Index, IndexWriter, TantivyDocument, Term, doc};
// Name of the index profile; it is also the last path component of the index directory.
const SEARCH_INDEX_PROFILE: &str = "default";
// Schema revision written to the state file; `search_index_status` reports a diagnostic when
// the stored value differs from this one.
const SEARCH_INDEX_SCHEMA_VERSION: u32 = 4;
// Tokenizer profile label written to the state file and checked the same way as the schema
// version.
const SEARCH_INDEX_TOKENIZER_PROFILE: &str = "tantivy-default";
/// One chunk of a notebook note or source-root file, in the shape that is written to and read
/// back from the index.
#[derive(Debug, Clone)]
struct SearchIndexDocument {
/// Identifier shared by every chunk of the same source; used to group chunks and as the
/// delete key during sync.
document_id: String,
/// Either `"notebook"` or `"source_root"`.
source_type: String,
/// Source root id for source-root files; empty for notebook notes.
source_id: String,
/// Path reported in search results.
path: String,
/// Title reported in search results.
title: String,
/// Format name such as `"markdown"`, `"typst"` or `"plaintext"`.
format: String,
/// Location of the source file on disk, as a display string.
source_path: String,
/// `"<length>:<modified ms>"` fingerprint of the source file at collection time.
source_fingerprint: String,
/// Modification time of the source file in milliseconds since the Unix epoch.
updated_at_ms: u64,
/// Time the document was collected for indexing, in milliseconds since the Unix epoch.
indexed_at_ms: u64,
/// Text of this chunk.
body: String,
/// Path of the chunk artifact, relative to the derived directory when possible.
chunk_path: String,
/// Position of this chunk among the chunk texts read from the artifact.
chunk_index: usize,
/// Free-form provenance string stored alongside the chunk.
provenance: String,
}
#[derive(Debug, Clone)]
struct SearchIndexFields {
document_id: tantivy::schema::Field,
source_type: tantivy::schema::Field,
source_id: tantivy::schema::Field,
path: tantivy::schema::Field,
path_text: tantivy::schema::Field,
title: tantivy::schema::Field,
format: tantivy::schema::Field,
source_path: tantivy::schema::Field,
source_fingerprint: tantivy::schema::Field,
updated_at_ms: tantivy::schema::Field,
indexed_at_ms: tantivy::schema::Field,
body: tantivy::schema::Field,
chunk_path: tantivy::schema::Field,
chunk_index: tantivy::schema::Field,
provenance: tantivy::schema::Field,
}
/// Optional knobs for [`search_index_query`]; every field falls back to a default when `None`.
#[derive(Debug, Clone, Default)]
pub struct SearchIndexQueryOptions {
/// Keep only results of this source type.
pub source_type_filter: Option<SearchSourceType>,
/// Keep only results whose format equals this value after trimming; a blank filter is ignored.
pub format_filter: Option<String>,
/// Fields to search; defaults to `SearchFieldScope::All`.
pub field_scope: Option<SearchFieldScope>,
/// Result ordering; defaults to `SearchSort::Relevance`.
pub sort: Option<SearchSort>,
/// Page size; defaults to the interactive search profile's default limit.
pub limit: Option<usize>,
/// Number of results to skip; must not be combined with a non-blank `cursor`.
pub offset: Option<usize>,
/// Paging cursor of the form `offset:<n>`; a blank cursor is treated as absent.
pub cursor: Option<String>,
}
/// Bookkeeping persisted as JSON in `state.json` inside the index directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct StoredSearchIndexState {
/// Index profile name in effect when the state was written.
index_profile: String,
/// Schema version in effect when the state was written.
schema_version: u32,
/// Tokenizer profile in effect when the state was written.
tokenizer_profile: String,
/// Number of distinct document ids at the last rebuild or sync.
document_count: usize,
/// Number of chunks at the last rebuild or sync.
chunk_count: usize,
/// Time of the last full rebuild, as Unix seconds rendered as a decimal string.
last_rebuilt_at: Option<String>,
/// Time of the last rebuild or sync, as Unix seconds rendered as a decimal string.
last_synced_at: Option<String>,
}
/// Rebuilds the search index from scratch on a blocking worker thread.
///
/// # Errors
///
/// Returns the rebuild's own error, or a join error message when the worker task fails to
/// complete.
pub async fn rebuild_search_index(
    config: AppConfig,
    indexed_source_files: Vec<crate::storage::IndexedSourceFileRow>,
) -> Result<SearchIndexRebuildResponse, String> {
    let worker = tokio::task::spawn_blocking(move || {
        rebuild_search_index_blocking(&config, &indexed_source_files)
    });
    match worker.await {
        Ok(outcome) => outcome,
        Err(error) => Err(format!("search index rebuild worker join failed: {error}")),
    }
}
/// Brings the search index up to date on a blocking worker thread.
///
/// # Errors
///
/// Returns the sync's own error, or a join error message when the worker task fails to
/// complete.
pub async fn sync_search_index(
    config: AppConfig,
    indexed_source_files: Vec<crate::storage::IndexedSourceFileRow>,
) -> Result<SearchIndexRebuildResponse, String> {
    let worker = tokio::task::spawn_blocking(move || {
        sync_search_index_blocking(&config, &indexed_source_files)
    });
    match worker.await {
        Ok(outcome) => outcome,
        Err(error) => Err(format!("search index sync worker join failed: {error}")),
    }
}
/// Runs a search query against the index on a blocking worker thread.
///
/// # Errors
///
/// Returns the query's own error, or a join error message when the worker task fails to
/// complete.
pub async fn search_index_query(
    config: AppConfig,
    query: String,
    options: SearchIndexQueryOptions,
) -> Result<SearchResponse, String> {
    let worker =
        tokio::task::spawn_blocking(move || search_index_query_blocking(&config, &query, options));
    match worker.await {
        Ok(outcome) => outcome,
        Err(error) => Err(format!("search index query worker join failed: {error}")),
    }
}
pub fn search_index_status(config: &AppConfig) -> SearchIndexStatusResponse {
let index_dir = search_index_dir(config);
let state = read_search_index_state(&index_dir);
let mut diagnostics = Vec::new();
let index_exists = index_dir.join("meta.json").is_file();
if !index_exists {
diagnostics.push("tantivy index metadata is missing; rebuild required".to_string());
} else if let Err(error) = Index::open_in_dir(&index_dir) {
diagnostics.push(format!("tantivy index cannot be opened: {error}"));
}
let state = match state {
Ok(Some(state)) => Some(state),
Ok(None) => {
diagnostics.push("search index state is missing; rebuild required".to_string());
None
}
Err(error) => {
diagnostics.push(format!("search index state is unreadable: {error}"));
None
}
};
let document_count = state
.as_ref()
.map(|state| state.document_count)
.unwrap_or(0);
let chunk_count = state.as_ref().map(|state| state.chunk_count).unwrap_or(0);
let last_rebuilt_at = state
.as_ref()
.and_then(|state| state.last_rebuilt_at.clone());
let last_synced_at = state
.as_ref()
.and_then(|state| state.last_synced_at.clone())
.or_else(|| last_rebuilt_at.clone());
if let Some(state) = &state {
if state.schema_version != SEARCH_INDEX_SCHEMA_VERSION {
diagnostics.push(format!(
"schema version mismatch: stored {}, expected {}",
state.schema_version, SEARCH_INDEX_SCHEMA_VERSION
));
}
if state.tokenizer_profile != SEARCH_INDEX_TOKENIZER_PROFILE {
diagnostics.push(format!(
"tokenizer profile mismatch: stored {}, expected {}",
state.tokenizer_profile, SEARCH_INDEX_TOKENIZER_PROFILE
));
}
}
SearchIndexStatusResponse {
index_profile: SEARCH_INDEX_PROFILE.to_string(),
schema_version: SEARCH_INDEX_SCHEMA_VERSION,
tokenizer_profile: SEARCH_INDEX_TOKENIZER_PROFILE.to_string(),
index_path: display_path(&index_dir),
exists: index_exists,
ready: index_exists && diagnostics.is_empty(),
document_count,
chunk_count,
last_rebuilt_at,
last_synced_at,
diagnostics,
}
}
/// Runs `raw_query` against the on-disk index and returns one page of results.
///
/// Every matching chunk is collected, filtered by the optional source-type and format
/// filters, sorted, and then paged by offset and limit. `total_results` counts the filtered
/// matches before paging.
///
/// # Errors
///
/// Returns an error when the cursor is malformed or combined with an explicit offset, when
/// the index is not ready, or when opening, searching or reading the index fails.
fn search_index_query_blocking(
    config: &AppConfig,
    raw_query: &str,
    options: SearchIndexQueryOptions,
) -> Result<SearchResponse, String> {
    // A blank cursor is treated as absent; anything else must decode to an offset.
    let cursor_offset = match options.cursor.as_deref() {
        Some(cursor) if !cursor.trim().is_empty() => Some(decode_search_cursor(cursor)?),
        _ => None,
    };
    if cursor_offset.is_some() && options.offset.is_some() {
        return Err("invalid search cursor: cannot be combined with offset".to_string());
    }
    let query = SearchQuery::new(raw_query, SearchProfile::InteractiveSearch)
        .with_field(options.field_scope.unwrap_or(SearchFieldScope::All))
        .with_sort(options.sort.unwrap_or(SearchSort::Relevance))
        .with_limit(
            options
                .limit
                .unwrap_or_else(|| SearchProfile::InteractiveSearch.default_limit()),
        )
        .with_offset(cursor_offset.or(options.offset).unwrap_or_default());
    // An empty query short-circuits to an empty response without touching the index.
    if query.is_empty() {
        return Ok(SearchResponse {
            query: String::new(),
            profile: SearchProfile::InteractiveSearch,
            field: query.field,
            sort: query.sort,
            limit: query.limit,
            offset: query.offset,
            total_results: 0,
            next_cursor: None,
            previous_cursor: None,
            results: Vec::new(),
        });
    }
    let status = search_index_status(config);
    if !status.ready {
        return Err(format!(
            "search index is not ready: {}",
            status.diagnostics.join("; ")
        ));
    }
    let index_dir = search_index_dir(config);
    let index = Index::open_in_dir(&index_dir)
        .map_err(|error| format!("failed to open search index: {error}"))?;
    let schema = index.schema();
    let fields = search_index_fields(&schema)?;
    let reader = index
        .reader()
        .map_err(|error| format!("failed to create search index reader: {error}"))?;
    let searcher = reader.searcher();
    let query_fields = search_query_fields(&fields, query.field);
    let mut parser = QueryParser::for_index(&index, query_fields);
    if matches!(query.field, SearchFieldScope::All | SearchFieldScope::Title) {
        parser.set_field_boost(fields.title, 2.0);
    }
    if matches!(query.field, SearchFieldScope::All | SearchFieldScope::Path) {
        parser.set_field_boost(fields.path_text, 1.4);
    }
    // Lenient parsing: syntax problems in the user's query are deliberately ignored.
    let (tantivy_query, _parse_errors) = parser.parse_query_lenient(&query.raw);
    // Ask for every chunk so filtering and sorting see the full match set. `TopDocs::with_limit`
    // panics on a limit of zero, which a ready but empty index combined with a zero page size
    // would otherwise produce, so always request at least one candidate.
    let candidate_limit = status.chunk_count.max(query.limit).max(1);
    let top_docs = searcher
        .search(
            &tantivy_query,
            &TopDocs::with_limit(candidate_limit).order_by_score(),
        )
        .map_err(|error| format!("failed to search index: {error}"))?;
    let mut filtered_results = Vec::new();
    for (score, address) in top_docs {
        let document = searcher
            .doc::<TantivyDocument>(address)
            .map_err(|error| format!("failed to load search result document: {error}"))?;
        let result = search_result_from_index_document(config, &document, &fields, score, &query)?;
        if !search_result_matches_filters(
            &result,
            options.source_type_filter,
            options.format_filter.as_deref(),
        ) {
            continue;
        }
        filtered_results.push(result);
    }
    sort_search_results(&mut filtered_results, query.sort);
    let total_results = filtered_results.len();
    let next_cursor = search_next_cursor(query.offset, query.limit, total_results);
    let previous_cursor = search_previous_cursor(query.offset, query.limit);
    let results = filtered_results
        .into_iter()
        .skip(query.offset)
        .take(query.limit)
        .collect();
    Ok(SearchResponse {
        query: query.raw,
        profile: SearchProfile::InteractiveSearch,
        field: query.field,
        sort: query.sort,
        limit: query.limit,
        offset: query.offset,
        total_results,
        next_cursor,
        previous_cursor,
        results,
    })
}
/// Discards any existing index directory and builds a fresh index from the current chunk
/// artifacts.
///
/// Calls `crate::notebook::index_notes` and `crate::notebook::chunk_notes` before collecting
/// documents, then writes the index followed by the state file.
///
/// # Errors
///
/// Returns an error when note indexing or chunking, document collection, directory handling,
/// index creation, the commit, or the state write fails.
fn rebuild_search_index_blocking(
    config: &AppConfig,
    indexed_source_files: &[crate::storage::IndexedSourceFileRow],
) -> Result<SearchIndexRebuildResponse, String> {
    crate::notebook::index_notes(config)?;
    crate::notebook::chunk_notes(config)?;

    let documents =
        collect_search_index_documents(config, indexed_source_files, unix_timestamp_millis())?;
    let document_count = count_unique_documents(&documents);
    let chunk_count = documents.len();

    // Start from an empty directory so nothing from a previous index survives.
    let index_dir = search_index_dir(config);
    if index_dir.exists() {
        std::fs::remove_dir_all(&index_dir)
            .map_err(|error| format!("failed to clear search index: {error}"))?;
    }
    std::fs::create_dir_all(&index_dir)
        .map_err(|error| format!("failed to create search index directory: {error}"))?;

    let (schema, fields) = search_index_schema();
    let index = Index::create_in_dir(&index_dir, schema)
        .map_err(|error| format!("failed to create tantivy index: {error}"))?;
    let mut writer = index
        .writer(50_000_000)
        .map_err(|error| format!("failed to create tantivy writer: {error}"))?;
    documents
        .iter()
        .try_for_each(|document| add_search_index_document(&mut writer, &fields, document))?;
    writer
        .commit()
        .map_err(|error| format!("failed to commit search index: {error}"))?;

    // A rebuild counts as a sync too, so both timestamps carry the same value.
    let rebuilt_at = unix_timestamp_string();
    let state = StoredSearchIndexState {
        index_profile: SEARCH_INDEX_PROFILE.to_string(),
        schema_version: SEARCH_INDEX_SCHEMA_VERSION,
        tokenizer_profile: SEARCH_INDEX_TOKENIZER_PROFILE.to_string(),
        document_count,
        chunk_count,
        last_rebuilt_at: Some(rebuilt_at.clone()),
        last_synced_at: Some(rebuilt_at),
    };
    write_search_index_state(&index_dir, &state)?;

    Ok(SearchIndexRebuildResponse {
        indexed_documents: document_count,
        indexed_chunks: chunk_count,
        status: search_index_status(config),
    })
}
/// Incrementally updates the index so that it matches the current chunk artifacts.
///
/// Falls back to a full rebuild when the index is not ready. Otherwise documents are compared
/// per `document_id`: ids that no longer exist are deleted, and ids whose chunk groups differ
/// (or are new) are deleted and re-added. The state file is rewritten even when nothing
/// changed, keeping the previous `last_rebuilt_at`.
///
/// # Errors
///
/// Returns an error when note indexing or chunking, document collection, reading the state or
/// index, writing documents, the commit, or the state write fails.
fn sync_search_index_blocking(
config: &AppConfig,
indexed_source_files: &[crate::storage::IndexedSourceFileRow],
) -> Result<SearchIndexRebuildResponse, String> {
let status = search_index_status(config);
// An index that is missing, unreadable or mismatched cannot be patched in place.
if !status.ready {
return rebuild_search_index_blocking(config, indexed_source_files);
}
crate::notebook::index_notes(config)?;
crate::notebook::chunk_notes(config)?;
let indexed_at_ms = unix_timestamp_millis();
let target_documents =
collect_search_index_documents(config, indexed_source_files, indexed_at_ms)?;
let target_document_count = count_unique_documents(&target_documents);
let target_chunk_count = target_documents.len();
let index_dir = search_index_dir(config);
// If the state file is absent, reconstruct an equivalent state from the status snapshot.
let previous_state = read_search_index_state(&index_dir)?.unwrap_or(StoredSearchIndexState {
index_profile: SEARCH_INDEX_PROFILE.to_string(),
schema_version: SEARCH_INDEX_SCHEMA_VERSION,
tokenizer_profile: SEARCH_INDEX_TOKENIZER_PROFILE.to_string(),
document_count: status.document_count,
chunk_count: status.chunk_count,
last_rebuilt_at: status.last_rebuilt_at.clone(),
last_synced_at: status.last_synced_at.clone(),
});
let index = Index::open_in_dir(&index_dir)
.map_err(|error| format!("failed to open search index: {error}"))?;
let schema = index.schema();
let fields = search_index_fields(&schema)?;
let existing_documents = load_search_index_documents(&index, &fields)?;
// Group both sides by document id so each document's chunks are compared as a unit.
let target_groups = group_search_index_documents(target_documents);
let existing_groups = group_search_index_documents(existing_documents);
let mut writer = index
.writer(50_000_000)
.map_err(|error| format!("failed to create tantivy writer: {error}"))?;
let mut changed = false;
// Pass 1: drop documents that are indexed but no longer part of the target set.
for document_id in existing_groups.keys() {
if !target_groups.contains_key(document_id) {
writer.delete_term(Term::from_field_text(fields.document_id, document_id));
changed = true;
}
}
// Pass 2: replace documents that are new or whose chunks differ from what is indexed.
for (document_id, target_group) in &target_groups {
let needs_update = existing_groups
.get(document_id)
.map(|existing_group| !search_index_document_groups_match(existing_group, target_group))
.unwrap_or(true);
if !needs_update {
continue;
}
// Delete before adding so stale chunks of this document do not linger.
writer.delete_term(Term::from_field_text(fields.document_id, document_id));
for document in target_group {
add_search_index_document(&mut writer, &fields, document)?;
}
changed = true;
}
// Skip the commit entirely when no delete or add was queued.
if changed {
writer
.commit()
.map_err(|error| format!("failed to commit search index sync: {error}"))?;
}
write_search_index_state(
&index_dir,
&StoredSearchIndexState {
index_profile: SEARCH_INDEX_PROFILE.to_string(),
schema_version: SEARCH_INDEX_SCHEMA_VERSION,
tokenizer_profile: SEARCH_INDEX_TOKENIZER_PROFILE.to_string(),
document_count: target_document_count,
chunk_count: target_chunk_count,
last_rebuilt_at: previous_state.last_rebuilt_at,
last_synced_at: Some(unix_timestamp_string()),
},
)?;
Ok(SearchIndexRebuildResponse {
indexed_documents: target_document_count,
indexed_chunks: target_chunk_count,
status: search_index_status(config),
})
}
/// Builds one [`SearchIndexDocument`] per chunk for every notebook note and every indexed
/// source-root file.
///
/// Notebook notes come from `crate::notebook::list_notes`; their chunk artifact must be
/// readable. Source-root files are skipped when either the source file or its chunk artifact
/// is missing. Sources whose artifact yields no chunk texts contribute no documents.
///
/// File metadata is read once per source rather than once per chunk, so every chunk of a
/// document carries the same fingerprint and modification time.
///
/// # Errors
///
/// Returns an error when listing notes, reading a chunk artifact, or reading source file
/// metadata fails.
fn collect_search_index_documents(
    config: &AppConfig,
    indexed_source_files: &[crate::storage::IndexedSourceFileRow],
    indexed_at_ms: u64,
) -> Result<Vec<SearchIndexDocument>, String> {
    let mut documents = Vec::new();

    for note in crate::notebook::list_notes(config)? {
        let chunk_path = notebook_chunk_artifact_path(config, Path::new(&note.path));
        let chunk_texts = read_chunk_texts(&chunk_path)?;
        // Nothing to index; also avoids touching the source file's metadata needlessly.
        if chunk_texts.is_empty() {
            continue;
        }
        let source_path = config.notebook_dir.join(&note.path);
        let source_fingerprint = file_fingerprint(&source_path)?;
        let updated_at_ms = file_modified_millis(&source_path)?;
        let document_id = format!("notebook/{}", note.path);
        let format = notebook_format_name(note.format);
        let source_path_display = display_path(&source_path);
        let chunk_path_display = display_relative_path(&config.derived_dir, &chunk_path);
        let provenance = format!("source=notebook/{}; index=tantivy-v1", note.path);
        for (chunk_index, body) in chunk_texts.into_iter().enumerate() {
            documents.push(SearchIndexDocument {
                document_id: document_id.clone(),
                source_type: "notebook".to_string(),
                source_id: String::new(),
                path: note.path.clone(),
                title: note.title.clone(),
                format: format.to_string(),
                source_path: source_path_display.clone(),
                source_fingerprint: source_fingerprint.clone(),
                updated_at_ms,
                indexed_at_ms,
                body,
                chunk_path: chunk_path_display.clone(),
                chunk_index,
                provenance: provenance.clone(),
            });
        }
    }

    for file in indexed_source_files {
        let relative = Path::new(&file.relative_path);
        let chunk_path = source_root_chunk_artifact_path(config, &file.source_root_id, relative);
        let absolute_path = Path::new(&file.absolute_path);
        if !absolute_path.is_file() || !chunk_path.is_file() {
            continue;
        }
        let chunk_texts = read_chunk_texts(&chunk_path)?;
        if chunk_texts.is_empty() {
            continue;
        }
        let source_fingerprint = file_fingerprint(absolute_path)?;
        let updated_at_ms = file_modified_millis(absolute_path)?;
        // The document id doubles as the result path for source-root files.
        let document_id = format!("source-root/{}/{}", file.source_root_id, file.relative_path);
        let title = search_result_title(&file.relative_path);
        let format = source_text_format(&file.relative_path);
        let chunk_path_display = display_relative_path(&config.derived_dir, &chunk_path);
        let provenance = format!(
            "source=source-root/{}/{}; index=tantivy-v1",
            file.source_root_id, file.relative_path
        );
        for (chunk_index, body) in chunk_texts.into_iter().enumerate() {
            documents.push(SearchIndexDocument {
                document_id: document_id.clone(),
                source_type: "source_root".to_string(),
                source_id: file.source_root_id.clone(),
                path: document_id.clone(),
                title: title.clone(),
                format: format.clone(),
                source_path: file.absolute_path.clone(),
                source_fingerprint: source_fingerprint.clone(),
                updated_at_ms,
                indexed_at_ms,
                body,
                chunk_path: chunk_path_display.clone(),
                chunk_index,
                provenance: provenance.clone(),
            });
        }
    }

    Ok(documents)
}
/// Reads every document currently stored in `index` back into [`SearchIndexDocument`] values.
///
/// # Errors
///
/// Returns an error when the reader cannot be created, the document count does not fit in
/// `usize`, or searching or loading a document fails.
fn load_search_index_documents(
    index: &Index,
    fields: &SearchIndexFields,
) -> Result<Vec<SearchIndexDocument>, String> {
    let reader = index
        .reader()
        .map_err(|error| format!("failed to create search index reader: {error}"))?;
    let searcher = reader.searcher();

    let total = match usize::try_from(searcher.num_docs()) {
        // An empty index needs no search at all.
        Ok(0) => return Ok(Vec::new()),
        Ok(total) => total,
        Err(_) => {
            return Err("search index document count exceeds platform capacity".to_string());
        }
    };

    let hits = searcher
        .search(&AllQuery, &TopDocs::with_limit(total).order_by_score())
        .map_err(|error| format!("failed to inspect search index documents: {error}"))?;

    let mut documents = Vec::with_capacity(hits.len());
    for (_score, address) in hits {
        let stored = searcher
            .doc::<TantivyDocument>(address)
            .map_err(|error| format!("failed to load search index document: {error}"))?;
        documents.push(search_index_document_from_stored(&stored, fields));
    }
    Ok(documents)
}
fn search_index_document_from_stored(
document: &TantivyDocument,
fields: &SearchIndexFields,
) -> SearchIndexDocument {
SearchIndexDocument {
document_id: stored_string(document, fields.document_id),
source_type: stored_string(document, fields.source_type),
source_id: stored_string(document, fields.source_id),
path: stored_string(document, fields.path),
title: stored_string(document, fields.title),
format: stored_string(document, fields.format),
source_path: stored_string(document, fields.source_path),
source_fingerprint: stored_string(document, fields.source_fingerprint),
updated_at_ms: stored_u64(document, fields.updated_at_ms),
indexed_at_ms: stored_u64(document, fields.indexed_at_ms),
body: stored_string(document, fields.body),
chunk_path: stored_string(document, fields.chunk_path),
chunk_index: stored_u64(document, fields.chunk_index) as usize,
provenance: stored_string(document, fields.provenance),
}
}
/// Buckets chunks by `document_id` and orders each bucket by ascending `chunk_index`.
fn group_search_index_documents(
    documents: Vec<SearchIndexDocument>,
) -> BTreeMap<String, Vec<SearchIndexDocument>> {
    let mut grouped: BTreeMap<String, Vec<SearchIndexDocument>> = BTreeMap::new();
    for document in documents {
        let key = document.document_id.clone();
        grouped.entry(key).or_insert_with(Vec::new).push(document);
    }
    grouped
        .values_mut()
        .for_each(|chunks| chunks.sort_by_key(|chunk| chunk.chunk_index));
    grouped
}
/// Reports whether two chunk groups have the same length and match pairwise, in order.
fn search_index_document_groups_match(
    left: &[SearchIndexDocument],
    right: &[SearchIndexDocument],
) -> bool {
    if left.len() != right.len() {
        return false;
    }
    left.iter()
        .zip(right.iter())
        .all(|(ours, theirs)| search_index_documents_match(ours, theirs))
}
/// Reports whether two chunks are equal in every field except `indexed_at_ms`.
///
/// `indexed_at_ms` is left out of the comparison, so re-collecting an unchanged source does
/// not count as a difference.
fn search_index_documents_match(left: &SearchIndexDocument, right: &SearchIndexDocument) -> bool {
    /// Borrowed view of every field that takes part in the comparison.
    fn comparable(
        document: &SearchIndexDocument,
    ) -> (
        (&str, &str, &str, &str, &str, &str),
        (&str, &str, u64, &str, &str, usize, &str),
    ) {
        (
            (
                document.document_id.as_str(),
                document.source_type.as_str(),
                document.source_id.as_str(),
                document.path.as_str(),
                document.title.as_str(),
                document.format.as_str(),
            ),
            (
                document.source_path.as_str(),
                document.source_fingerprint.as_str(),
                document.updated_at_ms,
                document.body.as_str(),
                document.chunk_path.as_str(),
                document.chunk_index,
                document.provenance.as_str(),
            ),
        )
    }

    comparable(left) == comparable(right)
}
fn add_search_index_document(
writer: &mut IndexWriter,
fields: &SearchIndexFields,
document: &SearchIndexDocument,
) -> Result<(), String> {
writer
.add_document(doc!(
fields.document_id => document.document_id.as_str(),
fields.source_type => document.source_type.as_str(),
fields.source_id => document.source_id.as_str(),
fields.path => document.path.as_str(),
fields.path_text => document.path.as_str(),
fields.title => document.title.as_str(),
fields.format => document.format.as_str(),
fields.source_path => document.source_path.as_str(),
fields.source_fingerprint => document.source_fingerprint.as_str(),
fields.updated_at_ms => document.updated_at_ms,
fields.indexed_at_ms => document.indexed_at_ms,
fields.body => document.body.as_str(),
fields.chunk_path => document.chunk_path.as_str(),
fields.chunk_index => document.chunk_index as u64,
fields.provenance => document.provenance.as_str(),
))
.map_err(|error| format!("failed to add search index document: {error}"))?;
Ok(())
}
/// Builds the tantivy schema for the search index together with handles to its fields.
///
/// Identifier-like fields use `STRING`, searchable text uses `TEXT`, and numeric fields are
/// `FAST | STORED`. `path_text` is the only field that is not stored.
fn search_index_schema() -> (Schema, SearchIndexFields) {
    let mut builder = Schema::builder();
    // Struct-literal initializers run top to bottom, so the fields are registered with the
    // builder in exactly the order listed here.
    let fields = SearchIndexFields {
        document_id: builder.add_text_field("document_id", STRING | STORED),
        source_type: builder.add_text_field("source_type", STRING | STORED),
        source_id: builder.add_text_field("source_id", STRING | STORED),
        path: builder.add_text_field("path", STRING | STORED),
        path_text: builder.add_text_field("path_text", TEXT),
        title: builder.add_text_field("title", TEXT | STORED),
        format: builder.add_text_field("format", STRING | STORED),
        source_path: builder.add_text_field("source_path", STORED),
        source_fingerprint: builder.add_text_field("source_fingerprint", STORED),
        updated_at_ms: builder.add_u64_field("updated_at_ms", FAST | STORED),
        indexed_at_ms: builder.add_u64_field("indexed_at_ms", FAST | STORED),
        body: builder.add_text_field("body", TEXT | STORED),
        chunk_path: builder.add_text_field("chunk_path", STRING | STORED),
        chunk_index: builder.add_u64_field("chunk_index", FAST | STORED),
        provenance: builder.add_text_field("provenance", STORED),
    };
    (builder.build(), fields)
}
/// Resolves every expected field by name from an existing index schema.
///
/// # Errors
///
/// Returns an error naming the first field that the schema does not contain.
fn search_index_fields(schema: &Schema) -> Result<SearchIndexFields, String> {
    let lookup = |name: &str| schema_field(schema, name);
    let fields = SearchIndexFields {
        document_id: lookup("document_id")?,
        source_type: lookup("source_type")?,
        source_id: lookup("source_id")?,
        path: lookup("path")?,
        path_text: lookup("path_text")?,
        title: lookup("title")?,
        format: lookup("format")?,
        source_path: lookup("source_path")?,
        source_fingerprint: lookup("source_fingerprint")?,
        updated_at_ms: lookup("updated_at_ms")?,
        indexed_at_ms: lookup("indexed_at_ms")?,
        body: lookup("body")?,
        chunk_path: lookup("chunk_path")?,
        chunk_index: lookup("chunk_index")?,
        provenance: lookup("provenance")?,
    };
    Ok(fields)
}
/// Looks up a single field by name, turning a lookup failure into a descriptive message.
fn schema_field(schema: &Schema, name: &str) -> Result<Field, String> {
    match schema.get_field(name) {
        Ok(field) => Ok(field),
        Err(error) => Err(format!("search index field {name} is missing: {error}")),
    }
}
fn search_query_fields(fields: &SearchIndexFields, field: SearchFieldScope) -> Vec<Field> {
match field {
SearchFieldScope::All => vec![fields.title, fields.path_text, fields.body],
SearchFieldScope::Title => vec![fields.title],
SearchFieldScope::Body => vec![fields.body],
SearchFieldScope::Path => vec![fields.path_text],
}
}
/// Reorders results in place according to `sort`.
///
/// Relevance keeps the incoming order. The timestamp sorts put the newest first, breaking
/// ties by higher score and then by ascending path.
fn sort_search_results(results: &mut [SearchResult], sort: SearchSort) {
    /// Stable sort by descending timestamp, then descending score, then ascending path.
    fn newest_first(results: &mut [SearchResult], timestamp: impl Fn(&SearchResult) -> u64) {
        results.sort_by(|left, right| {
            let by_time = timestamp(right).cmp(&timestamp(left));
            by_time
                .then_with(|| right.score.cmp(&left.score))
                .then_with(|| left.path.cmp(&right.path))
        });
    }

    match sort {
        SearchSort::Relevance => {}
        SearchSort::UpdatedAt => newest_first(results, |result| result.updated_at_ms),
        SearchSort::IndexedAt => newest_first(results, |result| result.indexed_at_ms),
    }
}
/// Returns the cursor for the following page, or `None` when the current page is the last.
fn search_next_cursor(offset: usize, limit: usize, total_results: usize) -> Option<String> {
    let following = offset.saturating_add(limit);
    if following < total_results {
        Some(encode_search_cursor(following))
    } else {
        None
    }
}
/// Returns the cursor for the preceding page, or `None` when already at the first result.
fn search_previous_cursor(offset: usize, limit: usize) -> Option<String> {
    if offset == 0 {
        return None;
    }
    // Saturate so a partial first page still maps back to offset zero.
    Some(encode_search_cursor(offset.saturating_sub(limit)))
}
/// Encodes an offset as an `offset:<n>` cursor string.
fn encode_search_cursor(offset: usize) -> String {
    ["offset:", offset.to_string().as_str()].concat()
}
/// Decodes an `offset:<n>` cursor string, ignoring surrounding whitespace.
///
/// # Errors
///
/// Returns an error when the prefix is missing or the remainder is not a valid `usize`.
fn decode_search_cursor(cursor: &str) -> Result<usize, String> {
    let trimmed = cursor.trim();
    match trimmed.strip_prefix("offset:").map(str::parse::<usize>) {
        Some(Ok(offset)) => Ok(offset),
        _ => Err(format!("unsupported search cursor: {trimmed}")),
    }
}
/// Returns the index directory: `<derived_dir>/search-index/<profile>`.
fn search_index_dir(config: &AppConfig) -> PathBuf {
    let mut dir = config.derived_dir.join("search-index");
    dir.push(SEARCH_INDEX_PROFILE);
    dir
}
/// Returns the location of `state.json` inside the index directory.
fn search_index_state_path(index_dir: &Path) -> PathBuf {
    let mut state_path = index_dir.to_path_buf();
    state_path.push("state.json");
    state_path
}
/// Loads the persisted index state, returning `Ok(None)` when no state file exists.
///
/// # Errors
///
/// Returns an error when the file exists but cannot be read or parsed.
fn read_search_index_state(index_dir: &Path) -> Result<Option<StoredSearchIndexState>, String> {
    let path = search_index_state_path(index_dir);
    if !path.is_file() {
        return Ok(None);
    }
    let content = match std::fs::read_to_string(&path) {
        Ok(content) => content,
        Err(error) => return Err(format!("failed to read {}: {error}", path.display())),
    };
    match serde_json::from_str::<StoredSearchIndexState>(&content) {
        Ok(state) => Ok(Some(state)),
        Err(error) => Err(format!("failed to parse {}: {error}", path.display())),
    }
}
/// Serializes `state` as pretty-printed JSON and writes it to the state file.
///
/// # Errors
///
/// Returns an error when encoding or writing fails.
fn write_search_index_state(
    index_dir: &Path,
    state: &StoredSearchIndexState,
) -> Result<(), String> {
    let path = search_index_state_path(index_dir);
    // Encode first so an encoding failure never touches the existing file.
    let encoded = serde_json::to_string_pretty(state)
        .map_err(|error| format!("failed to encode search index state: {error}"))?;
    match std::fs::write(&path, encoded) {
        Ok(()) => Ok(()),
        Err(error) => Err(format!("failed to write {}: {error}", path.display())),
    }
}
/// Reads a chunk artifact (a JSON array) and returns the `text` string of each entry.
///
/// Entries without a string `text` member are skipped.
///
/// # Errors
///
/// Returns an error when the file cannot be read or is not a JSON array.
fn read_chunk_texts(path: &Path) -> Result<Vec<String>, String> {
    let content = std::fs::read_to_string(path)
        .map_err(|error| format!("failed to read chunks {}: {error}", path.display()))?;
    let items: Vec<serde_json::Value> = serde_json::from_str(&content)
        .map_err(|error| format!("failed to parse chunks {}: {error}", path.display()))?;

    let mut texts = Vec::new();
    for item in &items {
        if let Some(text) = item.get("text").and_then(serde_json::Value::as_str) {
            texts.push(text.to_string());
        }
    }
    Ok(texts)
}
/// Maps a notebook-relative path to its chunk artifact:
/// `<derived_dir>/notebook-chunks/<relative>` with the extension replaced by `json`.
fn notebook_chunk_artifact_path(config: &AppConfig, relative: &Path) -> PathBuf {
    config
        .derived_dir
        .join("notebook-chunks")
        .join(relative)
        .with_extension("json")
}
/// Maps a source-root file to its chunk artifact:
/// `<derived_dir>/source-root-chunks/<source_root_id>/<relative>` with the extension replaced
/// by `json`.
fn source_root_chunk_artifact_path(
    config: &AppConfig,
    source_root_id: &str,
    relative: &Path,
) -> PathBuf {
    let root_dir = config
        .derived_dir
        .join("source-root-chunks")
        .join(source_root_id);
    root_dir.join(relative).with_extension("json")
}
/// Counts the distinct `document_id` values among the given chunks.
fn count_unique_documents(documents: &[SearchIndexDocument]) -> usize {
    documents
        .iter()
        .map(|document| document.document_id.as_str())
        .collect::<std::collections::BTreeSet<_>>()
        .len()
}
fn notebook_format_name(format: NotebookNoteFormat) -> &'static str {
match format {
NotebookNoteFormat::Markdown => "markdown",
NotebookNoteFormat::Typst => "typst",
}
}
/// Derives a format name from a path's extension, case-insensitively.
///
/// `md` maps to `markdown`, `typ` to `typst`, and everything else (including no extension)
/// to `plaintext`.
fn source_text_format(path: &str) -> String {
    let extension = Path::new(path)
        .extension()
        .and_then(|raw| raw.to_str())
        .map(str::to_lowercase);
    let format = match extension.as_deref() {
        Some("md") => "markdown",
        Some("typ") => "typst",
        _ => "plaintext",
    };
    format.to_string()
}
/// Uses the last `/`- or `\`-separated component of `path` as the title.
///
/// Falls back to the whole path when that component is empty (for example a trailing
/// separator).
fn search_result_title(path: &str) -> String {
    let last_component = path.rsplit(['/', '\\']).next();
    let title = match last_component {
        Some(name) if !name.is_empty() => name,
        _ => path,
    };
    title.to_string()
}
fn display_relative_path(root: &Path, path: &Path) -> String {
path.strip_prefix(root)
.map(display_path)
.unwrap_or_else(|_| display_path(path))
}
/// Renders a path lossily as text, using `/` in place of every backslash.
fn display_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|character| if character == '\\' { '/' } else { character })
        .collect()
}
/// Builds a `"<length>:<modified ms>"` fingerprint from the file's metadata.
///
/// # Errors
///
/// Returns an error when the metadata cannot be read.
fn file_fingerprint(path: &Path) -> Result<String, String> {
    let metadata = match std::fs::metadata(path) {
        Ok(metadata) => metadata,
        Err(error) => {
            return Err(format!(
                "failed to read source metadata {}: {error}",
                path.display()
            ));
        }
    };
    let length = metadata.len();
    let modified_ms = file_modified_millis_from_metadata(&metadata);
    Ok(format!("{length}:{modified_ms}"))
}
/// Returns the file's modification time in milliseconds since the Unix epoch.
///
/// # Errors
///
/// Returns an error when the metadata cannot be read.
fn file_modified_millis(path: &Path) -> Result<u64, String> {
    match std::fs::metadata(path) {
        Ok(metadata) => Ok(file_modified_millis_from_metadata(&metadata)),
        Err(error) => Err(format!(
            "failed to read source metadata {}: {error}",
            path.display()
        )),
    }
}
/// Extracts the modification time in milliseconds since the Unix epoch.
///
/// Yields zero when the modification time is unavailable or precedes the epoch, and
/// saturates at `u64::MAX`.
fn file_modified_millis_from_metadata(metadata: &std::fs::Metadata) -> u64 {
    let since_epoch = match metadata.modified() {
        Ok(modified) => modified.duration_since(UNIX_EPOCH).ok(),
        Err(_) => None,
    };
    match since_epoch {
        Some(elapsed) => u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
        None => 0,
    }
}
/// Converts a stored index document plus its relevance score into a [`SearchResult`].
///
/// The snippet comes from the path for path-scoped queries; otherwise from the first of
/// body, title and path that has a positive lexical score, defaulting to the body.
///
/// # Errors
///
/// Returns an error when the stored `source_type` is not a supported value.
fn search_result_from_index_document(
    config: &AppConfig,
    document: &TantivyDocument,
    fields: &SearchIndexFields,
    score: f32,
    query: &SearchQuery,
) -> Result<SearchResult, String> {
    let source_type_name = stored_string(document, fields.source_type);
    let source_type = match source_type_name.as_str() {
        "notebook" => SearchSourceType::Notebook,
        "source_root" => SearchSourceType::SourceRoot,
        value => {
            return Err(format!(
                "search index contains unsupported source_type: {value}"
            ));
        }
    };

    let title = stored_string(document, fields.title);
    let path = stored_string(document, fields.path);
    let body = stored_string(document, fields.body);
    let chunk_path = stored_string(document, fields.chunk_path);
    let chunk_index = stored_u64(document, fields.chunk_index);

    let highlights = search_highlights(&format!("{title}\n{path}\n{body}"), query);

    let snippet_source = if query.field == SearchFieldScope::Path {
        path.as_str()
    } else if lexical_score(&body, query) > 0 {
        body.as_str()
    } else if lexical_score(&title, query) > 0 {
        title.as_str()
    } else if lexical_score(&path, query) > 0 {
        path.as_str()
    } else {
        body.as_str()
    };
    let snippet = search_snippet(snippet_source, 240);

    // Re-check the source on disk so the result reports whether it is still current.
    let status = search_document_status(
        config,
        &stored_string(document, fields.source_path),
        &stored_string(document, fields.source_fingerprint),
        &chunk_path,
    );
    let chunk_id = format!("{}#{}", chunk_path, chunk_index);

    Ok(SearchResult {
        document_id: stored_string(document, fields.document_id),
        chunk_id: Some(chunk_id),
        source_type,
        path,
        title,
        format: Some(stored_string(document, fields.format)),
        chunk_path: Some(chunk_path),
        chunk_index: chunk_index as usize,
        score: score_to_usize(score),
        snippet,
        highlights,
        provenance: stored_string(document, fields.provenance),
        status,
        updated_at_ms: stored_u64(document, fields.updated_at_ms),
        indexed_at_ms: stored_u64(document, fields.indexed_at_ms),
    })
}
/// Reports whether a result passes the optional source-type and format filters.
///
/// The format filter is trimmed first, and a blank filter accepts everything.
fn search_result_matches_filters(
    result: &SearchResult,
    source_type_filter: Option<SearchSourceType>,
    format_filter: Option<&str>,
) -> bool {
    if let Some(expected) = source_type_filter {
        if !(result.source_type == expected) {
            return false;
        }
    }
    match format_filter.map(str::trim) {
        Some(wanted) if !wanted.is_empty() => result.format.as_deref() == Some(wanted),
        _ => true,
    }
}
/// Classifies an indexed chunk by checking its source file and chunk artifact on disk.
///
/// * `Error`: a required value is empty, or the fingerprint cannot be computed.
/// * `Missing`: the source file or the chunk artifact no longer exists.
/// * `Stale`: the source file's current fingerprint differs from the indexed one.
/// * `Ready`: the fingerprints agree.
fn search_document_status(
    config: &AppConfig,
    source_path: &str,
    source_fingerprint: &str,
    chunk_path: &str,
) -> SearchDocumentStatus {
    let required = [source_path, source_fingerprint, chunk_path];
    if required.iter().any(|value| value.is_empty()) {
        return SearchDocumentStatus::Error;
    }

    let source = Path::new(source_path);
    let derived_chunk = config.derived_dir.join(chunk_path);
    if !(source.is_file() && derived_chunk.is_file()) {
        return SearchDocumentStatus::Missing;
    }

    match file_fingerprint(source) {
        Ok(current) => {
            if current == source_fingerprint {
                SearchDocumentStatus::Ready
            } else {
                SearchDocumentStatus::Stale
            }
        }
        Err(_) => SearchDocumentStatus::Error,
    }
}
/// Returns the first stored string value of `field`, or an empty string when absent.
fn stored_string(document: &TantivyDocument, field: Field) -> String {
    match document.get_first(field).and_then(|value| value.as_str()) {
        Some(text) => text.to_string(),
        None => String::new(),
    }
}
/// Returns the first stored `u64` value of `field`, or zero when absent.
fn stored_u64(document: &TantivyDocument, field: Field) -> u64 {
    let first = document.get_first(field);
    first.and_then(|value| value.as_u64()).unwrap_or(0)
}
/// Converts a relevance score to an integer: the score times 1000, rounded, and never below 1.
///
/// Negative and NaN scores are treated as zero before scaling.
fn score_to_usize(score: f32) -> usize {
    let non_negative = score.max(0.0);
    let scaled = (non_negative * 1000.0).round();
    if scaled >= 1.0 {
        scaled as usize
    } else {
        1
    }
}
/// Returns the current time as whole seconds since the Unix epoch, rendered in decimal.
fn unix_timestamp_string() -> String {
    let seconds = unix_timestamp_millis() / 1000;
    seconds.to_string()
}
/// Returns the current time in milliseconds since the Unix epoch.
///
/// Yields zero when the system clock is before the epoch and saturates at `u64::MAX`.
fn unix_timestamp_millis() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
        Err(_) => 0,
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Rebuilding with a single notebook note should produce one document with one chunk, a ready
// status, and a tantivy `meta.json` in the index directory.
#[tokio::test]
async fn rebuild_search_index_materializes_notebook_chunks_into_tantivy() {
let temp_dir = unique_temp_dir("search-index-rebuild");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
std::fs::write(
config.notebook_dir.join("daily.md"),
"# Daily\n\nsearchable note",
)
.expect("note");
let response = rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
// Query the status independently so it is checked against the response's embedded status.
let status = search_index_status(&config);
assert_eq!(response.indexed_documents, 1);
assert_eq!(response.indexed_chunks, 1);
assert!(response.status.ready);
assert!(status.ready);
assert_eq!(status.document_count, 1);
assert_eq!(status.chunk_count, 1);
assert!(search_index_dir(&config).join("meta.json").is_file());
// Best-effort cleanup; a failure to remove the temp dir does not fail the test.
let _ = std::fs::remove_dir_all(temp_dir);
}
// A query mixing Korean and English terms should find the single note that contains both and
// report both terms as highlights.
#[tokio::test]
async fn search_index_query_returns_korean_and_english_terms() {
let temp_dir = unique_temp_dir("search-index-query");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
std::fs::write(
config.notebook_dir.join("mixed.md"),
"# 검색 계획\n\n검색 artifact roadmap",
)
.expect("note");
rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
let search = search_index_query(
config.clone(),
"검색 artifact".to_string(),
SearchIndexQueryOptions::default(),
)
.await
.expect("search");
// With default options the response is expected to use the interactive profile, a limit of
// 25 and an offset of 0.
assert_eq!(search.profile, SearchProfile::InteractiveSearch);
assert_eq!(search.limit, 25);
assert_eq!(search.offset, 0);
assert_eq!(search.total_results, 1);
assert_eq!(search.results.len(), 1);
assert_eq!(search.results[0].path, "mixed.md");
assert_eq!(search.results[0].status, SearchDocumentStatus::Ready);
assert_eq!(
search.results[0].highlights,
vec!["artifact".to_string(), "검색".to_string()]
);
assert!(search.results[0].provenance.contains("index=tantivy-v1"));
// Best-effort cleanup; a failure to remove the temp dir does not fail the test.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// After the index is built, a note rewritten on disk is expected to come back
/// as `Stale` and a note deleted from disk as `Missing`, without re-indexing.
#[tokio::test]
async fn search_index_query_marks_changed_and_missing_sources() {
let temp_dir = unique_temp_dir("search-index-status");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
let stale_note = config.notebook_dir.join("stale.md");
let missing_note = config.notebook_dir.join("missing.md");
std::fs::write(&stale_note, "# Stale\n\nartifact stale").expect("stale note");
std::fs::write(&missing_note, "# Missing\n\nartifact missing").expect("missing note");
// Index both notes in their original form.
rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
// Mutate the sources behind the index's back: rewrite one note (with a
// different length) and delete the other.
std::fs::write(
&stale_note,
"# Stale\n\nartifact stale changed with more content",
)
.expect("update stale note");
std::fs::remove_file(&missing_note).expect("remove missing note");
// Both notes still contain "artifact" in the indexed content, so both should be hits.
let search = search_index_query(
config.clone(),
"artifact".to_string(),
SearchIndexQueryOptions::default(),
)
.await
.expect("search");
let stale = search
.results
.iter()
.find(|result| result.path == "stale.md")
.expect("stale result");
let missing = search
.results
.iter()
.find(|result| result.path == "missing.md")
.expect("missing result");
assert_eq!(stale.status, SearchDocumentStatus::Stale);
assert_eq!(missing.status, SearchDocumentStatus::Missing);
// Best-effort cleanup; not reached if an assertion above panics.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// An incremental sync is expected to leave an unchanged note's `indexed_at_ms`
/// untouched, re-index a rewritten note, add a new note, and drop a deleted one.
#[tokio::test]
async fn sync_search_index_updates_changed_documents_without_rebuilding_unchanged() {
let temp_dir = unique_temp_dir("search-index-sync");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
let keep_note = config.notebook_dir.join("keep.md");
let changed_note = config.notebook_dir.join("changed.md");
let removed_note = config.notebook_dir.join("removed.md");
std::fs::write(&keep_note, "# Keep\n\nneedle keep").expect("keep note");
std::fs::write(&changed_note, "# Changed\n\nneedle old").expect("changed note");
std::fs::write(&removed_note, "# Removed\n\nneedle removed").expect("removed note");
// Baseline: a full rebuild over the three initial notes.
rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
let before = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
sort: Some(SearchSort::IndexedAt),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("search before sync");
// Remember when each note was indexed so the sync's effect can be compared.
let keep_indexed_at = result_by_path(&before, "keep.md").indexed_at_ms;
let changed_indexed_at = result_by_path(&before, "changed.md").indexed_at_ms;
// Blocking pause before mutating the notebook; presumably ensures the sync's
// millisecond timestamps are strictly later than the rebuild's — confirm.
std::thread::sleep(std::time::Duration::from_millis(20));
// Rewrite one note, delete one, add one; `keep.md` is left alone.
std::fs::write(&changed_note, "# Changed\n\nneedle new").expect("rewrite changed note");
std::fs::remove_file(&removed_note).expect("remove note");
std::fs::write(config.notebook_dir.join("new.md"), "# New\n\nneedle new")
.expect("new note");
let sync = sync_search_index(config.clone(), Vec::new())
.await
.expect("sync");
// Three notes remain on disk after the edits: keep, changed and new.
assert_eq!(sync.indexed_documents, 3);
assert_eq!(sync.indexed_chunks, 3);
assert!(sync.status.ready);
assert!(sync.status.last_rebuilt_at.is_some());
assert!(sync.status.last_synced_at.is_some());
let after = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
sort: Some(SearchSort::IndexedAt),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("search after sync");
assert_eq!(after.total_results, 3);
// The unchanged note must keep its original indexing timestamp.
assert_eq!(
result_by_path(&after, "keep.md").indexed_at_ms,
keep_indexed_at
);
// The rewritten note must carry a strictly newer indexing timestamp.
assert!(result_by_path(&after, "changed.md").indexed_at_ms > changed_indexed_at);
assert!(result_by_path(&after, "new.md").indexed_at_ms > 0);
// The deleted note must no longer appear in the results.
assert!(
after
.results
.iter()
.all(|result| result.path != "removed.md")
);
// Best-effort cleanup; not reached if an assertion above panics.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// With 130 matching notebook notes and a single matching source-root file, a
/// query filtered to `SourceRoot` is expected to return exactly that one file.
///
/// The notebook hits outnumber the default page size (asserted as 25 in
/// another test of this module), so this presumably guards against the filter
/// being applied only after the result set has been truncated.
#[tokio::test]
async fn search_index_query_applies_filters_before_result_limit() {
let temp_dir = unique_temp_dir("search-index-filter-limit");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
// 130 notebook notes that all match the query term "needle".
for index in 0..130 {
std::fs::write(
config.notebook_dir.join(format!("note-{index}.md")),
format!("# Needle {index}\n\nneedle notebook {index}"),
)
.expect("note");
}
// A single source-root file, living outside the notebook directory.
let source_dir = temp_dir.join("source");
std::fs::create_dir_all(&source_dir).expect("source dir");
let source_file = source_dir.join("alpha.md");
std::fs::write(&source_file, "# Alpha\n\nneedle source root").expect("source file");
// Hand-write the chunk artifact for that file at the path the index code
// computes for source root "root" and relative path "alpha.md".
let source_chunk = source_root_chunk_artifact_path(&config, "root", Path::new("alpha.md"));
std::fs::create_dir_all(source_chunk.parent().expect("source chunk parent"))
.expect("source chunk dir");
std::fs::write(
&source_chunk,
r#"[{"index":0,"text":"needle source root"}]"#,
)
.expect("source chunk");
// Rebuild, this time passing one indexed source file row alongside the notebook.
rebuild_search_index(
config.clone(),
vec![crate::storage::IndexedSourceFileRow {
source_root_id: "root".to_string(),
relative_path: "alpha.md".to_string(),
absolute_path: source_file.to_string_lossy().to_string(),
}],
)
.await
.expect("rebuild");
let search = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
source_type_filter: Some(SearchSourceType::SourceRoot),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("search");
assert_eq!(search.results.len(), 1);
assert_eq!(search.results[0].source_type, SearchSourceType::SourceRoot);
assert_eq!(search.results[0].path, "source-root/root/alpha.md");
// Best-effort cleanup; not reached if an assertion above panics.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// Exercises pagination over five matching notes: an explicit offset/limit
/// page, a follow-up page addressed by the returned cursor, and an offset far
/// past the end of the result set.
#[tokio::test]
async fn search_index_query_returns_paged_results_and_total_count() {
let temp_dir = unique_temp_dir("search-index-pagination");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
// Five notes that all match "needle".
for index in 0..5 {
std::fs::write(
config.notebook_dir.join(format!("page-{index}.md")),
format!("# Page {index}\n\nneedle page {index}"),
)
.expect("note");
}
rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
// Middle page: skip two hits, take two.
let search = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
limit: Some(2),
offset: Some(2),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("search");
assert_eq!(search.limit, 2);
assert_eq!(search.offset, 2);
// The total counts every match, not just the ones on this page.
assert_eq!(search.total_results, 5);
// Cursors are expected in the textual form `offset:<n>`.
assert_eq!(search.previous_cursor.as_deref(), Some("offset:0"));
assert_eq!(search.next_cursor.as_deref(), Some("offset:4"));
assert_eq!(search.results.len(), 2);
// Last page: addressed through the cursor from the previous response
// instead of an explicit offset.
let cursor_page = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
cursor: search.next_cursor.clone(),
limit: Some(2),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("cursor search");
assert_eq!(cursor_page.offset, 4);
assert_eq!(cursor_page.total_results, 5);
assert_eq!(cursor_page.previous_cursor.as_deref(), Some("offset:2"));
// Only one hit remains, so there is no further page.
assert!(cursor_page.next_cursor.is_none());
assert_eq!(cursor_page.results.len(), 1);
// An offset of `usize::MAX` must succeed and yield an empty page while
// still reporting the full total.
let empty_page = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
limit: Some(2),
offset: Some(usize::MAX),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("search");
assert_eq!(empty_page.offset, usize::MAX);
assert_eq!(empty_page.total_results, 5);
assert!(empty_page.results.is_empty());
// Best-effort cleanup; not reached if an assertion above panics.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// Restricting a query to the title, body or path field is expected to match
/// only documents whose corresponding field contains the term.
#[tokio::test]
async fn search_index_query_respects_field_scope() {
let temp_dir = unique_temp_dir("search-index-field-scope");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
// First note: "needle" appears in the file name, the title and the body;
// "title" appears only in its heading.
std::fs::write(
config.notebook_dir.join("path-needle.md"),
"# Title Needle\n\nbody needle",
)
.expect("note");
// Second note: the marker term appears only in the body text.
std::fs::write(
config.notebook_dir.join("body-only.md"),
"# Unrelated\n\nbody-only-marker",
)
.expect("note");
rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
// Title scope: only the first note has "title" in its heading.
let title_search = search_index_query(
config.clone(),
"title".to_string(),
SearchIndexQueryOptions {
field_scope: Some(SearchFieldScope::Title),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("title search");
// The response is expected to echo the requested field scope.
assert_eq!(title_search.field, SearchFieldScope::Title);
assert_eq!(title_search.total_results, 1);
assert_eq!(title_search.results[0].path, "path-needle.md");
// Body scope: the marker term lives only in the second note's body.
let body_search = search_index_query(
config.clone(),
"body-only-marker".to_string(),
SearchIndexQueryOptions {
field_scope: Some(SearchFieldScope::Body),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("body search");
assert_eq!(body_search.total_results, 1);
assert_eq!(body_search.results[0].path, "body-only.md");
// Path scope: the query equals the first note's file stem.
let path_search = search_index_query(
config.clone(),
"path-needle".to_string(),
SearchIndexQueryOptions {
field_scope: Some(SearchFieldScope::Path),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("path search");
assert_eq!(path_search.total_results, 1);
assert_eq!(path_search.results[0].path, "path-needle.md");
// Best-effort cleanup; not reached if an assertion above panics.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// Sorting by `UpdatedAt` is expected to list the more recently written note
/// first and to echo the requested sort in the response.
#[tokio::test]
async fn search_index_query_sorts_by_updated_at() {
let temp_dir = unique_temp_dir("search-index-sort");
let config = test_config(&temp_dir);
config.ensure_directories().expect("directories");
std::fs::write(config.notebook_dir.join("older.md"), "# Older\n\nneedle").expect("note");
// Blocking pause between the two writes; presumably gives the notes distinct
// modification times. NOTE(review): relies on the filesystem recording
// timestamps at a granularity finer than 20 ms — confirm for CI targets.
std::thread::sleep(std::time::Duration::from_millis(20));
std::fs::write(config.notebook_dir.join("newer.md"), "# Newer\n\nneedle").expect("note");
rebuild_search_index(config.clone(), Vec::new())
.await
.expect("rebuild");
let search = search_index_query(
config.clone(),
"needle".to_string(),
SearchIndexQueryOptions {
sort: Some(SearchSort::UpdatedAt),
..SearchIndexQueryOptions::default()
},
)
.await
.expect("updated sort search");
assert_eq!(search.sort, SearchSort::UpdatedAt);
assert_eq!(search.total_results, 2);
// Newest first: descending order by `updated_at_ms`.
assert_eq!(search.results[0].path, "newer.md");
assert!(search.results[0].updated_at_ms >= search.results[1].updated_at_ms);
assert!(search.results.iter().all(|result| result.indexed_at_ms > 0));
// Best-effort cleanup; not reached if an assertion above panics.
let _ = std::fs::remove_dir_all(temp_dir);
}
/// With the directories created but no index ever built, the status report is
/// expected to say the index neither exists nor is ready, and to carry a
/// diagnostic containing "rebuild required".
#[test]
fn search_index_status_marks_missing_index_as_not_ready() {
    let scratch = unique_temp_dir("search-index-missing");
    let config = test_config(&scratch);
    config.ensure_directories().expect("directories");

    let report = search_index_status(&config);
    assert!(!report.exists);
    assert!(!report.ready);

    let mentions_rebuild = report
        .diagnostics
        .iter()
        .any(|message| message.contains("rebuild required"));
    assert!(mentions_rebuild);

    // Best-effort cleanup; a failure to delete the scratch directory is ignored.
    let _ = std::fs::remove_dir_all(scratch);
}
/// A cursor of the form `offset:<number>` decodes to that number; a bare number
/// or a non-numeric offset is rejected with an error.
#[test]
fn search_cursor_decoding_rejects_unsupported_values() {
    let decoded = decode_search_cursor("offset:25").expect("cursor");
    assert_eq!(decoded, 25);

    for unsupported in ["25", "offset:not-a-number"] {
        assert!(decode_search_cursor(unsupported).is_err());
    }
}
/// Returns the first search result in `response` whose path equals `path`.
///
/// # Panics
///
/// Panics with a message naming `path` when no result has that path.
fn result_by_path<'a>(response: &'a SearchResponse, path: &str) -> &'a SearchResult {
    for candidate in response.results.iter() {
        if candidate.path == path {
            return candidate;
        }
    }
    panic!("missing result for {path}")
}
/// Builds an `AppConfig` whose every path lives under `temp_dir`.
///
/// Data paths go under `<temp_dir>/data` and web build paths under
/// `<temp_dir>/web/build`; `temp_dir` itself is used as the project root.
/// Nothing is created on disk here.
fn test_config(temp_dir: &Path) -> AppConfig {
    let data_root = temp_dir.join("data");
    let build_root = temp_dir.join("web").join("build");
    AppConfig {
        app_name: String::from("Soma Studio"),
        bind_addr: String::from("127.0.0.1:0"),
        project_root: temp_dir.to_path_buf(),
        derived_dir: data_root.join("derived"),
        notebook_dir: data_root.join("notebook"),
        user_assets_dir: data_root.join("assets"),
        db_path: data_root.join("soma-studio.db"),
        // Moved last so the joins above can still borrow the shared roots.
        data_dir: data_root,
        web_shell_file: build_root.join("spa.html"),
        web_build_dir: build_root,
    }
}
/// Computes a scratch directory path under the system temp directory.
///
/// The final component is
/// `soma-studio-search-index-<label>-<process id>-<nanoseconds since epoch>`.
/// The directory itself is not created.
///
/// # Panics
///
/// Panics if the system clock reports a time before the Unix epoch.
fn unique_temp_dir(label: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time");
    let stamp = since_epoch.as_nanos();
    let pid = std::process::id();

    let mut dir = std::env::temp_dir();
    dir.push(format!("soma-studio-search-index-{label}-{pid}-{stamp}"));
    dir
}
}