use rusqlite::{params_from_iter, Connection, ToSql};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use crate::error::{Error, Result};
use crate::filtering::{get_db_path, SUBSET_COLUMN};
use crate::search::QueryResult;
/// Name of the FTS5 virtual table used for keyword search over metadata.
const FTS_TABLE: &str = "METADATA_FTS";
/// External-content table backing the FTS5 index (FTS5 `content=` option).
const FTS_CONTENT_TABLE: &str = "METADATA_FTS_CONTENT";
/// Single text column holding the flattened metadata text for each document.
const FTS_CONTENT_COLUMN: &str = "_fts_content_";
/// Key/value table persisting FTS settings (currently only the tokenizer).
const FTS_CONFIG_TABLE: &str = "_FTS_SETTINGS_";
/// Tokenizer used when building the FTS5 index.
///
/// `Unicode61` is SQLite's default word tokenizer; `Trigram` indexes
/// 3-character n-grams, which enables substring matching (see the trigram
/// tests below) at the cost of a larger index.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum FtsTokenizer {
    /// SQLite's default word tokenizer.
    #[default]
    Unicode61,
    /// 3-gram tokenizer for substring matches.
    Trigram,
}
impl FtsTokenizer {
    /// Value passed to FTS5's `tokenize=` option when creating the table.
    fn fts5_tokenize_value(&self) -> &'static str {
        match self {
            FtsTokenizer::Unicode61 => "unicode61",
            FtsTokenizer::Trigram => "trigram",
        }
    }

    /// String stored in the config table to remember which tokenizer built
    /// the index. Delegates to [`Self::fts5_tokenize_value`] so the two
    /// string tables cannot drift apart (they were previously duplicated).
    fn as_config_str(&self) -> &'static str {
        self.fts5_tokenize_value()
    }

    /// Inverse of [`Self::as_config_str`]; `None` for unknown or legacy
    /// config values (callers fall back to the default tokenizer).
    fn from_config_str(s: &str) -> Option<Self> {
        match s {
            "unicode61" => Some(FtsTokenizer::Unicode61),
            "trigram" => Some(FtsTokenizer::Trigram),
            _ => None,
        }
    }
}
/// Flattens a JSON metadata value into a single space-separated string
/// suitable for full-text indexing.
pub fn metadata_to_text(value: &Value) -> String {
    let mut pieces = Vec::new();
    collect_text_parts(value, &mut pieces);
    pieces.join(" ")
}
/// Recursively walks `value`, appending the textual form of every scalar
/// to `parts`. Objects and arrays are traversed in order; nulls and empty
/// strings contribute nothing.
fn collect_text_parts(value: &Value, parts: &mut Vec<String>) {
    match value {
        Value::Null => {}
        Value::Bool(flag) => parts.push(flag.to_string()),
        Value::Number(num) => parts.push(num.to_string()),
        Value::String(text) => {
            if !text.is_empty() {
                parts.push(text.clone());
            }
        }
        Value::Array(items) => {
            items.iter().for_each(|item| collect_text_parts(item, parts));
        }
        Value::Object(fields) => {
            fields
                .values()
                .for_each(|field| collect_text_parts(field, parts));
        }
    }
}
/// Ensures the FTS5 virtual table, its external-content table, and the
/// config table exist, rebuilding the index tables when the stored
/// tokenizer differs from the requested one.
fn ensure_tables(conn: &Connection, tokenizer: &FtsTokenizer) -> Result<()> {
    // Config table first: it records which tokenizer built the index.
    conn.execute(
        &format!(
            "CREATE TABLE IF NOT EXISTS \"{}\" (\
            key TEXT PRIMARY KEY, \
            value TEXT NOT NULL\
            )",
            FTS_CONFIG_TABLE
        ),
        [],
    )
    .map_err(|e| Error::Filtering(format!("Failed to create FTS config table: {}", e)))?;
    // Tokenizer recorded by a previous call, if any.
    let stored: Option<String> = conn
        .query_row(
            &format!(
                "SELECT value FROM \"{}\" WHERE key = 'tokenizer'",
                FTS_CONFIG_TABLE
            ),
            [],
            |row| row.get(0),
        )
        .ok();
    if let Some(ref stored_str) = stored {
        // Tokenizer changed: an FTS5 table cannot be re-tokenized in place,
        // so drop both tables; the CREATEs below rebuild them empty.
        if stored_str != tokenizer.as_config_str() {
            conn.execute(&format!("DROP TABLE IF EXISTS \"{}\"", FTS_TABLE), [])
                .map_err(|e| Error::Filtering(format!("Failed to drop FTS5 table: {}", e)))?;
            conn.execute(
                &format!("DROP TABLE IF EXISTS \"{}\"", FTS_CONTENT_TABLE),
                [],
            )
            .map_err(|e| Error::Filtering(format!("Failed to drop content table: {}", e)))?;
        }
    }
    // External-content table holding the actual indexed text.
    conn.execute(
        &format!(
            "CREATE TABLE IF NOT EXISTS \"{}\" (\
            rowid INTEGER PRIMARY KEY, \
            \"{}\" TEXT NOT NULL DEFAULT ''\
            )",
            FTS_CONTENT_TABLE, FTS_CONTENT_COLUMN
        ),
        [],
    )
    .map_err(|e| Error::Filtering(format!("Failed to create FTS content table: {}", e)))?;
    // FTS5 table in external-content mode, pointed at the table above.
    conn.execute(
        &format!(
            "CREATE VIRTUAL TABLE IF NOT EXISTS \"{}\" USING fts5(\
            \"{}\", \
            content='{}', \
            content_rowid='rowid', \
            tokenize='{}'\
            )",
            FTS_TABLE,
            FTS_CONTENT_COLUMN,
            FTS_CONTENT_TABLE,
            tokenizer.fts5_tokenize_value()
        ),
        [],
    )
    .map_err(|e| Error::Filtering(format!("Failed to create FTS5 table: {}", e)))?;
    // Persist the tokenizer now in effect for future mismatch checks.
    conn.execute(
        &format!(
            "INSERT OR REPLACE INTO \"{}\"(key, value) VALUES ('tokenizer', ?)",
            FTS_CONFIG_TABLE
        ),
        [tokenizer.as_config_str()],
    )
    .map_err(|e| Error::Filtering(format!("Failed to save FTS config: {}", e)))?;
    Ok(())
}
/// Inserts `(doc_id, flattened text)` pairs into both the content table
/// and the FTS5 index inside a single transaction; rolls back if any
/// insert fails.
fn insert_rows(conn: &Connection, metadata: &[Value], doc_ids: &[i64]) -> Result<()> {
    conn.execute_batch("BEGIN")
        .map_err(|e| Error::Filtering(format!("Failed to begin transaction: {}", e)))?;
    let outcome = (|| -> Result<()> {
        let content_sql = format!(
            "INSERT OR REPLACE INTO \"{}\"(rowid, \"{}\") VALUES (?, ?)",
            FTS_CONTENT_TABLE, FTS_CONTENT_COLUMN
        );
        let fts_sql = format!(
            "INSERT INTO \"{}\"(rowid, \"{}\") VALUES (?, ?)",
            FTS_TABLE, FTS_CONTENT_COLUMN
        );
        let mut content_insert = conn
            .prepare(&content_sql)
            .map_err(|e| Error::Filtering(format!("Failed to prepare content insert: {}", e)))?;
        let mut fts_insert = conn
            .prepare(&fts_sql)
            .map_err(|e| Error::Filtering(format!("Failed to prepare FTS5 insert: {}", e)))?;
        for (value, &doc_id) in metadata.iter().zip(doc_ids) {
            let text = metadata_to_text(value);
            content_insert
                .execute(rusqlite::params![doc_id, text])
                .map_err(|e| Error::Filtering(format!("Failed to insert content row: {}", e)))?;
            fts_insert
                .execute(rusqlite::params![doc_id, text])
                .map_err(|e| Error::Filtering(format!("Failed to insert FTS5 row: {}", e)))?;
        }
        Ok(())
    })();
    match &outcome {
        Ok(()) => {
            conn.execute_batch("COMMIT")
                .map_err(|e| Error::Filtering(format!("Failed to commit transaction: {}", e)))?;
        }
        Err(_) => {
            // Best-effort rollback; the original error is what we report.
            let _ = conn.execute_batch("ROLLBACK");
        }
    }
    outcome
}
/// Indexes `metadata` (paired element-wise with `doc_ids`) into the FTS5
/// tables, creating them on first use with the requested tokenizer.
///
/// The metadata database must already exist; indexing nothing is a no-op,
/// and mismatched slice lengths are rejected.
pub fn index(
    index_path: &str,
    metadata: &[Value],
    doc_ids: &[i64],
    tokenizer: &FtsTokenizer,
) -> Result<()> {
    if metadata.is_empty() {
        return Ok(());
    }
    if metadata.len() != doc_ids.len() {
        return Err(Error::Filtering(format!(
            "metadata length ({}) must match doc_ids length ({})",
            metadata.len(),
            doc_ids.len()
        )));
    }
    let db_path = get_db_path(index_path);
    if !db_path.exists() {
        return Err(Error::Filtering(
            "No metadata database found. Create metadata first.".into(),
        ));
    }
    let conn = crate::filtering::open_db(&db_path)?;
    ensure_tables(&conn, tokenizer)?;
    insert_rows(&conn, metadata, doc_ids)
}
/// Removes `doc_ids` from the FTS5 index and its content table.
///
/// A missing database or missing content table is treated as "nothing to
/// delete". Because the FTS5 table uses external content, each deletion
/// goes through the special `INSERT ... VALUES('delete', rowid, old_text)`
/// command, which requires the exact previously indexed text; ids with no
/// stored text are skipped.
pub fn delete(index_path: &str, doc_ids: &[i64]) -> Result<()> {
    if doc_ids.is_empty() {
        return Ok(());
    }
    let db_path = get_db_path(index_path);
    if !db_path.exists() {
        return Ok(());
    }
    let conn = crate::filtering::open_db(&db_path)?;
    // If FTS was never built for this DB there is nothing to do.
    let has_content: bool = conn
        .query_row(
            "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name=?",
            [FTS_CONTENT_TABLE],
            |row| row.get(0),
        )
        .unwrap_or(false);
    if !has_content {
        return Ok(());
    }
    conn.execute_batch("BEGIN")
        .map_err(|e| Error::Filtering(format!("Failed to begin transaction: {}", e)))?;
    let read_sql = format!(
        "SELECT \"{}\" FROM \"{}\" WHERE rowid = ?",
        FTS_CONTENT_COLUMN, FTS_CONTENT_TABLE
    );
    // FTS5 external-content delete: the table-named column carries the
    // 'delete' command, followed by the rowid and the old text.
    let fts_delete_sql = format!(
        "INSERT INTO \"{}\"(\"{}\", rowid, \"{}\") VALUES('delete', ?, ?)",
        FTS_TABLE, FTS_TABLE, FTS_CONTENT_COLUMN
    );
    let content_delete_sql = format!("DELETE FROM \"{}\" WHERE rowid = ?", FTS_CONTENT_TABLE);
    let mut read_stmt = conn.prepare(&read_sql)?;
    let mut fts_del_stmt = conn.prepare(&fts_delete_sql)?;
    let mut content_del_stmt = conn.prepare(&content_delete_sql)?;
    for &doc_id in doc_ids {
        // A missing row means the id was never indexed; skip it silently.
        let old_text: Option<String> = read_stmt.query_row([doc_id], |row| row.get(0)).ok();
        if let Some(text) = old_text {
            fts_del_stmt
                .execute(rusqlite::params![doc_id, text])
                .map_err(|e| {
                    Error::Filtering(format!("Failed to delete FTS5 row {}: {}", doc_id, e))
                })?;
            content_del_stmt.execute([doc_id]).map_err(|e| {
                Error::Filtering(format!("Failed to delete content row {}: {}", doc_id, e))
            })?;
        }
    }
    // NOTE(review): early `?` returns above leave the transaction open; it
    // is rolled back when `conn` drops, but an explicit ROLLBACK (as in
    // `insert_rows`) would be more symmetric.
    conn.execute_batch("COMMIT")
        .map_err(|e| Error::Filtering(format!("Failed to commit transaction: {}", e)))?;
    Ok(())
}
/// Re-synchronizes the FTS5 index with the current METADATA table rows
/// for `doc_ids` (used after metadata columns have been updated in place).
///
/// For each id, the old indexed text is removed via the FTS5
/// external-content 'delete' command, then fresh text is rebuilt by
/// concatenating the row's METADATA column values (excluding the internal
/// subset column) and re-inserted into both tables.
pub fn update_rows(index_path: &str, doc_ids: &[i64]) -> Result<()> {
    if doc_ids.is_empty() {
        return Ok(());
    }
    let db_path = get_db_path(index_path);
    if !db_path.exists() {
        return Ok(());
    }
    let conn = crate::filtering::open_db(&db_path)?;
    // No content table means FTS was never built; nothing to sync.
    let has_content: bool = conn
        .query_row(
            "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name=?",
            [FTS_CONTENT_TABLE],
            |row| row.get(0),
        )
        .unwrap_or(false);
    if !has_content {
        return Ok(());
    }
    // Discover METADATA's columns (PRAGMA table_info index 1 is the column
    // name), excluding the internal subset column.
    let mut columns: Vec<String> = Vec::new();
    {
        let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?;
        let rows = stmt.query_map([], |row| row.get::<_, String>(1))?;
        for row in rows {
            let col = row?;
            if col != SUBSET_COLUMN {
                columns.push(col);
            }
        }
    }
    let read_old_sql = format!(
        "SELECT \"{}\" FROM \"{}\" WHERE rowid = ?",
        FTS_CONTENT_COLUMN, FTS_CONTENT_TABLE
    );
    // FTS5 external-content delete command; requires the old text.
    let fts_delete_sql = format!(
        "INSERT INTO \"{}\"(\"{}\", rowid, \"{}\") VALUES('delete', ?, ?)",
        FTS_TABLE, FTS_TABLE, FTS_CONTENT_COLUMN
    );
    let content_upsert_sql = format!(
        "INSERT OR REPLACE INTO \"{}\"(rowid, \"{}\") VALUES (?, ?)",
        FTS_CONTENT_TABLE, FTS_CONTENT_COLUMN
    );
    let fts_insert_sql = format!(
        "INSERT INTO \"{}\"(rowid, \"{}\") VALUES (?, ?)",
        FTS_TABLE, FTS_CONTENT_COLUMN
    );
    let col_refs: Vec<String> = columns.iter().map(|c| format!("\"{}\"", c)).collect();
    // With no user columns, still select something so the row lookup works.
    let meta_select_sql = if columns.is_empty() {
        format!(
            "SELECT \"{}\" FROM METADATA WHERE \"{}\" = ?",
            SUBSET_COLUMN, SUBSET_COLUMN
        )
    } else {
        format!(
            "SELECT \"{}\", {} FROM METADATA WHERE \"{}\" = ?",
            SUBSET_COLUMN,
            col_refs.join(", "),
            SUBSET_COLUMN
        )
    };
    conn.execute_batch("BEGIN")
        .map_err(|e| Error::Filtering(format!("Failed to begin transaction: {}", e)))?;
    let mut read_old_stmt = conn.prepare(&read_old_sql)?;
    let mut fts_del_stmt = conn.prepare(&fts_delete_sql)?;
    let mut content_upsert_stmt = conn.prepare(&content_upsert_sql)?;
    let mut fts_ins_stmt = conn.prepare(&fts_insert_sql)?;
    let mut meta_stmt = conn.prepare(&meta_select_sql)?;
    for &doc_id in doc_ids {
        // Remove the stale index entry, if this id was indexed before.
        if let Ok(old_text) = read_old_stmt.query_row([doc_id], |row| row.get::<_, String>(0)) {
            fts_del_stmt
                .execute(rusqlite::params![doc_id, old_text])
                .map_err(|e| {
                    Error::Filtering(format!("Failed to delete old FTS5 row {}: {}", doc_id, e))
                })?;
        }
        // Rebuild the text from the current METADATA row: each column is
        // tried as TEXT, then INTEGER, then REAL; values of other storage
        // classes and empty strings contribute nothing.
        let new_text: Option<String> = meta_stmt
            .query_row([doc_id], |row| {
                let mut parts = Vec::new();
                for i in 0..columns.len() {
                    if let Ok(s) = row.get::<_, String>(i + 1) {
                        if !s.is_empty() {
                            parts.push(s);
                        }
                    } else if let Ok(n) = row.get::<_, i64>(i + 1) {
                        parts.push(n.to_string());
                    } else if let Ok(f) = row.get::<_, f64>(i + 1) {
                        parts.push(f.to_string());
                    }
                }
                Ok(parts.join(" "))
            })
            .ok();
        // Re-insert into both tables; ids with no METADATA row are skipped.
        if let Some(text) = new_text {
            content_upsert_stmt
                .execute(rusqlite::params![doc_id, text])
                .map_err(|e| {
                    Error::Filtering(format!("Failed to upsert content row {}: {}", doc_id, e))
                })?;
            fts_ins_stmt
                .execute(rusqlite::params![doc_id, text])
                .map_err(|e| {
                    Error::Filtering(format!("Failed to insert FTS5 row {}: {}", doc_id, e))
                })?;
        }
    }
    // NOTE(review): early `?` returns above leave the transaction open
    // until `conn` drops (which rolls back); an explicit ROLLBACK as in
    // `insert_rows` would be clearer.
    conn.execute_batch("COMMIT")
        .map_err(|e| Error::Filtering(format!("Failed to commit transaction: {}", e)))?;
    Ok(())
}
/// Drops and rebuilds the entire FTS index from the METADATA table,
/// reusing the tokenizer recorded in the config table (falling back to
/// the default when absent or unrecognized).
pub fn rebuild(index_path: &str) -> Result<()> {
    let db_path = get_db_path(index_path);
    if !db_path.exists() {
        return Ok(());
    }
    let conn = crate::filtering::open_db(&db_path)?;
    // Recover the tokenizer that built the previous index, if recorded.
    let tokenizer = conn
        .query_row(
            &format!(
                "SELECT value FROM \"{}\" WHERE key = 'tokenizer'",
                FTS_CONFIG_TABLE
            ),
            [],
            |row| row.get::<_, String>(0),
        )
        .ok()
        .and_then(|s| FtsTokenizer::from_config_str(&s))
        .unwrap_or_default();
    conn.execute_batch("BEGIN")
        .map_err(|e| Error::Filtering(format!("Failed to begin transaction: {}", e)))?;
    // Start from scratch: drop both tables, then recreate them empty.
    conn.execute(&format!("DROP TABLE IF EXISTS \"{}\"", FTS_TABLE), [])
        .map_err(|e| Error::Filtering(format!("Failed to drop FTS5 table: {}", e)))?;
    conn.execute(
        &format!("DROP TABLE IF EXISTS \"{}\"", FTS_CONTENT_TABLE),
        [],
    )
    .map_err(|e| Error::Filtering(format!("Failed to drop content table: {}", e)))?;
    ensure_tables(&conn, &tokenizer)?;
    // Discover METADATA's columns (PRAGMA table_info index 1 is the column
    // name), excluding the internal subset column.
    let mut columns: Vec<String> = Vec::new();
    {
        let mut stmt = conn.prepare("PRAGMA table_info(METADATA)")?;
        let rows = stmt.query_map([], |row| row.get::<_, String>(1))?;
        for row in rows {
            let col = row?;
            if col != SUBSET_COLUMN {
                columns.push(col);
            }
        }
    }
    if columns.is_empty() {
        // No user columns: still create a content row (empty text) per doc
        // id so every rowid exists in the content table.
        let sql = format!(
            "INSERT INTO \"{}\"(rowid, \"{}\") SELECT \"{}\", '' FROM METADATA ORDER BY \"{}\"",
            FTS_CONTENT_TABLE, FTS_CONTENT_COLUMN, SUBSET_COLUMN, SUBSET_COLUMN
        );
        conn.execute(&sql, [])
            .map_err(|e| Error::Filtering(format!("Failed to populate content table: {}", e)))?;
    } else {
        let col_refs: Vec<String> = columns.iter().map(|c| format!("\"{}\"", c)).collect();
        let select_sql = format!(
            "SELECT \"{}\", {} FROM METADATA ORDER BY \"{}\"",
            SUBSET_COLUMN,
            col_refs.join(", "),
            SUBSET_COLUMN
        );
        let mut select_stmt = conn.prepare(&select_sql)?;
        let mut rows = select_stmt.query([])?;
        let insert_sql = format!(
            "INSERT INTO \"{}\"(rowid, \"{}\") VALUES (?, ?)",
            FTS_CONTENT_TABLE, FTS_CONTENT_COLUMN
        );
        let mut insert_stmt = conn.prepare(&insert_sql)?;
        while let Some(row) = rows.next()? {
            let doc_id: i64 = row.get(0)?;
            // Each column is tried as TEXT, then INTEGER, then REAL; values
            // of other storage classes and empty strings contribute nothing.
            let mut parts = Vec::new();
            for i in 0..columns.len() {
                if let Ok(s) = row.get::<_, String>(i + 1) {
                    if !s.is_empty() {
                        parts.push(s);
                    }
                } else if let Ok(n) = row.get::<_, i64>(i + 1) {
                    parts.push(n.to_string());
                } else if let Ok(f) = row.get::<_, f64>(i + 1) {
                    parts.push(f.to_string());
                }
            }
            let text = parts.join(" ");
            insert_stmt
                .execute(rusqlite::params![doc_id, text])
                .map_err(|e| Error::Filtering(format!("Failed to insert content row: {}", e)))?;
        }
    }
    // FTS5 external-content 'rebuild' command: re-tokenizes everything now
    // present in the content table.
    conn.execute(
        &format!(
            "INSERT INTO \"{}\"(\"{}\") VALUES('rebuild')",
            FTS_TABLE, FTS_TABLE
        ),
        [],
    )
    .map_err(|e| Error::Filtering(format!("FTS5 rebuild failed: {}", e)))?;
    conn.execute_batch("COMMIT")
        .map_err(|e| Error::Filtering(format!("Failed to commit transaction: {}", e)))?;
    Ok(())
}
/// Converts free-form user input into a safe FTS5 MATCH expression.
///
/// Each whitespace-separated word is stripped of leading/trailing
/// non-alphanumeric characters, FTS5 boolean operators (any case) are
/// dropped, and each remaining word is emitted as a quoted phrase with
/// embedded quotes doubled, so user input cannot inject query syntax.
pub fn sanitize_fts5_query(query: &str) -> String {
    const OPERATORS: [&str; 4] = ["AND", "OR", "NOT", "NEAR"];
    let mut terms: Vec<String> = Vec::new();
    for token in query.split_whitespace() {
        let core = token.trim_matches(|ch: char| !ch.is_alphanumeric());
        if core.is_empty() || OPERATORS.contains(&core.to_uppercase().as_str()) {
            continue;
        }
        terms.push(format!("\"{}\"", core.replace('"', "\"\"")));
    }
    terms.join(" ")
}
/// RRF smoothing constant; larger values flatten rank-based weighting.
const RRF_K: f32 = 60.0;

/// Reciprocal-rank fusion of a semantic and a keyword ranking.
///
/// Each list contributes `weight / (RRF_K + rank + 1)` per document, with
/// the semantic list weighted by `alpha` and the keyword list by
/// `1 - alpha`. Returns up to `top_k` ids with fused scores, best first.
pub fn fuse_rrf(sem_ids: &[i64], kw_ids: &[i64], alpha: f32, top_k: usize) -> (Vec<i64>, Vec<f32>) {
    use std::collections::HashMap;
    let mut fused: HashMap<i64, f32> = HashMap::new();
    let mut accumulate = |ids: &[i64], weight: f32| {
        for (rank, &doc_id) in ids.iter().enumerate() {
            *fused.entry(doc_id).or_default() += weight / (RRF_K + rank as f32 + 1.0);
        }
    };
    accumulate(sem_ids, alpha);
    accumulate(kw_ids, 1.0 - alpha);
    let mut ranked: Vec<(i64, f32)> = fused.into_iter().collect();
    ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    ranked.truncate(top_k);
    ranked.into_iter().unzip()
}
/// Weighted fusion of semantic and keyword results after min-max
/// normalizing each score list to [0, 1] independently.
///
/// A list whose scores are all equal normalizes to 1.0 for every entry.
/// The semantic contribution is weighted by `alpha`, the keyword one by
/// `1 - alpha`. Returns up to `top_k` ids with fused scores, best first.
pub fn fuse_relative_score(
    sem_ids: &[i64],
    sem_scores: &[f32],
    kw_ids: &[i64],
    kw_scores: &[f32],
    alpha: f32,
    top_k: usize,
) -> (Vec<i64>, Vec<f32>) {
    use std::collections::HashMap;
    // Pairs each id with its min-max normalized score; empty in, empty out.
    fn min_max_normalize(ids: &[i64], scores: &[f32]) -> Vec<(i64, f32)> {
        if scores.is_empty() {
            return Vec::new();
        }
        let lo = scores.iter().fold(f32::INFINITY, |acc, &s| acc.min(s));
        let hi = scores.iter().fold(f32::NEG_INFINITY, |acc, &s| acc.max(s));
        let span = hi - lo;
        if span == 0.0 {
            return ids.iter().map(|&id| (id, 1.0)).collect();
        }
        ids.iter()
            .zip(scores)
            .map(|(&id, &s)| (id, (s - lo) / span))
            .collect()
    }
    let mut fused: HashMap<i64, f32> = HashMap::new();
    for (doc_id, s) in min_max_normalize(sem_ids, sem_scores) {
        *fused.entry(doc_id).or_default() += alpha * s;
    }
    for (doc_id, s) in min_max_normalize(kw_ids, kw_scores) {
        *fused.entry(doc_id).or_default() += (1.0 - alpha) * s;
    }
    let mut ranked: Vec<(i64, f32)> = fused.into_iter().collect();
    ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    ranked.truncate(top_k);
    ranked.into_iter().unzip()
}
/// Produces a temp-table name unique within this process: a process-wide
/// atomic counter plus the pid keep names from colliding across
/// concurrent connections and across processes.
fn make_temp_table_name(prefix: &str) -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    static COUNTER: AtomicU64 = AtomicU64::new(0);
    let serial = COUNTER.fetch_add(1, Ordering::Relaxed);
    format!("_tmp_{}_{}_{}", prefix, std::process::id(), serial)
}
const SQLITE_PARAM_LIMIT: usize = 900;
type InClause = (String, Vec<Box<dyn ToSql>>, Option<String>);
pub fn build_in_clause(conn: &Connection, ids: &[i64]) -> Result<InClause> {
if ids.len() <= SQLITE_PARAM_LIMIT {
let placeholders: Vec<&str> = std::iter::repeat_n("?", ids.len()).collect();
let sql = format!("IN ({})", placeholders.join(", "));
let params: Vec<Box<dyn ToSql>> = ids
.iter()
.map(|&id| Box::new(id) as Box<dyn ToSql>)
.collect();
Ok((sql, params, None))
} else {
let table_name = make_temp_table_name("in");
conn.execute(
&format!(
"CREATE TEMP TABLE \"{}\" (id INTEGER PRIMARY KEY)",
table_name
),
[],
)
.map_err(|e| Error::Filtering(format!("Failed to create temp table: {}", e)))?;
let mut ins = conn
.prepare(&format!(
"INSERT OR IGNORE INTO \"{}\"(id) VALUES (?)",
table_name
))
.map_err(|e| Error::Filtering(format!("Failed to prepare temp insert: {}", e)))?;
for &id in ids {
ins.execute([id]).map_err(|e| {
Error::Filtering(format!("Failed to insert into temp table: {}", e))
})?;
}
let sql = format!("IN (SELECT id FROM \"{}\")", table_name);
Ok((sql, Vec::new(), Some(table_name)))
}
}
/// Drops a temp table created by [`build_in_clause`], ignoring any error
/// (cleanup is best-effort).
pub fn drop_temp_table(conn: &Connection, table_name: &str) {
    let sql = format!("DROP TABLE IF EXISTS \"{}\"", table_name);
    let _ = conn.execute(&sql, []);
}
/// Reports whether an FTS5 index has been built for `index_path`.
/// Never errors: a missing database, failed open, or missing table all
/// yield `false`.
pub fn exists(index_path: &str) -> bool {
    let db_path = get_db_path(index_path);
    if !db_path.exists() {
        return false;
    }
    match crate::filtering::open_db(&db_path) {
        Ok(conn) => conn
            .query_row(
                "SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name=?",
                [FTS_TABLE],
                |row| row.get::<_, bool>(0),
            )
            .unwrap_or(false),
        Err(_) => false,
    }
}
fn open_fts_conn(index_path: &str) -> Result<Connection> {
let db_path = get_db_path(index_path);
if !db_path.exists() {
return Err(Error::Filtering(format!(
"No metadata database found at {}",
db_path.display()
)));
}
let conn = crate::filtering::open_db(&db_path)?;
let fts_exists: bool = conn
.query_row(
"SELECT COUNT(*) > 0 FROM sqlite_master WHERE type='table' AND name=?",
[FTS_TABLE],
|row| row.get(0),
)
.unwrap_or(false);
if !fts_exists {
return Err(Error::Filtering(
"FTS5 index not found. Re-create metadata to build the full-text search index.".into(),
));
}
Ok(conn)
}
fn collect_fts_results(
stmt: &mut rusqlite::Statement,
params: &[&dyn ToSql],
) -> Result<QueryResult> {
let rows = stmt
.query_map(params_from_iter(params.iter().copied()), |row| {
Ok((row.get::<_, i64>(0)?, row.get::<_, f32>(1)?))
})
.map_err(|e| Error::Filtering(format!("FTS5 query failed: {}", e)))?;
let mut passage_ids = Vec::new();
let mut scores = Vec::new();
for row in rows {
let (doc_id, score) =
row.map_err(|e| Error::Filtering(format!("Failed to read FTS5 result: {}", e)))?;
passage_ids.push(doc_id);
scores.push(score);
}
Ok(QueryResult {
query_id: 0,
passage_ids,
scores,
})
}
/// Runs a raw FTS5 MATCH query, returning up to `top_k` hits ranked by
/// BM25 (negated so higher is better). An empty query yields no hits.
pub fn search(index_path: &str, query: &str, top_k: usize) -> Result<QueryResult> {
    if query.is_empty() {
        return Ok(QueryResult {
            query_id: 0,
            passage_ids: Vec::new(),
            scores: Vec::new(),
        });
    }
    let conn = open_fts_conn(index_path)?;
    // bm25() returns lower-is-better; negate so ORDER BY DESC ranks best first.
    let sql = format!(
        "SELECT rowid, CAST(-bm25(\"{}\") AS REAL) AS score \
        FROM \"{}\" WHERE \"{}\" MATCH ? ORDER BY score DESC LIMIT ?",
        FTS_TABLE, FTS_TABLE, FTS_TABLE
    );
    let mut stmt = conn
        .prepare(&sql)
        .map_err(|e| Error::Filtering(format!("Failed to prepare FTS5 query: {}", e)))?;
    let limit = top_k as i64;
    collect_fts_results(&mut stmt, &[&query as &dyn ToSql, &limit])
}
/// Runs an FTS5 MATCH query restricted to `subset` doc ids, returning up
/// to `top_k` hits ranked by BM25 (negated so higher is better).
///
/// An empty query or empty subset yields no hits. Large subsets are
/// staged in a temp table by [`build_in_clause`]; the table is dropped on
/// every exit path after it is created, including prepare failures.
pub fn search_filtered(
    index_path: &str,
    query: &str,
    top_k: usize,
    subset: &[i64],
) -> Result<QueryResult> {
    if subset.is_empty() || query.is_empty() {
        return Ok(QueryResult {
            query_id: 0,
            passage_ids: vec![],
            scores: vec![],
        });
    }
    let conn = open_fts_conn(index_path)?;
    let (in_clause, in_params, temp_table) = build_in_clause(&conn, subset)?;
    // Run the query inside a closure so the cleanup below covers every
    // error path (previously a failed `prepare` returned early and skipped
    // dropping the temp table, leaving it around until the connection closed).
    let result = (|| -> Result<QueryResult> {
        let sql = format!(
            "SELECT rowid, CAST(-bm25(\"{}\") AS REAL) AS score \
            FROM \"{}\" WHERE \"{}\" MATCH ? AND rowid {} ORDER BY score DESC LIMIT ?",
            FTS_TABLE, FTS_TABLE, FTS_TABLE, in_clause
        );
        // Parameter order must mirror the SQL: MATCH text, IN-clause ids, LIMIT.
        let mut params: Vec<Box<dyn ToSql>> = Vec::with_capacity(in_params.len() + 2);
        params.push(Box::new(query.to_string()));
        params.extend(in_params);
        params.push(Box::new(top_k as i64));
        let param_refs: Vec<&dyn ToSql> = params.iter().map(|v| v.as_ref()).collect();
        let mut stmt = conn
            .prepare(&sql)
            .map_err(|e| Error::Filtering(format!("Failed to prepare FTS5 query: {}", e)))?;
        collect_fts_results(&mut stmt, &param_refs)
    })();
    if let Some(ref table_name) = temp_table {
        drop_temp_table(&conn, table_name);
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;
    use tempfile::TempDir;

    /// Creates a temp-dir metadata DB plus FTS index with the default
    /// (unicode61) tokenizer; doc ids are `0..metadata.len()`.
    fn setup_with_metadata(metadata: &[Value]) -> (TempDir, String) {
        setup_with_metadata_tokenizer(metadata, &FtsTokenizer::default())
    }

    /// Same as `setup_with_metadata` but with an explicit tokenizer.
    fn setup_with_metadata_tokenizer(
        metadata: &[Value],
        tokenizer: &FtsTokenizer,
    ) -> (TempDir, String) {
        let dir = TempDir::new().unwrap();
        let path = dir.path().to_str().unwrap().to_string();
        let doc_ids: Vec<i64> = (0..metadata.len() as i64).collect();
        crate::filtering::create(&path, metadata, &doc_ids).unwrap();
        index(&path, metadata, &doc_ids, tokenizer).unwrap();
        (dir, path)
    }

    // Scalar values (strings and numbers) all appear in the flattened text.
    #[test]
    fn test_metadata_to_text() {
        let meta = json!({"title": "Hello World", "content": "test", "n": 42});
        let text = metadata_to_text(&meta);
        assert!(text.contains("Hello World"));
        assert!(text.contains("test"));
        assert!(text.contains("42"));
    }

    // Arrays and nested objects are traversed recursively.
    #[test]
    fn test_metadata_to_text_nested() {
        let meta = json!({"title": "Doc", "tags": ["rust", "search"], "a": {"b": "deep"}});
        let text = metadata_to_text(&meta);
        assert!(text.contains("Doc"));
        assert!(text.contains("rust"));
        assert!(text.contains("deep"));
    }

    // JSON nulls are dropped rather than rendered as the string "null".
    #[test]
    fn test_metadata_to_text_nulls_skipped() {
        let text = metadata_to_text(&json!({"a": "yes", "b": null}));
        assert!(text.contains("yes"));
        assert!(!text.contains("null"));
    }

    // Multi-term queries match docs containing the terms; scores are the
    // negated BM25 values, so hits come back positive.
    #[test]
    fn test_search_basic() {
        let metadata = vec![
            json!({"title": "The quick brown fox", "body": "jumps over the lazy dog"}),
            json!({"title": "A fast brown car", "body": "drives over the bridge"}),
            json!({"title": "The fox is clever", "body": "and quick at hunting"}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        let result = search(&path, "quick fox", 10).unwrap();
        assert!(!result.passage_ids.is_empty());
        assert!(result.passage_ids.contains(&0));
        assert!(result.passage_ids.contains(&2));
        for &s in &result.scores {
            assert!(s > 0.0, "BM25 scores should be positive, got {s}");
        }
    }

    // A term absent from every document yields an empty result, not an error.
    #[test]
    fn test_search_no_results() {
        let (_dir, path) = setup_with_metadata(&[json!({"title": "hello world"})]);
        let result = search(&path, "nonexistent", 10).unwrap();
        assert!(result.passage_ids.is_empty());
    }

    // LIMIT is honored even when more documents match.
    #[test]
    fn test_search_top_k_limit() {
        let metadata: Vec<Value> = (0..20)
            .map(|i| json!({"c": format!("document about search {i}")}))
            .collect();
        let (_dir, path) = setup_with_metadata(&metadata);
        let result = search(&path, "search", 5).unwrap();
        assert!(result.passage_ids.len() <= 5);
    }

    // A second `index` call adds new docs without disturbing existing ones.
    #[test]
    fn test_search_after_incremental_index() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().to_str().unwrap();
        let tok = FtsTokenizer::default();
        let m1 = vec![json!({"title": "cats are great"})];
        let ids1: Vec<i64> = vec![0];
        crate::filtering::create(path, &m1, &ids1).unwrap();
        index(path, &m1, &ids1, &tok).unwrap();
        assert_eq!(search(path, "cats", 10).unwrap().passage_ids.len(), 1);
        let m2 = vec![json!({"title": "dogs are great"})];
        let ids2: Vec<i64> = vec![1];
        crate::filtering::update(path, &m2, &ids2).unwrap();
        index(path, &m2, &ids2, &tok).unwrap();
        assert_eq!(search(path, "dogs", 10).unwrap().passage_ids[0], 1);
        assert_eq!(search(path, "great", 10).unwrap().passage_ids.len(), 2);
    }

    // `delete` removes only the targeted doc; surviving ids are unchanged.
    #[test]
    fn test_delete_incremental() {
        let metadata = vec![
            json!({"title": "Alpha document"}),
            json!({"title": "Beta document"}),
            json!({"title": "Gamma document"}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        delete(&path, &[1]).unwrap();
        assert!(search(&path, "Beta", 10).unwrap().passage_ids.is_empty());
        assert_eq!(search(&path, "Alpha", 10).unwrap().passage_ids, vec![0]);
        assert_eq!(search(&path, "Gamma", 10).unwrap().passage_ids, vec![2]);
    }

    // After filtering::delete compacts the METADATA rows, `rebuild`
    // re-derives the index; the surviving docs take the compacted ids.
    #[test]
    fn test_search_after_delete_and_rebuild() {
        let metadata = vec![
            json!({"title": "Alpha document"}),
            json!({"title": "Beta document"}),
            json!({"title": "Gamma document"}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        crate::filtering::delete(&path, &[1]).unwrap();
        rebuild(&path).unwrap();
        let r = search(&path, "Alpha", 10).unwrap();
        assert_eq!(r.passage_ids, vec![0]);
        let r = search(&path, "Gamma", 10).unwrap();
        assert_eq!(r.passage_ids, vec![1]);
        assert!(search(&path, "Beta", 10).unwrap().passage_ids.is_empty());
    }

    // Subset filtering excludes matching docs outside the allowed ids.
    #[test]
    fn test_search_filtered() {
        let metadata = vec![
            json!({"title": "rust programming language"}),
            json!({"title": "python programming language"}),
            json!({"title": "rust systems programming"}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        let result = search_filtered(&path, "programming", 10, &[0, 1]).unwrap();
        assert!(result.passage_ids.contains(&0));
        assert!(result.passage_ids.contains(&1));
        assert!(!result.passage_ids.contains(&2));
    }

    // Docs with empty metadata objects are indexed (empty text) but never match.
    #[test]
    fn test_search_with_empty_metadata() {
        let metadata = vec![json!({}), json!({"title": "hello world"})];
        let (_dir, path) = setup_with_metadata(&metadata);
        let result = search(&path, "hello", 10).unwrap();
        assert_eq!(result.passage_ids, vec![1]);
    }

    // Numeric metadata values are searchable via their string form.
    #[test]
    fn test_search_numeric_metadata() {
        let metadata = vec![
            json!({"label": "item", "price": 42}),
            json!({"label": "other", "price": 99}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        let result = search(&path, "42", 10).unwrap();
        assert_eq!(result.passage_ids, vec![0]);
    }

    // A metadata DB without FTS tables produces the dedicated error message.
    #[test]
    fn test_no_fts_table_error() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().to_str().unwrap();
        let db_path = std::path::Path::new(path).join(crate::filtering::METADATA_DB_NAME);
        let conn = Connection::open(&db_path).unwrap();
        conn.execute(
            &format!(
                "CREATE TABLE METADATA (\"{}\" INTEGER PRIMARY KEY)",
                SUBSET_COLUMN
            ),
            [],
        )
        .unwrap();
        drop(conn);
        let result = search(path, "test", 10);
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("FTS5 index not found"));
    }

    // `exists` is true only after an index has been built.
    #[test]
    fn test_exists() {
        let metadata = vec![json!({"title": "hello"})];
        let (_dir, path) = setup_with_metadata(&metadata);
        assert!(exists(&path));
        let dir2 = TempDir::new().unwrap();
        assert!(!exists(dir2.path().to_str().unwrap()));
    }

    // filtering::update_where (which drives update_rows) keeps the FTS
    // index in sync with in-place metadata edits.
    #[test]
    fn test_update_rows_syncs_fts() {
        let metadata = vec![
            json!({"title": "old cats document"}),
            json!({"title": "old dogs document"}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        assert_eq!(search(&path, "cats", 10).unwrap().passage_ids, vec![0]);
        assert_eq!(search(&path, "dogs", 10).unwrap().passage_ids, vec![1]);
        crate::filtering::update_where(
            &path,
            "\"_subset_\" = ?",
            &[json!(0)],
            &json!({"title": "new elephants document"}),
        )
        .unwrap();
        assert!(search(&path, "cats", 10).unwrap().passage_ids.is_empty());
        assert_eq!(search(&path, "elephants", 10).unwrap().passage_ids, vec![0]);
        assert_eq!(search(&path, "dogs", 10).unwrap().passage_ids, vec![1]);
    }

    // A WHERE clause matching several rows re-syncs all of them at once.
    #[test]
    fn test_update_rows_multiple() {
        let metadata = vec![
            json!({"category": "A", "content": "hello world"}),
            json!({"category": "A", "content": "hello rust"}),
            json!({"category": "B", "content": "hello python"}),
        ];
        let (_dir, path) = setup_with_metadata(&metadata);
        crate::filtering::update_where(
            &path,
            "category = ?",
            &[json!("A")],
            &json!({"content": "goodbye universe"}),
        )
        .unwrap();
        let r = search(&path, "hello", 10).unwrap();
        assert_eq!(r.passage_ids, vec![2]);
        let r = search(&path, "goodbye", 10).unwrap();
        assert!(r.passage_ids.contains(&0));
        assert!(r.passage_ids.contains(&1));
        assert_eq!(r.passage_ids.len(), 2);
    }

    // The trigram tokenizer matches substrings inside identifiers.
    #[test]
    fn test_trigram_substring_match() {
        let metadata = vec![
            json!({"func": "parse_arguments", "file": "cli.rs"}),
            json!({"func": "render_template", "file": "views.rs"}),
            json!({"func": "validate_input", "file": "forms.rs"}),
        ];
        let (_dir, path) = setup_with_metadata_tokenizer(&metadata, &FtsTokenizer::Trigram);
        let r = search(&path, "arg", 10).unwrap();
        assert!(
            r.passage_ids.contains(&0),
            "trigram should match 'arg' in 'parse_arguments'"
        );
        let r = search(&path, "templ", 10).unwrap();
        assert!(
            r.passage_ids.contains(&1),
            "trigram should match 'templ' in 'render_template'"
        );
    }

    // Trigram matching works for code-style symbols with punctuation.
    #[test]
    fn test_trigram_code_identifiers() {
        let metadata = vec![
            json!({"symbol": "HashMap::insert"}),
            json!({"symbol": "BTreeMap::entry"}),
            json!({"symbol": "Vec::push"}),
        ];
        let (_dir, path) = setup_with_metadata_tokenizer(&metadata, &FtsTokenizer::Trigram);
        let r = search(&path, "Map", 10).unwrap();
        assert!(r.passage_ids.contains(&0));
        assert!(r.passage_ids.contains(&1));
        assert!(!r.passage_ids.contains(&2));
    }

    // Re-indexing with a different tokenizer drops and rebuilds the FTS
    // tables (the ensure_tables tokenizer-mismatch path).
    #[test]
    fn test_tokenizer_mismatch_triggers_rebuild() {
        let metadata = vec![
            json!({"title": "parse_arguments function"}),
            json!({"title": "render_template function"}),
        ];
        let (_dir, path) = setup_with_metadata_tokenizer(&metadata, &FtsTokenizer::Unicode61);
        let r = search(&path, "arg", 10).unwrap();
        assert!(
            r.passage_ids.is_empty(),
            "unicode61 should not match substring 'arg'"
        );
        let doc_ids: Vec<i64> = (0..metadata.len() as i64).collect();
        index(&path, &metadata, &doc_ids, &FtsTokenizer::Trigram).unwrap();
        let r = search(&path, "arg", 10).unwrap();
        assert!(
            r.passage_ids.contains(&0),
            "after switching to trigram, 'arg' should match"
        );
    }
}