/// Turns free-form user input into a string that can be passed to an SQLite
/// FTS5 `MATCH` clause without tripping over the FTS5 query syntax.
///
/// The input is split on every character that is not alphanumeric (as defined
/// by [`char::is_alphanumeric`], so accented letters survive while quotes,
/// parentheses, colons, hyphens, underscores, ... act as separators). Tokens
/// shorter than two characters are dropped. The surviving tokens are joined
/// with single spaces and a `*` is appended to the last one so that it is
/// treated as a prefix query.
///
/// Tokens that are exactly `AND`, `OR` or `NOT` are emitted in lower case:
/// FTS5 recognises those barewords as boolean operators only in upper case,
/// so passing them through verbatim would either change the meaning of the
/// query or make it a syntax error (e.g. `foo AND*`).
///
/// Returns an empty string when no token survives; callers are expected to
/// skip the `MATCH` in that case.
pub fn sanitize_fts_query(input: &str) -> String {
    // Every separator run in `input` is at least one byte and is replaced by a
    // single space, so the result never exceeds `input.len()` plus the `*`.
    let mut out = String::with_capacity(input.len() + 1);

    let tokens = input
        .split(|c: char| !c.is_alphanumeric())
        // Keep only tokens with at least two characters.
        .filter(|token| token.chars().nth(1).is_some());

    for token in tokens {
        if !out.is_empty() {
            out.push(' ');
        }
        // Neutralise the case-sensitive FTS5 operator keywords. Lower-casing
        // assumes a case-folding tokenizer (such as `unicode61`), for which
        // the lower-case form matches the same terms.
        out.push_str(match token {
            "AND" => "and",
            "OR" => "or",
            "NOT" => "not",
            other => other,
        });
    }

    // Prefix wildcard on the final token only.
    if !out.is_empty() {
        out.push('*');
    }
    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that sanitizing `input` produces exactly `expected`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        assert_eq!(sanitize_fts_query(input), expected);
    }

    // ---- inputs that yield no usable token --------------------------------

    #[test]
    fn empty_input_returns_empty() {
        check("", "");
    }

    #[test]
    fn whitespace_only_returns_empty() {
        check(" ", "");
    }

    #[test]
    fn all_punctuation_returns_empty() {
        for input in &["()", ":-/", "\"\""] {
            check(input, "");
        }
    }

    // ---- tokenisation and the trailing prefix wildcard --------------------

    #[test]
    fn simple_word_gets_prefix() {
        check("pricing", "pricing*");
    }

    #[test]
    fn colon_meeting_name_x1_wealth() {
        check("x1: wealth", "x1 wealth*");
    }

    #[test]
    fn hyphenated_foo_bar() {
        check("foo-bar", "foo bar*");
    }

    #[test]
    fn apostrophe_splits_to_match_fts5_tokenizer() {
        // The lone "s" after the apostrophe is a one-character token and is dropped.
        check("Ryan's", "Ryan*");
    }

    #[test]
    fn apostrophe_ryans_standup() {
        check("Ryan's stand-up", "Ryan stand up*");
    }

    #[test]
    fn single_char_tokens_dropped() {
        for &(input, expected) in &[("I a", ""), ("x1 a", "x1*")] {
            check(input, expected);
        }
    }

    #[test]
    fn slash_mat_cathryn() {
        check("mat/cathryn", "mat cathryn*");
    }

    #[test]
    fn quoted_input_no_crash() {
        check("\"quoted text\"", "quoted text*");
    }

    #[test]
    fn parens_no_crash() {
        check("foo (bar) baz", "foo bar baz*");
    }

    #[test]
    fn unicode_diacritics_preserved() {
        check("café résumé", "café résumé*");
    }

    #[test]
    fn prefix_wildcard_only_on_last_token() {
        let sanitized = sanitize_fts_query("alpha beta gamma");
        assert_eq!(sanitized, "alpha beta gamma*");
        let stars = sanitized.chars().filter(|&c| c == '*').count();
        assert_eq!(stars, 1);
    }

    #[test]
    fn dangling_quote_not_a_crash() {
        check("\"x1", "x1*");
    }

    #[test]
    fn underscore_not_treated_as_punctuation() {
        // Despite the test's name, the expected output shows `_` acting as a
        // token separator.
        check("foo_bar", "foo bar*");
    }

    // ---- round trip through a real FTS5 table -----------------------------

    /// Creates an in-memory FTS5 table holding a single row and checks that
    /// every sample input sanitizes to a non-empty query that matches
    /// exactly that row.
    #[test]
    fn integration_sanitized_query_matches_document() {
        use rusqlite::{params, Connection};

        // `\n\` keeps the newlines of the statement while letting the source
        // be indented: the continuation swallows the leading whitespace.
        const SCHEMA: &str = "CREATE VIRTUAL TABLE ft USING fts5(\n\
            title, body,\n\
            tokenize='porter unicode61 remove_diacritics 2',\n\
            prefix='2 3 4'\n\
            );";

        let db = Connection::open_in_memory().expect("open in-memory db");
        db.execute_batch(SCHEMA).expect("create fts5 table");
        db.execute(
            "INSERT INTO ft (rowid, title, body) VALUES (1, ?, ?)",
            params![
                "X1: Wealth Strategy Call",
                "talked about pricing tiers and Ryan's stand-up feedback"
            ],
        )
        .expect("insert");

        let samples = [
            "x1: wealth",
            "x1",
            "wealth",
            "Ryan",
            "stand-up",
            "pricing",
            "tier",
            "weal",
        ];

        for input in samples.iter() {
            let query = sanitize_fts_query(input);
            assert!(
                !query.is_empty(),
                "input {:?} should sanitize to non-empty",
                input
            );

            let hits: i64 = match db.query_row(
                "SELECT COUNT(*) FROM ft WHERE ft MATCH ?",
                [&query],
                |row| row.get(0),
            ) {
                Ok(count) => count,
                Err(e) => panic!("MATCH {:?} (sanitized {:?}) failed: {}", input, query, e),
            };

            assert_eq!(
                hits, 1,
                "input {:?} (sanitized {:?}) should match the row",
                input, query
            );
        }
    }

    #[test]
    fn integration_punctuation_only_does_not_error() {
        // Only the sanitizer output is checked here; no database is involved.
        check("()", "");
    }
}