Skip to main content

coding_agent_search/pages/
fts.rs

//! FTS5 Query Utilities for Pages Export
//!
//! Provides query escaping and formatting for safe FTS5 search queries
//! in the exported SQLite database. Supports both natural language (porter)
//! and code-aware (unicode61) search modes.
6
/// Turn free-form user input into a string that is safe to bind to FTS5 `MATCH`.
///
/// The input is split on whitespace. Every resulting term is emitted as a
/// double-quoted FTS5 string, with any embedded `"` doubled, and the quoted
/// terms are joined with single spaces. Quoting each term means characters
/// such as `*`, `:`, `(` or `-` are carried inside the string literal rather
/// than appearing as bare query syntax.
///
/// An empty or whitespace-only input yields an empty string.
///
/// # Examples
///
/// ```
/// use coding_agent_search::pages::fts::escape_fts5_query;
///
/// // Two plain terms
/// assert_eq!(escape_fts5_query("hello world"), r#""hello" "world""#);
///
/// // Embedded quote is doubled
/// assert_eq!(escape_fts5_query("foo\"bar"), r#""foo""bar""#);
///
/// // Identifiers pass through unchanged inside the quotes
/// assert_eq!(escape_fts5_query("my_function"), r#""my_function""#);
/// ```
pub fn escape_fts5_query(query: &str) -> String {
    // Every term gains at least two quote characters, so this is only a
    // starting capacity; the buffer grows as needed.
    let mut escaped = String::with_capacity(query.len() + 2);

    for term in query.split_whitespace() {
        if !escaped.is_empty() {
            escaped.push(' ');
        }

        escaped.push('"');
        for ch in term.chars() {
            // A literal quote inside an FTS5 string is written as two quotes.
            if ch == '"' {
                escaped.push('"');
            }
            escaped.push(ch);
        }
        escaped.push('"');
    }

    escaped
}
35
/// Selects which FTS5 index a search is routed to.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Fts5SearchMode {
    /// Prose-oriented search using the porter stemmer (`messages_fts`).
    ///
    /// Suited to English prose, documentation, and explanations.
    NaturalLanguage,
    /// Identifier-oriented search using unicode61 with special tokenchars
    /// (`messages_code_fts`).
    ///
    /// Suited to identifiers, file paths, snake_case, and camelCase.
    Code,
    /// Pick one of the other two modes by inspecting the query text.
    ///
    /// This is the default variant.
    #[default]
    Auto,
}
49
50/// Detect the appropriate search mode based on query content.
51///
52/// Returns `Code` mode if the query contains:
53/// - Underscores (snake_case identifiers)
54/// - Dots (file extensions, method calls)
55/// - camelCase patterns (lowercase followed by uppercase)
56/// - File path separators
57/// - Colons (namespaces, type annotations)
58/// - Hashes (CSS selectors, preprocessor directives)
59/// - At-signs (decorators, email-like patterns)
60/// - Dollar signs (variables in shell/PHP)
61/// - Percent signs (URL encoding, format specifiers)
62/// - Hyphens between letters (kebab-case)
63///
64/// Uses prose indicators to avoid false positives:
65/// - Question words (how, what, why, when, where)
66/// - Common articles (the, is, are, was, were)
67/// - Multiple space-separated words (>3 words)
68///
69/// Otherwise returns `NaturalLanguage` mode.
70///
71/// # Examples
72///
73/// ```
74/// use coding_agent_search::pages::fts::{detect_search_mode, Fts5SearchMode};
75///
76/// assert_eq!(detect_search_mode("hello world"), Fts5SearchMode::NaturalLanguage);
77/// assert_eq!(detect_search_mode("my_function"), Fts5SearchMode::Code);
78/// assert_eq!(detect_search_mode("AuthController.ts"), Fts5SearchMode::Code);
79/// assert_eq!(detect_search_mode("getUserById"), Fts5SearchMode::Code);
80/// assert_eq!(detect_search_mode("std::io::Result"), Fts5SearchMode::Code);
81/// assert_eq!(detect_search_mode("my-component"), Fts5SearchMode::Code);
82/// assert_eq!(detect_search_mode("how does auth work"), Fts5SearchMode::NaturalLanguage);
83/// ```
84pub fn detect_search_mode(query: &str) -> Fts5SearchMode {
85    // Check for code-like patterns
86    let has_code_chars = query.contains('_')
87        || query.contains('.')
88        || query.contains('/')
89        || query.contains('\\')
90        || query.contains("::")
91        || query.contains('#')
92        || query.contains('@')
93        || query.contains('$')
94        || query.contains('%');
95
96    let has_code_patterns = has_camel_case(query) || has_kebab_case(query);
97
98    let is_code_query = has_code_chars || has_code_patterns;
99
100    // Check for prose indicators (to avoid false positives)
101    let words: Vec<&str> = query.split_whitespace().collect();
102    let word_count = words.len();
103    let lower = query.to_lowercase();
104
105    let has_prose_indicators = word_count > 3
106        || lower.starts_with("how ")
107        || lower.starts_with("what ")
108        || lower.starts_with("why ")
109        || lower.starts_with("when ")
110        || lower.starts_with("where ")
111        || lower.contains(" the ")
112        || lower.contains(" is ")
113        || lower.contains(" are ")
114        || lower.contains(" was ")
115        || lower.contains(" were ");
116
117    // Code patterns win unless prose indicators are strong
118    if is_code_query && !has_prose_indicators {
119        Fts5SearchMode::Code
120    } else if has_prose_indicators && !is_code_query {
121        Fts5SearchMode::NaturalLanguage
122    } else if is_code_query {
123        // Both indicators present - code chars are more specific
124        Fts5SearchMode::Code
125    } else {
126        Fts5SearchMode::NaturalLanguage
127    }
128}
129
/// Report whether `s` contains a hyphen with an alphabetic character
/// immediately on each side (the kebab-case shape, e.g. `a-b`).
///
/// Leading or trailing hyphens and hyphens between non-letters such as
/// digits (`1-2`) do not count.
fn has_kebab_case(s: &str) -> bool {
    let symbols: Vec<char> = s.chars().collect();

    // Slide over every run of three consecutive characters and look for
    // letter, hyphen, letter.
    symbols.windows(3).any(|triple| {
        matches!(
            triple,
            [left, '-', right] if left.is_alphabetic() && right.is_alphabetic()
        )
    })
}
140
/// Report whether `s` contains a lowercase character immediately followed
/// by an uppercase character (the camelCase transition, e.g. `aB`).
fn has_camel_case(s: &str) -> bool {
    // Pair each character with its successor and test every adjacent pair.
    let successors = s.chars().skip(1);
    s.chars()
        .zip(successors)
        .any(|(current, next)| current.is_lowercase() && next.is_uppercase())
}
151
152/// Format a query for the appropriate FTS5 table based on mode.
153///
154/// Returns a tuple of (table_name, escaped_query).
155///
156/// # Examples
157///
158/// ```
159/// use coding_agent_search::pages::fts::{format_fts5_query, Fts5SearchMode};
160///
161/// let (table, query) = format_fts5_query("error handling", Fts5SearchMode::NaturalLanguage);
162/// assert_eq!(table, "messages_fts");
163///
164/// let (table, query) = format_fts5_query("my_function", Fts5SearchMode::Code);
165/// assert_eq!(table, "messages_code_fts");
166/// ```
167pub fn format_fts5_query(query: &str, mode: Fts5SearchMode) -> (&'static str, String) {
168    let actual_mode = match mode {
169        Fts5SearchMode::Auto => detect_search_mode(query),
170        other => other,
171    };
172
173    let table = match actual_mode {
174        Fts5SearchMode::NaturalLanguage | Fts5SearchMode::Auto => "messages_fts",
175        Fts5SearchMode::Code => "messages_code_fts",
176    };
177
178    (table, escape_fts5_query(query))
179}
180
/// Assemble the SQL text for a ranked FTS5 search.
///
/// The statement selects from the given FTS5 table joined to `messages` and
/// `conversations`, returns a highlighted `snippet(...)` and a `bm25(...)`
/// score, and orders by that score. All values are bound via `?`
/// placeholders, in this order: the MATCH expression, the agent (only when
/// `with_agent_filter` is set), the limit, and the offset.
///
/// # Arguments
///
/// * `fts_table` - The FTS5 table name ("messages_fts" or "messages_code_fts").
///   It is interpolated into the SQL verbatim, so it must be a trusted
///   identifier and never user input.
/// * `snippet_length` - Passed as the final argument of FTS5 `snippet()`.
///   It is not validated here.
/// * `with_agent_filter` - When `true`, adds `AND c.agent = ?` to the
///   WHERE clause.
///
/// # Example SQL Generated
///
/// ```sql
/// SELECT
///     m.conversation_id,
///     m.id as message_id,
///     m.role,
///     snippet(messages_fts, 0, '<mark>', '</mark>', '...', 64) as snippet,
///     c.agent,
///     c.workspace,
///     c.title,
///     c.started_at,
///     bm25(messages_fts) as score
/// FROM messages_fts
/// JOIN messages m ON messages_fts.rowid = m.id
/// JOIN conversations c ON m.conversation_id = c.id
/// WHERE messages_fts MATCH ?
/// ORDER BY score
/// LIMIT ? OFFSET ?
/// ```
pub fn build_fts5_search_sql(
    fts_table: &str,
    snippet_length: u32,
    with_agent_filter: bool,
) -> String {
    // The two computed columns depend on the table name; the rest are fixed.
    let snippet_column =
        format!("snippet({fts_table}, 0, '<mark>', '</mark>', '...', {snippet_length}) as snippet");
    let score_column = format!("bm25({fts_table}) as score");

    let column_list = [
        "m.conversation_id",
        "m.id as message_id",
        "m.role",
        snippet_column.as_str(),
        "c.agent",
        "c.workspace",
        "c.title",
        "c.started_at",
        score_column.as_str(),
    ]
    .join(",\n    ");

    let agent_clause = if with_agent_filter {
        "\n    AND c.agent = ?"
    } else {
        ""
    };

    format!(
        "SELECT\n    {column_list}\n\
         FROM {fts_table}\n\
         JOIN messages m ON {fts_table}.rowid = m.id\n\
         JOIN conversations c ON m.conversation_id = c.id\n\
         WHERE {fts_table} MATCH ?{agent_clause}\n\
         ORDER BY score\n\
         LIMIT ? OFFSET ?"
    )
}
242
/// Reject blank queries and trim surrounding whitespace from the rest.
///
/// Returns `None` when the query is empty or consists only of whitespace.
/// Otherwise returns `Some` holding the query with leading and trailing
/// whitespace removed. No escaping is performed here.
///
/// # Examples
///
/// ```
/// use coding_agent_search::pages::fts::validate_fts5_query;
///
/// assert_eq!(validate_fts5_query("  hello  "), Some("hello".to_string()));
/// assert_eq!(validate_fts5_query("   "), None);
/// assert_eq!(validate_fts5_query(""), None);
/// ```
pub fn validate_fts5_query(query: &str) -> Option<String> {
    let cleaned = query.trim();
    (!cleaned.is_empty()).then(|| cleaned.to_string())
}
265
#[cfg(test)]
mod tests {
    use super::*;

    /// Check that every `(input, expected)` pair holds for `escape_fts5_query`.
    fn assert_escaped(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(escape_fts5_query(input), *expected, "input: {input:?}");
        }
    }

    /// Check that every query is classified as `expected` by `detect_search_mode`.
    fn assert_mode(expected: Fts5SearchMode, queries: &[&str]) {
        for query in queries {
            assert_eq!(detect_search_mode(query), expected, "query: {query:?}");
        }
    }

    #[test]
    fn test_escape_fts5_query_simple() {
        assert_escaped(&[
            ("hello", r#""hello""#),
            ("hello world", r#""hello" "world""#),
        ]);
    }

    #[test]
    fn test_escape_fts5_query_with_quotes() {
        assert_escaped(&[
            // foo"bar: the inner quote is doubled, then the term is wrapped.
            (r#"foo"bar"#, r#""foo""bar""#),
            // say "hello": the second token carries a quote at each end; each
            // one is doubled and the result is wrapped in outer quotes.
            ("say \"hello\"", "\"say\" \"\"\"hello\"\"\""),
        ]);
    }

    #[test]
    fn test_escape_fts5_query_special_chars() {
        // FTS5 operator characters end up inside the quoted term.
        assert_escaped(&[
            ("foo*", r#""foo*""#),
            ("foo+bar", r#""foo+bar""#),
            ("foo-bar", r#""foo-bar""#),
            ("foo:bar", r#""foo:bar""#),
            ("(foo)", r#""(foo)""#),
        ]);
    }

    #[test]
    fn test_escape_fts5_query_empty() {
        assert_escaped(&[("", ""), ("   ", "")]);
    }

    #[test]
    fn test_escape_fts5_query_code_identifiers() {
        assert_escaped(&[
            ("my_function", r#""my_function""#),
            ("AuthController.ts", r#""AuthController.ts""#),
            ("src/lib.rs", r#""src/lib.rs""#),
        ]);
    }

    #[test]
    fn test_detect_search_mode_natural() {
        assert_mode(
            Fts5SearchMode::NaturalLanguage,
            &["hello", "error handling", "running test"],
        );
    }

    #[test]
    fn test_detect_search_mode_code_underscore() {
        assert_mode(Fts5SearchMode::Code, &["my_function", "get_user_by_id"]);
    }

    #[test]
    fn test_detect_search_mode_code_dot() {
        assert_mode(Fts5SearchMode::Code, &["AuthController.ts", "file.rs"]);
    }

    #[test]
    fn test_detect_search_mode_code_camelcase() {
        assert_mode(Fts5SearchMode::Code, &["getUserById", "AuthController"]);
    }

    #[test]
    fn test_detect_search_mode_code_path() {
        assert_mode(Fts5SearchMode::Code, &["src/lib.rs", "path\\to\\file"]);
    }

    #[test]
    fn test_detect_search_mode_code_namespace() {
        assert_mode(Fts5SearchMode::Code, &["std::io::Result", "Vec::new()"]);
    }

    #[test]
    fn test_detect_search_mode_code_kebab() {
        assert_mode(Fts5SearchMode::Code, &["my-component", "button-primary"]);
    }

    #[test]
    fn test_detect_search_mode_code_special_chars() {
        assert_mode(
            Fts5SearchMode::Code,
            &["#define", "@decorator", "$variable", "%s"],
        );
    }

    #[test]
    fn test_detect_search_mode_prose_questions() {
        assert_mode(
            Fts5SearchMode::NaturalLanguage,
            &["how does auth work", "what is the error", "why is it failing"],
        );
    }

    #[test]
    fn test_detect_search_mode_prose_multiword() {
        assert_mode(
            Fts5SearchMode::NaturalLanguage,
            &["the quick brown fox jumps"],
        );
    }

    #[test]
    fn test_has_kebab_case() {
        for matching in ["my-component", "button-primary", "a-b"] {
            assert!(has_kebab_case(matching), "expected kebab-case: {matching:?}");
        }
        for rejected in ["hello", "-start", "end-", "1-2"] {
            assert!(!has_kebab_case(rejected), "unexpected kebab-case: {rejected:?}");
        }
    }

    #[test]
    fn test_format_fts5_query_auto() {
        let routed = [
            ("hello world", "messages_fts"),
            ("my_function", "messages_code_fts"),
        ];
        for (input, expected_table) in routed {
            let (table, _) = format_fts5_query(input, Fts5SearchMode::Auto);
            assert_eq!(table, expected_table, "input: {input:?}");
        }
    }

    #[test]
    fn test_format_fts5_query_explicit() {
        let routed = [
            (Fts5SearchMode::NaturalLanguage, "messages_fts"),
            (Fts5SearchMode::Code, "messages_code_fts"),
        ];
        for (mode, expected_table) in routed {
            let (table, query) = format_fts5_query("running", mode);
            assert_eq!(table, expected_table, "mode: {mode:?}");
            assert_eq!(query, r#""running""#, "mode: {mode:?}");
        }
    }

    #[test]
    fn test_build_fts5_search_sql() {
        let plain = build_fts5_search_sql("messages_fts", 64, false);
        for fragment in [
            "FROM messages_fts",
            "snippet(messages_fts",
            "bm25(messages_fts)",
            "WHERE messages_fts MATCH ?",
        ] {
            assert!(plain.contains(fragment), "missing fragment: {fragment:?}");
        }
        assert!(!plain.contains("AND c.agent = ?"));

        let filtered = build_fts5_search_sql("messages_code_fts", 32, true);
        for fragment in ["FROM messages_code_fts", "AND c.agent = ?"] {
            assert!(filtered.contains(fragment), "missing fragment: {fragment:?}");
        }
    }

    #[test]
    fn test_validate_fts5_query() {
        for accepted in ["hello", "  hello  "] {
            assert_eq!(
                validate_fts5_query(accepted),
                Some("hello".to_string()),
                "input: {accepted:?}"
            );
        }
        for blank in ["", "   ", "\t\n"] {
            assert_eq!(validate_fts5_query(blank), None, "input: {blank:?}");
        }
    }

    #[test]
    fn test_has_camel_case() {
        for matching in ["getUserById", "AuthController", "aB"] {
            assert!(has_camel_case(matching), "expected camelCase: {matching:?}");
        }
        for rejected in ["hello", "HELLO", "hello_world", ""] {
            assert!(!has_camel_case(rejected), "unexpected camelCase: {rejected:?}");
        }
    }
}