intent_engine/
search.rs

//! Search utilities for intent-engine
//!
//! This module provides:
//! 1. CJK (Chinese, Japanese, Korean) search utilities for detecting when to use
//!    a LIKE fallback instead of FTS5 trigram search
//! 2. Unified search across tasks and events
//!
//! **Background**: SQLite FTS5 with the trigram tokenizer requires at least 3 consecutive
//! characters to match. This is problematic for CJK languages, where single-character
//! or two-character searches are common (e.g., "用户", "认证").
//!
//! **Solution**: For short CJK queries, we fall back to a LIKE search, which supports
//! substring matching of any length, albeit more slowly.
14
/// Report whether `c` lies in one of the CJK code-point ranges this module
/// recognizes.
///
/// Recognized blocks: CJK Unified Ideographs, CJK Extension A, CJK Extensions
/// B through F, Hiragana, Katakana, and Hangul Syllables. Every other
/// character (for example Bopomofo at U+3100) is reported as non-CJK.
pub fn is_cjk_char(c: char) -> bool {
    match c {
        // Japanese kana: Hiragana, then Katakana
        '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' => true,

        // Han ideographs in the Basic Multilingual Plane:
        // Extension A, then the main Unified Ideographs block
        '\u{3400}'..='\u{4DBF}' | '\u{4E00}'..='\u{9FFF}' => true,

        // Korean: Hangul Syllables
        '\u{AC00}'..='\u{D7AF}' => true,

        // Han ideographs in the supplementary plane: Extensions B, C, D, E, F
        // (rare, but included for completeness)
        '\u{20000}'..='\u{2A6DF}'
        | '\u{2A700}'..='\u{2B73F}'
        | '\u{2B740}'..='\u{2B81F}'
        | '\u{2B820}'..='\u{2CEAF}'
        | '\u{2CEB0}'..='\u{2EBEF}' => true,

        _ => false,
    }
}
37
38/// Determine if a query should use LIKE fallback instead of FTS5 trigram
39///
40/// Returns `true` if:
41/// - Query is a single CJK character, OR
42/// - Query is two CJK characters
43///
44/// Trigram tokenizer requires 3+ characters for matching, so we use LIKE
45/// for shorter CJK queries to ensure they work.
46pub fn needs_like_fallback(query: &str) -> bool {
47    let chars: Vec<char> = query.chars().collect();
48
49    // Single-character CJK
50    if chars.len() == 1 && is_cjk_char(chars[0]) {
51        return true;
52    }
53
54    // Two-character all-CJK
55    // This is optional - could also let trigram handle it, but trigram
56    // needs minimum 3 chars so two-char CJK won't work well
57    if chars.len() == 2 && chars.iter().all(|c| is_cjk_char(*c)) {
58        return true;
59    }
60
61    false
62}
63
/// Double every `"` in `query` so the text can sit inside an FTS5 string.
///
/// FTS5 query syntax has several special constructs (AND, OR, NOT, `*`,
/// `"phrase search"`, ...). Only the double quote is handled here, since that
/// is the most common case where user input needs escaping. All other
/// characters are passed through unchanged, and the result is not wrapped in
/// quotes.
///
/// # Arguments
/// * `query` - The raw query text
///
/// # Returns
/// A new string in which each `"` has been replaced by `""`
///
/// # Example
/// ```ignore
/// use crate::search::escape_fts5;
///
/// let escaped = escape_fts5("user \"admin\" role");
/// assert_eq!(escaped, "user \"\"admin\"\" role");
/// ```
pub fn escape_fts5(query: &str) -> String {
    // Cutting at each quote and re-joining with a doubled quote rewrites
    // every occurrence, including leading, trailing, and adjacent ones.
    let pieces: Vec<&str> = query.split('"').collect();
    pieces.join("\"\"")
}
86
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every character in `chars` is classified as CJK.
    fn assert_all_cjk(chars: &[char]) {
        for &c in chars {
            assert!(is_cjk_char(c), "expected {:?} to be CJK", c);
        }
    }

    /// Assert that no character in `chars` is classified as CJK.
    fn assert_none_cjk(chars: &[char]) {
        for &c in chars {
            assert!(!is_cjk_char(c), "expected {:?} to be non-CJK", c);
        }
    }

    /// Assert that every query in `queries` is routed to the LIKE fallback.
    fn assert_fallback(queries: &[&str]) {
        for &q in queries {
            assert!(needs_like_fallback(q), "expected fallback for {:?}", q);
        }
    }

    /// Assert that no query in `queries` is routed to the LIKE fallback.
    fn assert_no_fallback(queries: &[&str]) {
        for &q in queries {
            assert!(!needs_like_fallback(q), "expected no fallback for {:?}", q);
        }
    }

    /// Assert that `escape_fts5` maps each input to its paired expected output.
    fn assert_escaped(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(escape_fts5(input), expected);
        }
    }

    #[test]
    fn test_is_cjk_char() {
        assert_all_cjk(&[
            // Chinese
            '中', '文', '认', '证',
            // Japanese Hiragana
            'あ', 'い',
            // Japanese Katakana
            'ア', 'イ',
            // Korean Hangul
            '가', '나',
        ]);

        // ASCII letters, digits, whitespace and punctuation
        assert_none_cjk(&['a', 'A', '1', ' ', '.']);
    }

    #[test]
    fn test_needs_like_fallback() {
        assert_fallback(&[
            // One CJK character
            "中", "认", "あ", "가",
            // Two CJK characters
            "中文", "认证", "用户",
        ]);

        assert_no_fallback(&[
            // Three or more CJK characters are long enough for FTS5
            "用户认",
            "用户认证",
            // English (a single ASCII char is not CJK)
            "JWT",
            "auth",
            "a",
            // Mixed scripts
            "JWT认证",
            "API接口",
        ]);
    }

    #[test]
    fn test_needs_like_fallback_mixed_cjk_ascii() {
        assert_no_fallback(&[
            // Two characters where only one is CJK
            "中a",
            "a中",
            "認1",
            // Three or more characters mixing CJK and ASCII
            "中文API",
            "JWT认证系统",
            "API中文文档",
        ]);
    }

    #[test]
    fn test_needs_like_fallback_edge_cases() {
        assert_no_fallback(&[
            // Empty
            "",
            // Whitespace only
            " ",
            "  ",
            // One non-CJK character
            "1",
            "@",
            " ",
            // Two non-CJK characters
            "ab",
            "12",
        ]);
    }

    #[test]
    fn test_is_cjk_char_extension_ranges() {
        assert_all_cjk(&[
            // CJK Extension A: first and last code point
            '\u{3400}',
            '\u{4DBF}',
            // CJK Unified Ideographs: first and last code point
            '\u{4E00}',
            '\u{9FFF}',
        ]);

        assert_none_cjk(&[
            '\u{33FF}', // just before Extension A
            '\u{4DC0}', // just after Extension A
            '\u{4DFF}', // just before Unified Ideographs
            '\u{A000}', // just after Unified Ideographs
        ]);
    }

    #[test]
    fn test_is_cjk_char_japanese() {
        assert_all_cjk(&[
            // Hiragana: first, middle, last
            '\u{3040}',
            'ひ',
            '\u{309F}',
            // Katakana: first, middle, last
            '\u{30A0}',
            'カ',
            '\u{30FF}',
        ]);

        assert_none_cjk(&[
            '\u{303F}', // just before Hiragana
            '\u{3100}', // just after Katakana (Bopomofo, not CJK by our definition)
        ]);
    }

    #[test]
    fn test_is_cjk_char_korean() {
        // Hangul Syllables: first (가), middle, last
        assert_all_cjk(&['\u{AC00}', '한', '\u{D7AF}']);

        // Immediately before and after the Hangul Syllables range
        assert_none_cjk(&['\u{ABFF}', '\u{D7B0}']);
    }

    #[test]
    fn test_escape_fts5_basic() {
        assert_escaped(&[
            // Nothing to escape
            ("hello world", "hello world"),
            ("JWT authentication", "JWT authentication"),
            // Apostrophes are left alone; only double quotes are escaped
            ("user's task", "user's task"),
        ]);
    }

    #[test]
    fn test_escape_fts5_double_quotes() {
        assert_escaped(&[
            // One quoted word
            ("\"admin\"", "\"\"admin\"\""),
            // Several quoted words
            ("\"user\" and \"admin\"", "\"\"user\"\" and \"\"admin\"\""),
            // Quotes in the middle, at the start, and at the end
            ("start \"middle\" end", "start \"\"middle\"\" end"),
            ("\"start", "\"\"start"),
            ("end\"", "end\"\""),
        ]);
    }

    #[test]
    fn test_escape_fts5_complex_queries() {
        assert_escaped(&[
            // Quoted phrase inside a longer query
            (
                "search for \"exact phrase\" here",
                "search for \"\"exact phrase\"\" here",
            ),
            // Empty input
            ("", ""),
            // Inputs consisting solely of quotes
            ("\"", "\"\""),
            ("\"\"", "\"\"\"\""),
            ("\"\"\"", "\"\"\"\"\"\""),
        ]);
    }

    #[test]
    fn test_escape_fts5_cjk_with_quotes() {
        assert_escaped(&[
            // CJK text containing quotes
            ("用户\"管理员\"权限", "用户\"\"管理员\"\"权限"),
            ("\"認証\"システム", "\"\"認証\"\"システム"),
            // CJK and English mixed, containing quotes
            ("API\"接口\"documentation", "API\"\"接口\"\"documentation"),
        ]);
    }

    #[test]
    fn test_needs_like_fallback_unicode_normalization() {
        // Most CJK characters have no composed/decomposed variants, so this
        // only exercises the general one- and two-character behavior.
        assert_fallback(&[
            // One CJK character
            "中", "日",
            // Two CJK characters
            "中日", "認證",
        ]);
    }
}
299
300// ============================================================================
301// Unified Search
302// ============================================================================
303
304use crate::db::models::UnifiedSearchResult;
305use crate::error::Result;
306use crate::events::EventManager;
307use crate::tasks::TaskManager;
308use sqlx::SqlitePool;
309
310pub struct SearchManager<'a> {
311    pool: &'a SqlitePool,
312}
313
314impl<'a> SearchManager<'a> {
315    pub fn new(pool: &'a SqlitePool) -> Self {
316        Self { pool }
317    }
318
319    /// Unified search across tasks and events
320    ///
321    /// # Parameters
322    /// - `query`: FTS5 search query string
323    /// - `include_tasks`: Whether to search in tasks
324    /// - `include_events`: Whether to search in events
325    /// - `limit`: Maximum number of total results (default: 20)
326    ///
327    /// # Returns
328    /// A mixed vector of task and event search results, ordered by relevance (FTS5 rank)
329    pub async fn unified_search(
330        &self,
331        query: &str,
332        include_tasks: bool,
333        include_events: bool,
334        limit: Option<i64>,
335    ) -> Result<Vec<UnifiedSearchResult>> {
336        let total_limit = limit.unwrap_or(20);
337        let mut results = Vec::new();
338
339        // Calculate limits for each source
340        let (task_limit, event_limit) = match (include_tasks, include_events) {
341            (true, true) => (total_limit / 2, total_limit / 2),
342            (true, false) => (total_limit, 0),
343            (false, true) => (0, total_limit),
344            (false, false) => return Ok(results), // Early return if nothing to search
345        };
346
347        // Search tasks if enabled
348        if include_tasks && task_limit > 0 {
349            let task_mgr = TaskManager::new(self.pool);
350            let mut task_results = task_mgr.search_tasks(query).await?;
351
352            // Apply limit
353            task_results.truncate(task_limit as usize);
354
355            for task_result in task_results {
356                // Determine which field matched based on snippet content
357                let match_field = if task_result
358                    .match_snippet
359                    .to_lowercase()
360                    .contains(&task_result.task.name.to_lowercase())
361                {
362                    "name".to_string()
363                } else {
364                    "spec".to_string()
365                };
366
367                results.push(UnifiedSearchResult::Task {
368                    task: task_result.task,
369                    match_snippet: task_result.match_snippet,
370                    match_field,
371                });
372            }
373        }
374
375        // Search events if enabled
376        if include_events && event_limit > 0 {
377            let event_mgr = EventManager::new(self.pool);
378            let event_results = event_mgr
379                .search_events_fts5(query, Some(event_limit))
380                .await?;
381
382            let task_mgr = TaskManager::new(self.pool);
383            for event_result in event_results {
384                // Get task ancestry chain for this event
385                let task_chain = task_mgr
386                    .get_task_ancestry(event_result.event.task_id)
387                    .await?;
388
389                results.push(UnifiedSearchResult::Event {
390                    event: event_result.event,
391                    task_chain,
392                    match_snippet: event_result.match_snippet,
393                });
394            }
395        }
396
397        // Limit to total_limit (in case we got more from both sources)
398        results.truncate(total_limit as usize);
399
400        Ok(results)
401    }
402}
403
#[cfg(test)]
mod unified_search_tests {
    use super::*;
    use crate::test_utils::test_helpers::TestContext;

    /// A task and one of its events both mention "JWT"; searching both
    /// sources must return at least one hit of each kind.
    #[tokio::test]
    async fn test_unified_search_basic() {
        let ctx = TestContext::new().await;
        let tasks = TaskManager::new(ctx.pool());
        let events = EventManager::new(ctx.pool());
        let search = SearchManager::new(ctx.pool());

        // Fixture: one task plus one event attached to it
        let jwt_task = tasks
            .add_task("JWT Authentication", Some("Implement JWT auth"), None)
            .await
            .unwrap();
        events
            .add_event(jwt_task.id, "decision", "Chose JWT over OAuth")
            .await
            .unwrap();

        let hits = search
            .unified_search("JWT", true, true, None)
            .await
            .unwrap();

        assert!(hits.len() >= 2);

        // Both result kinds must be present
        assert!(hits
            .iter()
            .any(|hit| matches!(hit, UnifiedSearchResult::Task { .. })));
        assert!(hits
            .iter()
            .any(|hit| matches!(hit, UnifiedSearchResult::Event { .. })));
    }

    /// With events disabled, every returned hit is a task.
    #[tokio::test]
    async fn test_unified_search_tasks_only() {
        let ctx = TestContext::new().await;
        let tasks = TaskManager::new(ctx.pool());
        let search = SearchManager::new(ctx.pool());

        // Fixture: a single matching task
        tasks
            .add_task("OAuth Implementation", None, None)
            .await
            .unwrap();

        let hits = search
            .unified_search("OAuth", true, false, None)
            .await
            .unwrap();

        assert!(!hits.is_empty());
        assert!(hits
            .iter()
            .all(|hit| matches!(hit, UnifiedSearchResult::Task { .. })));
    }

    /// With tasks disabled, every returned hit is an event.
    #[tokio::test]
    async fn test_unified_search_events_only() {
        let ctx = TestContext::new().await;
        let tasks = TaskManager::new(ctx.pool());
        let events = EventManager::new(ctx.pool());
        let search = SearchManager::new(ctx.pool());

        // Fixture: a task whose name does not match, with a matching event
        let parent = tasks.add_task("Test task", None, None).await.unwrap();
        events
            .add_event(parent.id, "blocker", "OAuth library missing")
            .await
            .unwrap();

        let hits = search
            .unified_search("OAuth", false, true, None)
            .await
            .unwrap();

        assert!(!hits.is_empty());
        assert!(hits
            .iter()
            .all(|hit| matches!(hit, UnifiedSearchResult::Event { .. })));
    }

    /// The total number of hits never exceeds the requested limit.
    #[tokio::test]
    async fn test_unified_search_with_limit() {
        let ctx = TestContext::new().await;
        let tasks = TaskManager::new(ctx.pool());
        let search = SearchManager::new(ctx.pool());

        // Fixture: ten tasks that all match the query
        for index in 0..10 {
            let name = format!("Test task {}", index);
            tasks.add_task(&name, None, None).await.unwrap();
        }

        // Ask for at most three results across both sources
        let hits = search
            .unified_search("Test", true, true, Some(3))
            .await
            .unwrap();

        assert!(hits.len() <= 3);
    }
}