intent_engine/
search.rs

//! CJK (Chinese, Japanese, Korean) search utilities
//!
//! This module provides utilities for detecting CJK characters and for deciding
//! when to use a LIKE fallback instead of FTS5 trigram search.
//!
//! **Background**: SQLite FTS5 with the trigram tokenizer requires at least 3 consecutive
//! characters to match. This is problematic for CJK languages, where single-character
//! or two-character searches are common (e.g., "用户", "认证").
//!
//! **Solution**: For short CJK queries, we fall back to LIKE search, which supports
//! substring matching of any length, albeit more slowly.
12
/// Check whether a character belongs to a CJK (Chinese, Japanese, Korean) script.
///
/// The test is made at Unicode *block* granularity: a code point is accepted when
/// it falls inside one of the blocks listed in the body, even if that particular
/// code point is currently unassigned within the block.
///
/// Covered blocks:
/// - Han ideographs: CJK Unified Ideographs, Extensions A through I, CJK
///   Compatibility Ideographs and their Supplement
/// - Japanese kana: Hiragana, Katakana, Katakana Phonetic Extensions and
///   halfwidth Katakana
/// - Korean: Hangul Syllables, Hangul Jamo and Hangul Compatibility Jamo
///
/// CJK punctuation (e.g. the ideographic space U+3000) and fullwidth Latin
/// letters are deliberately *not* treated as CJK characters.
///
/// Returns `true` if `c` lies in any of the blocks above, `false` otherwise.
pub fn is_cjk_char(c: char) -> bool {
    matches!(c,
        // --- Han ideographs ---
        // CJK Unified Ideographs (most common Chinese characters)
        '\u{4E00}'..='\u{9FFF}' |
        // CJK Extension A
        '\u{3400}'..='\u{4DBF}' |
        // CJK Compatibility Ideographs
        '\u{F900}'..='\u{FAFF}' |
        // CJK Extensions B-F (less common, but included for completeness)
        '\u{20000}'..='\u{2A6DF}' |
        '\u{2A700}'..='\u{2B73F}' |
        '\u{2B740}'..='\u{2B81F}' |
        '\u{2B820}'..='\u{2CEAF}' |
        '\u{2CEB0}'..='\u{2EBEF}' |
        // CJK Extension I
        '\u{2EBF0}'..='\u{2EE5F}' |
        // CJK Compatibility Ideographs Supplement
        '\u{2F800}'..='\u{2FA1F}' |
        // CJK Extensions G and H
        '\u{30000}'..='\u{3134F}' |
        '\u{31350}'..='\u{323AF}' |
        // --- Japanese kana ---
        // Hiragana
        '\u{3040}'..='\u{309F}' |
        // Katakana
        '\u{30A0}'..='\u{30FF}' |
        // Katakana Phonetic Extensions (small kana)
        '\u{31F0}'..='\u{31FF}' |
        // Halfwidth Katakana
        '\u{FF66}'..='\u{FF9F}' |
        // --- Korean ---
        // Hangul Syllables
        '\u{AC00}'..='\u{D7AF}' |
        // Hangul Jamo (conjoining letters)
        '\u{1100}'..='\u{11FF}' |
        // Hangul Compatibility Jamo (standalone letters as typed, e.g. "ㄱ")
        '\u{3130}'..='\u{318F}'
    )
}
35
36/// Determine if a query should use LIKE fallback instead of FTS5 trigram
37///
38/// Returns `true` if:
39/// - Query is a single CJK character, OR
40/// - Query is two CJK characters
41///
42/// Trigram tokenizer requires 3+ characters for matching, so we use LIKE
43/// for shorter CJK queries to ensure they work.
44pub fn needs_like_fallback(query: &str) -> bool {
45    let chars: Vec<char> = query.chars().collect();
46
47    // Single-character CJK
48    if chars.len() == 1 && is_cjk_char(chars[0]) {
49        return true;
50    }
51
52    // Two-character all-CJK
53    // This is optional - could also let trigram handle it, but trigram
54    // needs minimum 3 chars so two-char CJK won't work well
55    if chars.len() == 2 && chars.iter().all(|c| is_cjk_char(*c)) {
56        return true;
57    }
58
59    false
60}
61
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_cjk_char` accepts sample characters from each supported script and
    /// rejects ASCII letters, digits, whitespace and punctuation.
    #[test]
    fn test_is_cjk_char() {
        let cjk_samples = [
            // Chinese characters
            '中', '文', '认', '证',
            // Japanese Hiragana
            'あ', 'い',
            // Japanese Katakana
            'ア', 'イ',
            // Korean Hangul
            '가', '나',
        ];
        for ch in cjk_samples.iter() {
            assert!(is_cjk_char(*ch), "expected {:?} to be CJK", ch);
        }

        // Non-CJK
        let non_cjk_samples = ['a', 'A', '1', ' ', '.'];
        for ch in non_cjk_samples.iter() {
            assert!(!is_cjk_char(*ch), "expected {:?} not to be CJK", ch);
        }
    }

    /// `needs_like_fallback` is true only for one- or two-character queries
    /// made up entirely of CJK characters.
    #[test]
    fn test_needs_like_fallback() {
        let fallback_queries = [
            // Single CJK character - needs fallback
            "中", "认", "あ", "가",
            // Two CJK characters - needs fallback
            "中文", "认证", "用户",
        ];
        for query in fallback_queries.iter() {
            assert!(
                needs_like_fallback(query),
                "expected {:?} to need the LIKE fallback",
                query
            );
        }

        let fts_queries = [
            // Three+ CJK characters - can use FTS5
            "用户认", "用户认证",
            // English - can use FTS5 (a single ASCII char is not CJK)
            "JWT", "auth", "a",
            // Mixed - can use FTS5
            "JWT认证", "API接口",
        ];
        for query in fts_queries.iter() {
            assert!(
                !needs_like_fallback(query),
                "expected {:?} not to need the LIKE fallback",
                query
            );
        }
    }
}