Skip to main content

helios_persistence/backends/sqlite/search/
fts.rs

1//! FTS5 Full-Text Search integration.
2//!
3//! Provides optional FTS5-based searching for string and text content.
4//! Supports FHIR _text and _content search parameters.
5
6use serde_json::Value;
7
8use super::query_builder::{SqlFragment, SqlParam};
9
/// Plain-text payload pulled out of a resource for full-text indexing.
#[derive(Debug, Clone, Default)]
pub struct SearchableContent {
    /// Text of the resource's `text.div` narrative with the markup removed.
    /// Backs `_text` searches.
    pub narrative: String,
    /// Every string value collected from the resource, joined into one string.
    /// Backs `_content` searches.
    pub full_content: String,
}

impl SearchableContent {
    /// Builds a value whose narrative and full content are both empty.
    pub fn new() -> Self {
        Self {
            narrative: String::new(),
            full_content: String::new(),
        }
    }

    /// Reports whether there is nothing to index, i.e. neither field holds any text.
    pub fn is_empty(&self) -> bool {
        [&self.narrative, &self.full_content]
            .iter()
            .all(|field| field.is_empty())
    }
}
32
33/// Extracts searchable text content from a FHIR resource.
34///
35/// Extracts:
36/// - Narrative text from text.div (with HTML stripped)
37/// - Full content from all string values in the resource
38pub fn extract_searchable_content(resource: &Value) -> SearchableContent {
39    SearchableContent {
40        // _text: Extract and strip HTML from narrative
41        narrative: extract_narrative(resource),
42        // _content: Extract all string values recursively
43        full_content: extract_all_strings(resource),
44    }
45}
46
47/// Extracts narrative text from a resource's text.div element.
48///
49/// Strips HTML tags and returns plain text.
50fn extract_narrative(resource: &Value) -> String {
51    resource
52        .get("text")
53        .and_then(|t| t.get("div"))
54        .and_then(|d| d.as_str())
55        .map(strip_html_tags)
56        .unwrap_or_default()
57}
58
59/// Strips HTML tags from a string and decodes HTML entities.
60///
61/// Handles:
62/// - HTML tags (removes everything between < and >)
63/// - CDATA sections (extracts content)
64/// - HTML entities (&lt;, &gt;, &amp;, &nbsp;, &quot;, &apos;, &#123;, &#x1F;, etc.)
65fn strip_html_tags(html: &str) -> String {
66    let mut result = String::with_capacity(html.len());
67    let mut in_tag = false;
68    let mut chars = html.chars().peekable();
69
70    while let Some(c) = chars.next() {
71        match c {
72            '<' => {
73                // Check for CDATA section: <![CDATA[...]]>
74                if chars.peek() == Some(&'!') {
75                    let lookahead: String = chars.clone().take(8).collect();
76                    if lookahead.starts_with("![CDATA[") {
77                        // Skip the "![CDATA[" prefix
78                        for _ in 0..8 {
79                            chars.next();
80                        }
81                        // Extract until ]]>
82                        let mut cdata_content = String::new();
83                        while let Some(ch) = chars.next() {
84                            if ch == ']' {
85                                let next_two: String = chars.clone().take(2).collect();
86                                if next_two == "]>" {
87                                    chars.next(); // skip ]
88                                    chars.next(); // skip >
89                                    break;
90                                }
91                            }
92                            cdata_content.push(ch);
93                        }
94                        result.push_str(&cdata_content);
95                        result.push(' ');
96                        continue;
97                    }
98                }
99                in_tag = true;
100            }
101            '>' if in_tag => {
102                in_tag = false;
103            }
104            '&' if !in_tag => {
105                // Collect entity up to ';' (max 10 chars for safety)
106                let mut entity = String::new();
107                let mut found_semicolon = false;
108                for _ in 0..10 {
109                    if let Some(&ch) = chars.peek() {
110                        if ch == ';' {
111                            chars.next(); // consume the semicolon
112                            found_semicolon = true;
113                            break;
114                        } else if ch.is_alphanumeric() || ch == '#' {
115                            entity.push(ch);
116                            chars.next();
117                        } else {
118                            break;
119                        }
120                    } else {
121                        break;
122                    }
123                }
124                if found_semicolon {
125                    if let Some(decoded) = decode_html_entity(&entity) {
126                        result.push(decoded);
127                    } else {
128                        // Unknown entity, keep as-is
129                        result.push('&');
130                        result.push_str(&entity);
131                        result.push(';');
132                    }
133                } else {
134                    // Not a valid entity, keep the ampersand and collected chars
135                    result.push('&');
136                    result.push_str(&entity);
137                }
138            }
139            _ if !in_tag => result.push(c),
140            _ => {}
141        }
142    }
143
144    // Normalize whitespace
145    result.split_whitespace().collect::<Vec<_>>().join(" ")
146}
147
/// Maps an entity name (the text between `&` and `;`) to the character it denotes.
///
/// Recognises the named entities `lt`, `gt`, `amp`, `nbsp`, `quot` and `apos`,
/// plus numeric references in decimal (`#123`) and hexadecimal (`#x1F` / `#X1F`)
/// form. `nbsp` yields an ordinary space. Returns `None` for any other name,
/// for digits that do not parse, and for numbers that are not valid `char`s.
fn decode_html_entity(entity: &str) -> Option<char> {
    // Numeric character reference: "#" followed by decimal or "x"-prefixed hex digits.
    if let Some(digits) = entity.strip_prefix('#') {
        let code_point: u32 = match digits.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex_digits) => u32::from_str_radix(hex_digits, 16).ok()?,
            None => digits.parse().ok()?,
        };
        return char::from_u32(code_point);
    }

    let named = match entity {
        "lt" => '<',
        "gt" => '>',
        "amp" => '&',
        "nbsp" => ' ',
        "quot" => '"',
        "apos" => '\'',
        _ => return None,
    };
    Some(named)
}
172
173/// Extracts all string values from a JSON value recursively.
174///
175/// Concatenates all string values found in the resource, separated by spaces.
176fn extract_all_strings(value: &Value) -> String {
177    let mut strings = Vec::new();
178    extract_strings_recursive(value, &mut strings);
179    strings.join(" ")
180}
181
182/// Recursively extracts string values from a JSON value.
183fn extract_strings_recursive(value: &Value, strings: &mut Vec<String>) {
184    match value {
185        Value::String(s) => {
186            // Skip empty strings and URLs (typically not useful for text search)
187            if !s.is_empty() && !s.starts_with("http://") && !s.starts_with("https://") {
188                strings.push(s.clone());
189            }
190        }
191        Value::Array(arr) => {
192            for item in arr {
193                extract_strings_recursive(item, strings);
194            }
195        }
196        Value::Object(obj) => {
197            for (key, val) in obj {
198                // Skip technical fields that aren't useful for text search
199                if !matches!(
200                    key.as_str(),
201                    "resourceType" | "id" | "meta" | "extension" | "url" | "reference"
202                ) {
203                    extract_strings_recursive(val, strings);
204                }
205            }
206        }
207        _ => {}
208    }
209}
210
211/// FTS5 search helper for full-text search operations.
212pub struct Fts5Search;
213
214impl Fts5Search {
215    /// The name of the FTS5 virtual table.
216    pub const FTS_TABLE_NAME: &'static str = "search_index_fts";
217
218    /// Generates the SQL to create the FTS5 virtual table.
219    pub fn create_table_sql() -> &'static str {
220        r#"
221        CREATE VIRTUAL TABLE IF NOT EXISTS search_index_fts USING fts5(
222            text_content,
223            content='search_index',
224            content_rowid='rowid',
225            tokenize='porter unicode61'
226        )
227        "#
228    }
229
230    /// Generates triggers to keep FTS5 table in sync with search_index.
231    ///
232    /// Indexes both `value_string` and `value_token_display` columns
233    /// to support both regular string search and :text-advanced modifier
234    /// on token display text.
235    pub fn create_triggers_sql() -> &'static str {
236        r#"
237        -- Trigger for INSERT (indexes value_string and value_token_display)
238        CREATE TRIGGER IF NOT EXISTS search_index_fts_insert AFTER INSERT ON search_index
239        WHEN new.value_string IS NOT NULL OR new.value_token_display IS NOT NULL
240        BEGIN
241            INSERT INTO search_index_fts(rowid, text_content)
242            VALUES (new.rowid, COALESCE(new.value_string, '') || ' ' || COALESCE(new.value_token_display, ''));
243        END;
244
245        -- Trigger for DELETE
246        CREATE TRIGGER IF NOT EXISTS search_index_fts_delete AFTER DELETE ON search_index
247        WHEN old.value_string IS NOT NULL OR old.value_token_display IS NOT NULL
248        BEGIN
249            INSERT INTO search_index_fts(search_index_fts, rowid, text_content)
250            VALUES ('delete', old.rowid, COALESCE(old.value_string, '') || ' ' || COALESCE(old.value_token_display, ''));
251        END;
252
253        -- Trigger for UPDATE
254        CREATE TRIGGER IF NOT EXISTS search_index_fts_update AFTER UPDATE ON search_index
255        WHEN old.value_string IS NOT NULL OR new.value_string IS NOT NULL
256             OR old.value_token_display IS NOT NULL OR new.value_token_display IS NOT NULL
257        BEGIN
258            INSERT INTO search_index_fts(search_index_fts, rowid, text_content)
259            VALUES ('delete', old.rowid, COALESCE(old.value_string, '') || ' ' || COALESCE(old.value_token_display, ''));
260            INSERT INTO search_index_fts(rowid, text_content)
261            VALUES (new.rowid, COALESCE(new.value_string, '') || ' ' || COALESCE(new.value_token_display, ''));
262        END;
263        "#
264    }
265
266    /// Builds a full-text search query using FTS5 MATCH syntax.
267    ///
268    /// The search_term is escaped for safe use in FTS5 queries.
269    pub fn build_fts_query(search_term: &str, param_num: usize) -> SqlFragment {
270        SqlFragment::with_params(
271            format!(
272                "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
273                Self::FTS_TABLE_NAME,
274                Self::FTS_TABLE_NAME,
275                param_num
276            ),
277            vec![SqlParam::string(Self::escape_fts_query(search_term))],
278        )
279    }
280
281    /// Builds an FTS5 query with phrase matching.
282    pub fn build_phrase_query(phrase: &str, param_num: usize) -> SqlFragment {
283        let escaped = Self::escape_fts_query(phrase);
284        SqlFragment::with_params(
285            format!(
286                "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
287                Self::FTS_TABLE_NAME,
288                Self::FTS_TABLE_NAME,
289                param_num
290            ),
291            vec![SqlParam::string(format!("\"{}\"", escaped))],
292        )
293    }
294
295    /// Builds an FTS5 prefix search query.
296    pub fn build_prefix_query(prefix: &str, param_num: usize) -> SqlFragment {
297        let escaped = Self::escape_fts_query(prefix);
298        SqlFragment::with_params(
299            format!(
300                "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
301                Self::FTS_TABLE_NAME,
302                Self::FTS_TABLE_NAME,
303                param_num
304            ),
305            vec![SqlParam::string(format!("{}*", escaped))],
306        )
307    }
308
309    /// Escapes special characters for FTS5 queries.
310    pub fn escape_fts_query(term: &str) -> String {
311        // FTS5 special characters that need escaping in queries
312        let mut result = String::with_capacity(term.len());
313        for c in term.chars() {
314            match c {
315                '"' | '*' | ':' | '^' | '(' | ')' | '+' | '-' | '~' => {
316                    // Skip these special characters or escape them
317                    result.push(' ');
318                }
319                _ => result.push(c),
320            }
321        }
322        result.trim().to_string()
323    }
324
325    /// Checks if FTS5 is available in the database.
326    ///
327    /// This should be called during backend initialization.
328    pub fn check_fts5_available_sql() -> &'static str {
329        "SELECT sqlite_compileoption_used('ENABLE_FTS5')"
330    }
331
332    /// Rebuilds the FTS5 index from the search_index table.
333    ///
334    /// Call this after bulk imports or if the FTS index gets out of sync.
335    pub fn rebuild_index_sql() -> String {
336        format!(
337            "INSERT INTO {}({}) VALUES ('rebuild')",
338            Self::FTS_TABLE_NAME,
339            Self::FTS_TABLE_NAME
340        )
341    }
342
343    /// Optimizes the FTS5 index for better query performance.
344    pub fn optimize_index_sql() -> String {
345        format!(
346            "INSERT INTO {}({}) VALUES ('optimize')",
347            Self::FTS_TABLE_NAME,
348            Self::FTS_TABLE_NAME
349        )
350    }
351
352    /// Builds an advanced FTS5 query with boolean operator support.
353    ///
354    /// This supports the `:text-advanced` modifier from FHIR v6.0.0.
355    ///
356    /// Query syntax:
357    /// - `term1 term2` → implicit AND
358    /// - `term1 OR term2` → either term
359    /// - `"exact phrase"` → phrase match
360    /// - `term*` → prefix match
361    /// - `-term` or `NOT term` → exclude term
362    /// - `term1 NEAR term2` → proximity match (within 10 words)
363    /// - `term1 NEAR/5 term2` → proximity match within 5 words
364    pub fn build_advanced_query(query: &str, param_num: usize) -> SqlFragment {
365        let fts_query = Self::parse_advanced_query(query);
366        SqlFragment::with_params(
367            format!(
368                "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
369                Self::FTS_TABLE_NAME,
370                Self::FTS_TABLE_NAME,
371                param_num
372            ),
373            vec![SqlParam::string(fts_query)],
374        )
375    }
376
377    /// Parses a user-friendly query into FTS5 syntax.
378    ///
379    /// Transforms user input into valid FTS5 query syntax:
380    /// - Preserves quoted phrases
381    /// - Handles OR operator (passed through to FTS5)
382    /// - Handles NOT / - prefix (converts to NOT)
383    /// - Handles NEAR operator (passed through to FTS5)
384    /// - Handles prefix wildcard (term* stays as-is)
385    /// - Escapes special characters in regular terms
386    /// - Joins remaining terms with implicit AND
387    pub fn parse_advanced_query(query: &str) -> String {
388        let tokens = Self::tokenize_advanced_query(query);
389        Self::tokens_to_fts5(&tokens)
390    }
391
392    /// Tokenizes an advanced query, preserving quoted phrases and operators.
393    fn tokenize_advanced_query(query: &str) -> Vec<String> {
394        let mut tokens = Vec::new();
395        let chars = query.chars().peekable();
396        let mut current = String::new();
397        let mut in_quote = false;
398
399        for c in chars {
400            match c {
401                '"' => {
402                    if in_quote {
403                        // End of quoted phrase
404                        if !current.is_empty() {
405                            tokens.push(format!("\"{}\"", current));
406                            current.clear();
407                        }
408                        in_quote = false;
409                    } else {
410                        // Start of quoted phrase - save current token first
411                        if !current.is_empty() {
412                            tokens.push(current.clone());
413                            current.clear();
414                        }
415                        in_quote = true;
416                    }
417                }
418                ' ' | '\t' | '\n' if !in_quote => {
419                    if !current.is_empty() {
420                        tokens.push(current.clone());
421                        current.clear();
422                    }
423                }
424                _ => {
425                    current.push(c);
426                }
427            }
428        }
429
430        // Handle any remaining content
431        if !current.is_empty() {
432            if in_quote {
433                // Unclosed quote - treat as phrase anyway
434                tokens.push(format!("\"{}\"", current));
435            } else {
436                tokens.push(current);
437            }
438        }
439
440        tokens
441    }
442
443    /// Converts parsed tokens to FTS5 query syntax.
444    fn tokens_to_fts5(tokens: &[String]) -> String {
445        let mut result = Vec::new();
446        let mut i = 0;
447
448        while i < tokens.len() {
449            let token = &tokens[i];
450            let upper = token.to_uppercase();
451
452            // Check for operators
453            if upper == "OR" || upper == "AND" {
454                // Keep operators as-is
455                result.push(upper);
456            } else if upper == "NOT" {
457                // NOT operator
458                result.push("NOT".to_string());
459            } else if upper == "NEAR" || upper.starts_with("NEAR/") {
460                // NEAR operator (with optional distance)
461                result.push(upper);
462            } else if token.starts_with('-') && token.len() > 1 {
463                // -term becomes NOT term
464                result.push("NOT".to_string());
465                let term = &token[1..];
466                result.push(Self::escape_term_for_fts5(term));
467            } else if token.starts_with('"') {
468                // Quoted phrase - already formatted, just escape inner content
469                let inner = token.trim_matches('"');
470                result.push(format!("\"{}\"", Self::escape_fts_query(inner)));
471            } else if token.ends_with('*') {
472                // Prefix search - escape the base term and add *
473                let base = &token[..token.len() - 1];
474                if !base.is_empty() {
475                    result.push(format!("{}*", Self::escape_term_for_fts5(base)));
476                }
477            } else {
478                // Regular term
479                result.push(Self::escape_term_for_fts5(token));
480            }
481            i += 1;
482        }
483
484        // Join tokens with implicit AND between adjacent non-operator terms
485        Self::join_with_implicit_and(&result)
486    }
487
488    /// Escapes a single term for FTS5 query.
489    fn escape_term_for_fts5(term: &str) -> String {
490        Self::escape_fts_query(term)
491    }
492
493    /// Joins terms with implicit AND between adjacent non-operator terms.
494    ///
495    /// FTS5 requires explicit AND between terms for conjunction.
496    /// This inserts AND between adjacent terms that are not already
497    /// separated by an operator (OR, AND, NOT, NEAR).
498    fn join_with_implicit_and(terms: &[String]) -> String {
499        if terms.is_empty() {
500            return String::new();
501        }
502
503        let mut result = Vec::new();
504        let operators = ["OR", "AND", "NOT"];
505
506        for (i, term) in terms.iter().enumerate() {
507            result.push(term.clone());
508
509            // Check if we need to insert AND before the next term
510            if i < terms.len() - 1 {
511                let next = &terms[i + 1];
512                let current_is_op = operators.contains(&term.to_uppercase().as_str())
513                    || term.to_uppercase().starts_with("NEAR");
514                let next_is_op = operators.contains(&next.to_uppercase().as_str())
515                    || next.to_uppercase().starts_with("NEAR");
516
517                // Insert AND if current is not an operator and next is not an operator or NOT
518                if !current_is_op && !next_is_op && next.to_uppercase() != "NOT" {
519                    result.push("AND".to_string());
520                }
521            }
522        }
523
524        result.join(" ")
525    }
526}
527
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Returns the text bound to the fragment's single parameter.
    ///
    /// Fails the calling test when the fragment does not carry exactly one
    /// parameter or when that parameter is not a string, so a wrongly bound
    /// match expression cannot slip through unnoticed.
    fn bound_text(frag: &SqlFragment) -> String {
        assert_eq!(frag.params.len(), 1);
        match &frag.params[0] {
            SqlParam::String(text) => text.to_string(),
            _ => panic!("expected the FTS5 match expression to be bound as a string"),
        }
    }

    /// Asserts `strip_html_tags(input) == expected` for every pair.
    fn assert_stripped(cases: &[(&str, &str)]) {
        for (input, expected) in cases.iter() {
            assert_eq!(strip_html_tags(input), *expected, "input: {:?}", input);
        }
    }

    /// Asserts `parse_advanced_query(input) == expected` for every pair.
    fn assert_parsed(cases: &[(&str, &str)]) {
        for (input, expected) in cases.iter() {
            assert_eq!(
                Fts5Search::parse_advanced_query(input),
                *expected,
                "input: {:?}",
                input
            );
        }
    }

    #[test]
    fn test_escape_fts_query() {
        assert_eq!(Fts5Search::escape_fts_query("simple"), "simple");
        assert_eq!(Fts5Search::escape_fts_query("has\"quotes"), "has quotes");
        assert_eq!(Fts5Search::escape_fts_query("star*"), "star");
        assert_eq!(
            Fts5Search::escape_fts_query("complex:query+term"),
            "complex query term"
        );
    }

    #[test]
    fn test_build_fts_query() {
        let frag = Fts5Search::build_fts_query("smith", 1);

        assert!(frag.sql.contains("search_index_fts"));
        assert!(frag.sql.contains("MATCH"));
        assert!(frag.sql.contains("?1"));
        assert_eq!(frag.params.len(), 1);
        assert_eq!(bound_text(&frag), "smith");
    }

    #[test]
    fn test_build_phrase_query() {
        let frag = Fts5Search::build_phrase_query("john smith", 1);

        assert!(frag.sql.contains("MATCH"));
        // The param should be quoted for phrase search
        assert_eq!(bound_text(&frag), "\"john smith\"");
    }

    #[test]
    fn test_build_prefix_query() {
        let frag = Fts5Search::build_prefix_query("smi", 1);

        assert!(frag.sql.contains("MATCH"));
        // The param should carry the trailing wildcard
        assert_eq!(bound_text(&frag), "smi*");
    }

    // ============================================================================
    // Advanced Query Parser Tests (:text-advanced modifier)
    // ============================================================================

    #[test]
    fn test_parse_advanced_query_simple() {
        assert_parsed(&[("headache", "headache")]);
    }

    #[test]
    fn test_parse_advanced_query_multiple_terms() {
        // Multiple terms should be joined with AND
        assert_parsed(&[("heart attack", "heart AND attack")]);
    }

    #[test]
    fn test_parse_advanced_query_phrase() {
        assert_parsed(&[("\"heart attack\"", "\"heart attack\"")]);
    }

    #[test]
    fn test_parse_advanced_query_or() {
        assert_parsed(&[("headache OR migraine", "headache OR migraine")]);
    }

    #[test]
    fn test_parse_advanced_query_prefix() {
        assert_parsed(&[("cardio*", "cardio*")]);
    }

    #[test]
    fn test_parse_advanced_query_not_minus() {
        // -term should become NOT term
        assert_parsed(&[("-surgery", "NOT surgery")]);
    }

    #[test]
    fn test_parse_advanced_query_not_keyword() {
        // NOT term should stay as NOT term
        assert_parsed(&[("NOT surgery", "NOT surgery")]);
    }

    #[test]
    fn test_parse_advanced_query_near() {
        assert_parsed(&[("heart NEAR attack", "heart NEAR attack")]);
    }

    #[test]
    fn test_parse_advanced_query_near_with_distance() {
        assert_parsed(&[("heart NEAR/5 attack", "heart NEAR/5 attack")]);
    }

    #[test]
    fn test_parse_advanced_query_complex() {
        // Complex query: heart OR cardiac with exclusion
        assert_parsed(&[("heart OR cardiac -surgery", "heart OR cardiac NOT surgery")]);
    }

    #[test]
    fn test_parse_advanced_query_mixed() {
        // Mix of phrase, prefix, and boolean
        assert_parsed(&[(
            "\"chest pain\" cardio* OR thoracic",
            "\"chest pain\" AND cardio* OR thoracic",
        )]);
    }

    #[test]
    fn test_parse_advanced_query_case_insensitive_operators() {
        // Operators should work case-insensitively
        assert_parsed(&[
            ("heart or cardiac", "heart OR cardiac"),
            ("pain not chronic", "pain NOT chronic"),
        ]);
    }

    #[test]
    fn test_build_advanced_query() {
        let frag = Fts5Search::build_advanced_query("heart OR cardiac -surgery", 1);

        assert!(frag.sql.contains("search_index_fts"));
        assert!(frag.sql.contains("MATCH"));
        assert_eq!(frag.params.len(), 1);

        // The query should be properly formatted
        let expression = bound_text(&frag);
        assert!(expression.contains("OR"));
        assert!(expression.contains("NOT"));
        assert_eq!(expression, "heart OR cardiac NOT surgery");
    }

    #[test]
    fn test_strip_html_tags() {
        assert_stripped(&[
            ("<p>Hello</p>", "Hello"),
            ("<div><p>Hello <b>world</b></p></div>", "Hello world"),
            ("No tags here", "No tags here"),
            ("<br/>", ""),
            (
                "<div xmlns=\"http://www.w3.org/1999/xhtml\">Test</div>",
                "Test",
            ),
        ]);
    }

    #[test]
    fn test_strip_html_entities() {
        assert_stripped(&[
            // Named entities
            ("&lt;tag&gt;", "<tag>"),
            ("Tom &amp; Jerry", "Tom & Jerry"),
            ("He said &quot;hello&quot;", "He said \"hello\""),
            ("It&apos;s fine", "It's fine"),
            ("Non&nbsp;breaking", "Non breaking"),
            // Numeric entities (decimal)
            ("&#60;&#62;", "<>"),
            ("&#65;&#66;&#67;", "ABC"),
            // Numeric entities (hexadecimal)
            ("&#x3C;&#x3E;", "<>"),
            ("&#x41;&#x42;&#x43;", "ABC"),
            ("&#X41;&#X42;", "AB"), // uppercase X
            // Mixed content with entities
            (
                "<p>Price: &lt;$100 &amp; discount</p>",
                "Price: <$100 & discount",
            ),
        ]);
    }

    #[test]
    fn test_strip_html_cdata() {
        assert_stripped(&[
            ("<![CDATA[Some raw content]]>", "Some raw content"),
            ("<div><![CDATA[Inner CDATA]]></div>", "Inner CDATA"),
            ("Before <![CDATA[inside]]> after", "Before inside after"),
            // CDATA with special characters
            (
                "<![CDATA[<script>alert('hi')</script>]]>",
                "<script>alert('hi')</script>",
            ),
        ]);
    }

    #[test]
    fn test_strip_html_edge_cases() {
        assert_stripped(&[
            // Unclosed entity (should preserve as-is)
            ("a & b", "a & b"),
            ("a &unknown; b", "a &unknown; b"),
            // Empty input
            ("", ""),
            // Only whitespace
            ("   ", ""),
            // Self-closing tags
            ("<br/><hr/>text", "text"),
        ]);

        // Complex FHIR narrative
        let fhir_narrative = r#"<div xmlns="http://www.w3.org/1999/xhtml">
            <p>Patient: John Smith &amp; family</p>
            <p>DOB: &lt;1970-01-15&gt;</p>
        </div>"#;
        assert_eq!(
            strip_html_tags(fhir_narrative),
            "Patient: John Smith & family DOB: <1970-01-15>"
        );
    }

    #[test]
    fn test_decode_html_entity() {
        let cases = [
            ("lt", Some('<')),
            ("gt", Some('>')),
            ("amp", Some('&')),
            ("nbsp", Some(' ')),
            ("quot", Some('"')),
            ("apos", Some('\'')),
            // Decimal numeric
            ("#65", Some('A')),
            ("#97", Some('a')),
            // Hexadecimal numeric
            ("#x41", Some('A')),
            ("#X41", Some('A')),
            ("#x1F600", Some('😀')), // emoji
            // Unknown
            ("unknown", None),
            ("#invalid", None),
        ];

        for (entity, expected) in cases.iter() {
            assert_eq!(decode_html_entity(entity), *expected, "entity: {:?}", entity);
        }
    }

    #[test]
    fn test_extract_narrative() {
        let patient = json!({
            "resourceType": "Patient",
            "text": {
                "status": "generated",
                "div": "<div xmlns=\"http://www.w3.org/1999/xhtml\"><p>John Smith, born 1970-01-15</p></div>"
            }
        });

        let narrative = extract_narrative(&patient);
        assert!(narrative.contains("John Smith"));
        assert!(narrative.contains("born"));
        assert!(!narrative.contains("<"));
    }

    #[test]
    fn test_extract_narrative_no_text() {
        let patient = json!({
            "resourceType": "Patient",
            "name": [{"family": "Smith"}]
        });

        let narrative = extract_narrative(&patient);
        assert!(narrative.is_empty());
    }

    #[test]
    fn test_extract_all_strings() {
        let patient = json!({
            "resourceType": "Patient",
            "id": "123",
            "name": [{
                "family": "Smith",
                "given": ["John", "James"]
            }],
            "address": [{
                "city": "Boston",
                "state": "MA"
            }]
        });

        let content = extract_all_strings(&patient);
        for expected in ["Smith", "John", "James", "Boston"].iter() {
            assert!(content.contains(expected), "missing {:?}", expected);
        }
        // Should skip resourceType and id
        assert!(!content.contains("Patient"));
    }

    #[test]
    fn test_extract_searchable_content() {
        let patient = json!({
            "resourceType": "Patient",
            "text": {
                "div": "<div>John Smith from Boston</div>"
            },
            "name": [{"family": "Smith", "given": ["John"]}],
            "address": [{"city": "Boston"}]
        });

        let content = extract_searchable_content(&patient);
        assert!(!content.is_empty());
        assert!(content.narrative.contains("John Smith"));
        assert!(content.full_content.contains("Smith"));
        assert!(content.full_content.contains("Boston"));
    }

    #[test]
    fn test_searchable_content_is_empty() {
        let content = SearchableContent::new();
        assert!(content.is_empty());

        let content = SearchableContent {
            narrative: "test".to_string(),
            full_content: String::new(),
        };
        assert!(!content.is_empty());
    }
}
877}