helios_persistence/backends/sqlite/search/
fts.rs1use serde_json::Value;
7
8use super::query_builder::{SqlFragment, SqlParam};
9
/// Text pulled out of a JSON resource for full-text indexing.
#[derive(Debug, Clone, Default)]
pub struct SearchableContent {
    /// Plain text produced from the resource's `text.div` markup.
    pub narrative: String,
    /// Space-joined string values gathered from the whole resource.
    pub full_content: String,
}

impl SearchableContent {
    /// Builds a value in which both fields are empty strings.
    pub fn new() -> Self {
        Self {
            narrative: String::new(),
            full_content: String::new(),
        }
    }

    /// Reports whether neither field holds any text.
    pub fn is_empty(&self) -> bool {
        let has_narrative = !self.narrative.is_empty();
        let has_full_content = !self.full_content.is_empty();
        !(has_narrative || has_full_content)
    }
}
32
33pub fn extract_searchable_content(resource: &Value) -> SearchableContent {
39 SearchableContent {
40 narrative: extract_narrative(resource),
42 full_content: extract_all_strings(resource),
44 }
45}
46
47fn extract_narrative(resource: &Value) -> String {
51 resource
52 .get("text")
53 .and_then(|t| t.get("div"))
54 .and_then(|d| d.as_str())
55 .map(strip_html_tags)
56 .unwrap_or_default()
57}
58
59fn strip_html_tags(html: &str) -> String {
66 let mut result = String::with_capacity(html.len());
67 let mut in_tag = false;
68 let mut chars = html.chars().peekable();
69
70 while let Some(c) = chars.next() {
71 match c {
72 '<' => {
73 if chars.peek() == Some(&'!') {
75 let lookahead: String = chars.clone().take(8).collect();
76 if lookahead.starts_with("![CDATA[") {
77 for _ in 0..8 {
79 chars.next();
80 }
81 let mut cdata_content = String::new();
83 while let Some(ch) = chars.next() {
84 if ch == ']' {
85 let next_two: String = chars.clone().take(2).collect();
86 if next_two == "]>" {
87 chars.next(); chars.next(); break;
90 }
91 }
92 cdata_content.push(ch);
93 }
94 result.push_str(&cdata_content);
95 result.push(' ');
96 continue;
97 }
98 }
99 in_tag = true;
100 }
101 '>' if in_tag => {
102 in_tag = false;
103 }
104 '&' if !in_tag => {
105 let mut entity = String::new();
107 let mut found_semicolon = false;
108 for _ in 0..10 {
109 if let Some(&ch) = chars.peek() {
110 if ch == ';' {
111 chars.next(); found_semicolon = true;
113 break;
114 } else if ch.is_alphanumeric() || ch == '#' {
115 entity.push(ch);
116 chars.next();
117 } else {
118 break;
119 }
120 } else {
121 break;
122 }
123 }
124 if found_semicolon {
125 if let Some(decoded) = decode_html_entity(&entity) {
126 result.push(decoded);
127 } else {
128 result.push('&');
130 result.push_str(&entity);
131 result.push(';');
132 }
133 } else {
134 result.push('&');
136 result.push_str(&entity);
137 }
138 }
139 _ if !in_tag => result.push(c),
140 _ => {}
141 }
142 }
143
144 result.split_whitespace().collect::<Vec<_>>().join(" ")
146}
147
/// Decodes the name of a character reference (the text between `&` and `;`).
///
/// Supported names are `lt`, `gt`, `amp`, `nbsp` (decoded to an ordinary
/// space), `quot` and `apos`, plus numeric references written as `#<decimal>`,
/// `#x<hex>` or `#X<hex>`. Returns `None` for any other name, for digits that
/// do not parse as `u32`, and for values that are not valid `char`s.
fn decode_html_entity(entity: &str) -> Option<char> {
    if let Some(reference) = entity.strip_prefix('#') {
        let parsed = match reference.strip_prefix(|c: char| c == 'x' || c == 'X') {
            Some(hex_digits) => u32::from_str_radix(hex_digits, 16),
            None => reference.parse::<u32>(),
        };
        return parsed.ok().and_then(char::from_u32);
    }

    let decoded = match entity {
        "lt" => '<',
        "gt" => '>',
        "amp" => '&',
        "nbsp" => ' ',
        "quot" => '"',
        "apos" => '\'',
        _ => return None,
    };
    Some(decoded)
}
172
173fn extract_all_strings(value: &Value) -> String {
177 let mut strings = Vec::new();
178 extract_strings_recursive(value, &mut strings);
179 strings.join(" ")
180}
181
182fn extract_strings_recursive(value: &Value, strings: &mut Vec<String>) {
184 match value {
185 Value::String(s) => {
186 if !s.is_empty() && !s.starts_with("http://") && !s.starts_with("https://") {
188 strings.push(s.clone());
189 }
190 }
191 Value::Array(arr) => {
192 for item in arr {
193 extract_strings_recursive(item, strings);
194 }
195 }
196 Value::Object(obj) => {
197 for (key, val) in obj {
198 if !matches!(
200 key.as_str(),
201 "resourceType" | "id" | "meta" | "extension" | "url" | "reference"
202 ) {
203 extract_strings_recursive(val, strings);
204 }
205 }
206 }
207 _ => {}
208 }
209}
210
211pub struct Fts5Search;
213
214impl Fts5Search {
215 pub const FTS_TABLE_NAME: &'static str = "search_index_fts";
217
218 pub fn create_table_sql() -> &'static str {
220 r#"
221 CREATE VIRTUAL TABLE IF NOT EXISTS search_index_fts USING fts5(
222 text_content,
223 content='search_index',
224 content_rowid='rowid',
225 tokenize='porter unicode61'
226 )
227 "#
228 }
229
230 pub fn create_triggers_sql() -> &'static str {
236 r#"
237 -- Trigger for INSERT (indexes value_string and value_token_display)
238 CREATE TRIGGER IF NOT EXISTS search_index_fts_insert AFTER INSERT ON search_index
239 WHEN new.value_string IS NOT NULL OR new.value_token_display IS NOT NULL
240 BEGIN
241 INSERT INTO search_index_fts(rowid, text_content)
242 VALUES (new.rowid, COALESCE(new.value_string, '') || ' ' || COALESCE(new.value_token_display, ''));
243 END;
244
245 -- Trigger for DELETE
246 CREATE TRIGGER IF NOT EXISTS search_index_fts_delete AFTER DELETE ON search_index
247 WHEN old.value_string IS NOT NULL OR old.value_token_display IS NOT NULL
248 BEGIN
249 INSERT INTO search_index_fts(search_index_fts, rowid, text_content)
250 VALUES ('delete', old.rowid, COALESCE(old.value_string, '') || ' ' || COALESCE(old.value_token_display, ''));
251 END;
252
253 -- Trigger for UPDATE
254 CREATE TRIGGER IF NOT EXISTS search_index_fts_update AFTER UPDATE ON search_index
255 WHEN old.value_string IS NOT NULL OR new.value_string IS NOT NULL
256 OR old.value_token_display IS NOT NULL OR new.value_token_display IS NOT NULL
257 BEGIN
258 INSERT INTO search_index_fts(search_index_fts, rowid, text_content)
259 VALUES ('delete', old.rowid, COALESCE(old.value_string, '') || ' ' || COALESCE(old.value_token_display, ''));
260 INSERT INTO search_index_fts(rowid, text_content)
261 VALUES (new.rowid, COALESCE(new.value_string, '') || ' ' || COALESCE(new.value_token_display, ''));
262 END;
263 "#
264 }
265
266 pub fn build_fts_query(search_term: &str, param_num: usize) -> SqlFragment {
270 SqlFragment::with_params(
271 format!(
272 "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
273 Self::FTS_TABLE_NAME,
274 Self::FTS_TABLE_NAME,
275 param_num
276 ),
277 vec![SqlParam::string(Self::escape_fts_query(search_term))],
278 )
279 }
280
281 pub fn build_phrase_query(phrase: &str, param_num: usize) -> SqlFragment {
283 let escaped = Self::escape_fts_query(phrase);
284 SqlFragment::with_params(
285 format!(
286 "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
287 Self::FTS_TABLE_NAME,
288 Self::FTS_TABLE_NAME,
289 param_num
290 ),
291 vec![SqlParam::string(format!("\"{}\"", escaped))],
292 )
293 }
294
295 pub fn build_prefix_query(prefix: &str, param_num: usize) -> SqlFragment {
297 let escaped = Self::escape_fts_query(prefix);
298 SqlFragment::with_params(
299 format!(
300 "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
301 Self::FTS_TABLE_NAME,
302 Self::FTS_TABLE_NAME,
303 param_num
304 ),
305 vec![SqlParam::string(format!("{}*", escaped))],
306 )
307 }
308
309 pub fn escape_fts_query(term: &str) -> String {
311 let mut result = String::with_capacity(term.len());
313 for c in term.chars() {
314 match c {
315 '"' | '*' | ':' | '^' | '(' | ')' | '+' | '-' | '~' => {
316 result.push(' ');
318 }
319 _ => result.push(c),
320 }
321 }
322 result.trim().to_string()
323 }
324
325 pub fn check_fts5_available_sql() -> &'static str {
329 "SELECT sqlite_compileoption_used('ENABLE_FTS5')"
330 }
331
332 pub fn rebuild_index_sql() -> String {
336 format!(
337 "INSERT INTO {}({}) VALUES ('rebuild')",
338 Self::FTS_TABLE_NAME,
339 Self::FTS_TABLE_NAME
340 )
341 }
342
343 pub fn optimize_index_sql() -> String {
345 format!(
346 "INSERT INTO {}({}) VALUES ('optimize')",
347 Self::FTS_TABLE_NAME,
348 Self::FTS_TABLE_NAME
349 )
350 }
351
352 pub fn build_advanced_query(query: &str, param_num: usize) -> SqlFragment {
365 let fts_query = Self::parse_advanced_query(query);
366 SqlFragment::with_params(
367 format!(
368 "rowid IN (SELECT rowid FROM {} WHERE {} MATCH ?{})",
369 Self::FTS_TABLE_NAME,
370 Self::FTS_TABLE_NAME,
371 param_num
372 ),
373 vec![SqlParam::string(fts_query)],
374 )
375 }
376
377 pub fn parse_advanced_query(query: &str) -> String {
388 let tokens = Self::tokenize_advanced_query(query);
389 Self::tokens_to_fts5(&tokens)
390 }
391
392 fn tokenize_advanced_query(query: &str) -> Vec<String> {
394 let mut tokens = Vec::new();
395 let chars = query.chars().peekable();
396 let mut current = String::new();
397 let mut in_quote = false;
398
399 for c in chars {
400 match c {
401 '"' => {
402 if in_quote {
403 if !current.is_empty() {
405 tokens.push(format!("\"{}\"", current));
406 current.clear();
407 }
408 in_quote = false;
409 } else {
410 if !current.is_empty() {
412 tokens.push(current.clone());
413 current.clear();
414 }
415 in_quote = true;
416 }
417 }
418 ' ' | '\t' | '\n' if !in_quote => {
419 if !current.is_empty() {
420 tokens.push(current.clone());
421 current.clear();
422 }
423 }
424 _ => {
425 current.push(c);
426 }
427 }
428 }
429
430 if !current.is_empty() {
432 if in_quote {
433 tokens.push(format!("\"{}\"", current));
435 } else {
436 tokens.push(current);
437 }
438 }
439
440 tokens
441 }
442
443 fn tokens_to_fts5(tokens: &[String]) -> String {
445 let mut result = Vec::new();
446 let mut i = 0;
447
448 while i < tokens.len() {
449 let token = &tokens[i];
450 let upper = token.to_uppercase();
451
452 if upper == "OR" || upper == "AND" {
454 result.push(upper);
456 } else if upper == "NOT" {
457 result.push("NOT".to_string());
459 } else if upper == "NEAR" || upper.starts_with("NEAR/") {
460 result.push(upper);
462 } else if token.starts_with('-') && token.len() > 1 {
463 result.push("NOT".to_string());
465 let term = &token[1..];
466 result.push(Self::escape_term_for_fts5(term));
467 } else if token.starts_with('"') {
468 let inner = token.trim_matches('"');
470 result.push(format!("\"{}\"", Self::escape_fts_query(inner)));
471 } else if token.ends_with('*') {
472 let base = &token[..token.len() - 1];
474 if !base.is_empty() {
475 result.push(format!("{}*", Self::escape_term_for_fts5(base)));
476 }
477 } else {
478 result.push(Self::escape_term_for_fts5(token));
480 }
481 i += 1;
482 }
483
484 Self::join_with_implicit_and(&result)
486 }
487
488 fn escape_term_for_fts5(term: &str) -> String {
490 Self::escape_fts_query(term)
491 }
492
493 fn join_with_implicit_and(terms: &[String]) -> String {
499 if terms.is_empty() {
500 return String::new();
501 }
502
503 let mut result = Vec::new();
504 let operators = ["OR", "AND", "NOT"];
505
506 for (i, term) in terms.iter().enumerate() {
507 result.push(term.clone());
508
509 if i < terms.len() - 1 {
511 let next = &terms[i + 1];
512 let current_is_op = operators.contains(&term.to_uppercase().as_str())
513 || term.to_uppercase().starts_with("NEAR");
514 let next_is_op = operators.contains(&next.to_uppercase().as_str())
515 || next.to_uppercase().starts_with("NEAR");
516
517 if !current_is_op && !next_is_op && next.to_uppercase() != "NOT" {
519 result.push("AND".to_string());
520 }
521 }
522 }
523
524 result.join(" ")
525 }
526}
527
// Unit tests for the search-content extraction helpers and the FTS5 query
// builders. Inputs that exercise entity decoding are written with explicit
// character references (`&lt;`, `&#65;`, ...) so that they reach
// `strip_html_tags` undecoded.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn test_escape_fts_query() {
        assert_eq!(Fts5Search::escape_fts_query("simple"), "simple");
        assert_eq!(Fts5Search::escape_fts_query("has\"quotes"), "has quotes");
        assert_eq!(Fts5Search::escape_fts_query("star*"), "star");
        assert_eq!(
            Fts5Search::escape_fts_query("complex:query+term"),
            "complex query term"
        );
    }

    #[test]
    fn test_build_fts_query() {
        let frag = Fts5Search::build_fts_query("smith", 1);

        assert!(frag.sql.contains("search_index_fts"));
        assert!(frag.sql.contains("MATCH"));
        assert_eq!(frag.params.len(), 1);
    }

    #[test]
    fn test_build_phrase_query() {
        let frag = Fts5Search::build_phrase_query("john smith", 1);

        assert!(frag.sql.contains("MATCH"));
    }

    #[test]
    fn test_build_prefix_query() {
        let frag = Fts5Search::build_prefix_query("smi", 1);

        assert!(frag.sql.contains("MATCH"));
    }

    #[test]
    fn test_parse_advanced_query_simple() {
        assert_eq!(Fts5Search::parse_advanced_query("headache"), "headache");
    }

    #[test]
    fn test_parse_advanced_query_multiple_terms() {
        // Terms without an explicit operator are joined with AND.
        assert_eq!(
            Fts5Search::parse_advanced_query("heart attack"),
            "heart AND attack"
        );
    }

    #[test]
    fn test_parse_advanced_query_phrase() {
        assert_eq!(
            Fts5Search::parse_advanced_query("\"heart attack\""),
            "\"heart attack\""
        );
    }

    #[test]
    fn test_parse_advanced_query_or() {
        assert_eq!(
            Fts5Search::parse_advanced_query("headache OR migraine"),
            "headache OR migraine"
        );
    }

    #[test]
    fn test_parse_advanced_query_prefix() {
        assert_eq!(Fts5Search::parse_advanced_query("cardio*"), "cardio*");
    }

    #[test]
    fn test_parse_advanced_query_not_minus() {
        // A leading minus is shorthand for NOT.
        assert_eq!(Fts5Search::parse_advanced_query("-surgery"), "NOT surgery");
    }

    #[test]
    fn test_parse_advanced_query_not_keyword() {
        assert_eq!(
            Fts5Search::parse_advanced_query("NOT surgery"),
            "NOT surgery"
        );
    }

    #[test]
    fn test_parse_advanced_query_near() {
        assert_eq!(
            Fts5Search::parse_advanced_query("heart NEAR attack"),
            "heart NEAR attack"
        );
    }

    #[test]
    fn test_parse_advanced_query_near_with_distance() {
        assert_eq!(
            Fts5Search::parse_advanced_query("heart NEAR/5 attack"),
            "heart NEAR/5 attack"
        );
    }

    #[test]
    fn test_parse_advanced_query_complex() {
        assert_eq!(
            Fts5Search::parse_advanced_query("heart OR cardiac -surgery"),
            "heart OR cardiac NOT surgery"
        );
    }

    #[test]
    fn test_parse_advanced_query_mixed() {
        assert_eq!(
            Fts5Search::parse_advanced_query("\"chest pain\" cardio* OR thoracic"),
            "\"chest pain\" AND cardio* OR thoracic"
        );
    }

    #[test]
    fn test_parse_advanced_query_case_insensitive_operators() {
        assert_eq!(
            Fts5Search::parse_advanced_query("heart or cardiac"),
            "heart OR cardiac"
        );
        assert_eq!(
            Fts5Search::parse_advanced_query("pain not chronic"),
            "pain NOT chronic"
        );
    }

    #[test]
    fn test_build_advanced_query() {
        let frag = Fts5Search::build_advanced_query("heart OR cardiac -surgery", 1);

        assert!(frag.sql.contains("search_index_fts"));
        assert!(frag.sql.contains("MATCH"));
        assert_eq!(frag.params.len(), 1);

        if let SqlParam::String(s) = &frag.params[0] {
            assert!(s.contains("OR"));
            assert!(s.contains("NOT"));
        }
    }

    #[test]
    fn test_strip_html_tags() {
        assert_eq!(strip_html_tags("<p>Hello</p>"), "Hello");
        assert_eq!(
            strip_html_tags("<div><p>Hello <b>world</b></p></div>"),
            "Hello world"
        );
        assert_eq!(strip_html_tags("No tags here"), "No tags here");
        assert_eq!(strip_html_tags("<br/>"), "");
        assert_eq!(
            strip_html_tags("<div xmlns=\"http://www.w3.org/1999/xhtml\">Test</div>"),
            "Test"
        );
    }

    #[test]
    fn test_strip_html_entities() {
        // Named entities
        assert_eq!(strip_html_tags("&lt;tag&gt;"), "<tag>");
        assert_eq!(strip_html_tags("Tom &amp; Jerry"), "Tom & Jerry");
        assert_eq!(
            strip_html_tags("He said &quot;hello&quot;"),
            "He said \"hello\""
        );
        assert_eq!(strip_html_tags("It&apos;s fine"), "It's fine");
        assert_eq!(strip_html_tags("Non&nbsp;breaking"), "Non breaking");

        // Decimal numeric entities
        assert_eq!(strip_html_tags("&#60;&#62;"), "<>");
        assert_eq!(strip_html_tags("&#65;&#66;&#67;"), "ABC");

        // Hexadecimal numeric entities (both `x` and `X` markers)
        assert_eq!(strip_html_tags("&#x3C;&#x3E;"), "<>");
        assert_eq!(strip_html_tags("&#x41;&#x42;&#x43;"), "ABC");
        assert_eq!(strip_html_tags("&#X41;&#X42;"), "AB");

        // Entities mixed with markup
        assert_eq!(
            strip_html_tags("<p>Price: &lt;$100 &amp; discount</p>"),
            "Price: <$100 & discount"
        );
    }

    #[test]
    fn test_strip_html_cdata() {
        assert_eq!(
            strip_html_tags("<![CDATA[Some raw content]]>"),
            "Some raw content"
        );
        assert_eq!(
            strip_html_tags("<div><![CDATA[Inner CDATA]]></div>"),
            "Inner CDATA"
        );
        assert_eq!(
            strip_html_tags("Before <![CDATA[inside]]> after"),
            "Before inside after"
        );
        // Markup inside CDATA is content and must be preserved as-is.
        assert_eq!(
            strip_html_tags("<![CDATA[<script>alert('hi')</script>]]>"),
            "<script>alert('hi')</script>"
        );
    }

    #[test]
    fn test_strip_html_edge_cases() {
        // A bare ampersand is not an entity.
        assert_eq!(strip_html_tags("a & b"), "a & b");
        // Unknown entities are passed through unchanged.
        assert_eq!(strip_html_tags("a &unknown; b"), "a &unknown; b");

        // Empty input
        assert_eq!(strip_html_tags(""), "");

        // Whitespace-only input
        assert_eq!(strip_html_tags("   "), "");

        // Self-closing tags
        assert_eq!(strip_html_tags("<br/><hr/>text"), "text");

        // A narrative-style fragment with entities and line breaks.
        let fhir_narrative = r#"<div xmlns="http://www.w3.org/1999/xhtml">
            <p>Patient: John Smith &amp; family</p>
            <p>DOB: &lt;1970-01-15&gt;</p>
        </div>"#;
        assert_eq!(
            strip_html_tags(fhir_narrative),
            "Patient: John Smith & family DOB: <1970-01-15>"
        );
    }

    #[test]
    fn test_decode_html_entity() {
        assert_eq!(decode_html_entity("lt"), Some('<'));
        assert_eq!(decode_html_entity("gt"), Some('>'));
        assert_eq!(decode_html_entity("amp"), Some('&'));
        assert_eq!(decode_html_entity("nbsp"), Some(' '));
        assert_eq!(decode_html_entity("quot"), Some('"'));
        assert_eq!(decode_html_entity("apos"), Some('\''));

        // Decimal
        assert_eq!(decode_html_entity("#65"), Some('A'));
        assert_eq!(decode_html_entity("#97"), Some('a'));

        // Hexadecimal
        assert_eq!(decode_html_entity("#x41"), Some('A'));
        assert_eq!(decode_html_entity("#X41"), Some('A'));
        assert_eq!(decode_html_entity("#x1F600"), Some('😀'));

        // Invalid
        assert_eq!(decode_html_entity("unknown"), None);
        assert_eq!(decode_html_entity("#invalid"), None);
    }

    #[test]
    fn test_extract_narrative() {
        let patient = json!({
            "resourceType": "Patient",
            "text": {
                "status": "generated",
                "div": "<div xmlns=\"http://www.w3.org/1999/xhtml\"><p>John Smith, born 1970-01-15</p></div>"
            }
        });

        let narrative = extract_narrative(&patient);
        assert!(narrative.contains("John Smith"));
        assert!(narrative.contains("born"));
        assert!(!narrative.contains("<"));
    }

    #[test]
    fn test_extract_narrative_no_text() {
        let patient = json!({
            "resourceType": "Patient",
            "name": [{"family": "Smith"}]
        });

        let narrative = extract_narrative(&patient);
        assert!(narrative.is_empty());
    }

    #[test]
    fn test_extract_all_strings() {
        let patient = json!({
            "resourceType": "Patient",
            "id": "123",
            "name": [{
                "family": "Smith",
                "given": ["John", "James"]
            }],
            "address": [{
                "city": "Boston",
                "state": "MA"
            }]
        });

        let content = extract_all_strings(&patient);
        assert!(content.contains("Smith"));
        assert!(content.contains("John"));
        assert!(content.contains("James"));
        assert!(content.contains("Boston"));
        // `resourceType` is one of the skipped keys.
        assert!(!content.contains("Patient"));
    }

    #[test]
    fn test_extract_searchable_content() {
        let patient = json!({
            "resourceType": "Patient",
            "text": {
                "div": "<div>John Smith from Boston</div>"
            },
            "name": [{"family": "Smith", "given": ["John"]}],
            "address": [{"city": "Boston"}]
        });

        let content = extract_searchable_content(&patient);
        assert!(!content.is_empty());
        assert!(content.narrative.contains("John Smith"));
        assert!(content.full_content.contains("Smith"));
        assert!(content.full_content.contains("Boston"));
    }

    #[test]
    fn test_searchable_content_is_empty() {
        let content = SearchableContent::new();
        assert!(content.is_empty());

        let content = SearchableContent {
            narrative: "test".to_string(),
            full_content: String::new(),
        };
        assert!(!content.is_empty());
    }
}