1use ahash::AHashMap;
2use terraphim_automata::builder::{Logseq, ThesaurusBuilder};
3use terraphim_automata::load_thesaurus;
4use terraphim_automata::{LinkType, replace_matches};
5use terraphim_config::{ConfigState, Role};
6use terraphim_middleware::thesaurus::build_thesaurus_from_haystack;
7use terraphim_persistence::Persistable;
8use terraphim_rolegraph::{RoleGraph, RoleGraphSync};
9use terraphim_types::{
10 Document, Index, IndexedDocument, Layer, NormalizedTermValue, RelevanceFunction, RoleName,
11 SearchQuery, Thesaurus,
12};
13mod score;
14use crate::score::Query;
15
16#[cfg(feature = "openrouter")]
17pub mod openrouter;
18
19pub mod llm;
21
22pub mod llm_proxy;
28
29pub mod http_client;
33
34pub mod logging;
36
37pub mod conversation_service;
39pub mod rate_limiter;
40pub mod summarization_manager;
41pub mod summarization_queue;
42pub mod summarization_worker;
43
44pub mod error;
46
47pub mod context;
49
50#[cfg(test)]
51mod context_tests;
52
/// Normalizes a filename into a document ID.
///
/// Every character outside `[a-zA-Z0-9]` is dropped and the remaining ASCII
/// letters are lowercased, so `"My-File_1.md"` becomes `"myfile1md"`.
///
/// This yields the same string as stripping `[^a-zA-Z0-9]+` with a regex and
/// lowercasing the remainder, without compiling a regex on every call.
fn normalize_filename_to_id(filename: &str) -> String {
    filename
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
60
61#[derive(thiserror::Error, Debug)]
62pub enum ServiceError {
63 #[error("Middleware error: {0}")]
64 Middleware(#[from] terraphim_middleware::Error),
65
66 #[error("OpenDal error: {0}")]
67 OpenDal(Box<opendal::Error>),
68
69 #[error("Persistence error: {0}")]
70 Persistence(#[from] terraphim_persistence::Error),
71
72 #[error("Config error: {0}")]
73 Config(String),
74
75 #[cfg(feature = "openrouter")]
76 #[error("OpenRouter error: {0}")]
77 OpenRouter(#[from] crate::openrouter::OpenRouterError),
78
79 #[error("Common error: {0}")]
80 Common(#[from] crate::error::CommonError),
81}
82
83impl From<opendal::Error> for ServiceError {
84 fn from(err: opendal::Error) -> Self {
85 ServiceError::OpenDal(Box::new(err))
86 }
87}
88
89impl crate::error::TerraphimError for ServiceError {
90 fn category(&self) -> crate::error::ErrorCategory {
91 use crate::error::ErrorCategory;
92 match self {
93 ServiceError::Middleware(_) => ErrorCategory::Integration,
94 ServiceError::OpenDal(_) => ErrorCategory::Storage,
95 ServiceError::Persistence(_) => ErrorCategory::Storage,
96 ServiceError::Config(_) => ErrorCategory::Configuration,
97 #[cfg(feature = "openrouter")]
98 ServiceError::OpenRouter(_) => ErrorCategory::Integration,
99 ServiceError::Common(err) => err.category(),
100 }
101 }
102
103 fn is_recoverable(&self) -> bool {
104 match self {
105 ServiceError::Middleware(_) => true,
106 ServiceError::OpenDal(_) => false,
107 ServiceError::Persistence(_) => false,
108 ServiceError::Config(_) => false,
109 #[cfg(feature = "openrouter")]
110 ServiceError::OpenRouter(_) => true,
111 ServiceError::Common(err) => err.is_recoverable(),
112 }
113 }
114}
115
116pub type Result<T> = std::result::Result<T, ServiceError>;
117
118pub struct TerraphimService {
119 config_state: ConfigState,
120}
121
122impl TerraphimService {
123 pub fn new(config_state: ConfigState) -> Self {
125 Self { config_state }
126 }
127
128 async fn build_thesaurus(&mut self, search_query: &SearchQuery) -> Result<()> {
130 Ok(build_thesaurus_from_haystack(&mut self.config_state, search_query).await?)
131 }
132 pub async fn ensure_thesaurus_loaded(&mut self, role_name: &RoleName) -> Result<Thesaurus> {
134 async fn load_thesaurus_from_automata_path(
135 config_state: &ConfigState,
136 role_name: &RoleName,
137 rolegraphs: &mut AHashMap<RoleName, RoleGraphSync>,
138 ) -> Result<Thesaurus> {
139 let config = config_state.config.lock().await;
140 let Some(role) = config.roles.get(role_name).cloned() else {
141 return Err(ServiceError::Config(format!(
142 "Role '{}' not found in config",
143 role_name
144 )));
145 };
146 if let Some(kg) = &role.kg {
147 if let Some(automata_path) = &kg.automata_path {
148 log::info!("Loading Role `{}` - URL: {:?}", role_name, automata_path);
149
150 match load_thesaurus(automata_path).await {
152 Ok(mut thesaurus) => {
153 log::info!("Successfully loaded thesaurus from automata path");
154
155 match thesaurus.save().await {
157 Ok(_) => {
158 log::info!(
159 "Thesaurus for role `{}` saved to persistence",
160 role_name
161 );
162 match thesaurus.load().await {
164 Ok(persisted_thesaurus) => {
165 thesaurus = persisted_thesaurus;
166 log::debug!("Reloaded thesaurus from persistence");
167 }
168 Err(e) => {
169 log::warn!(
170 "Failed to reload thesaurus from persistence, using in-memory version: {:?}",
171 e
172 );
173 }
174 }
175 }
176 Err(e) => {
177 log::warn!("Failed to save thesaurus to persistence: {:?}", e);
178 }
179 }
180
181 let rolegraph =
182 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
183 match rolegraph {
184 Ok(rolegraph) => {
185 let rolegraph_value = RoleGraphSync::from(rolegraph);
186 rolegraphs.insert(role_name.clone(), rolegraph_value);
187 }
188 Err(e) => {
189 log::error!("Failed to update role and thesaurus: {:?}", e)
190 }
191 }
192 Ok(thesaurus)
193 }
194 Err(e) => {
195 log::warn!("Failed to load thesaurus from automata path: {:?}", e);
196 if let Some(kg_local) = &kg.knowledge_graph_local {
198 log::info!(
199 "Fallback: building thesaurus from local KG for role {}",
200 role_name
201 );
202 let logseq_builder = Logseq::default();
203 match logseq_builder
204 .build(
205 role_name.as_lowercase().to_string(),
206 kg_local.path.clone(),
207 )
208 .await
209 {
210 Ok(mut thesaurus) => {
211 match thesaurus.save().await {
213 Ok(_) => {
214 log::info!(
215 "Fallback thesaurus for role `{}` saved to persistence",
216 role_name
217 );
218 match thesaurus.load().await {
220 Ok(persisted_thesaurus) => {
221 thesaurus = persisted_thesaurus;
222 log::debug!(
223 "Reloaded fallback thesaurus from persistence"
224 );
225 }
226 Err(e) => {
227 log::warn!(
228 "Failed to reload fallback thesaurus from persistence, using in-memory version: {:?}",
229 e
230 );
231 }
232 }
233 }
234 Err(e) => {
235 log::warn!(
236 "Failed to save fallback thesaurus to persistence: {:?}",
237 e
238 );
239 }
240 }
241
242 let rolegraph =
243 RoleGraph::new(role_name.clone(), thesaurus.clone())
244 .await;
245 match rolegraph {
246 Ok(rolegraph) => {
247 let rolegraph_value =
248 RoleGraphSync::from(rolegraph);
249 rolegraphs
250 .insert(role_name.clone(), rolegraph_value);
251 }
252 Err(e) => log::error!(
253 "Failed to update role and thesaurus: {:?}",
254 e
255 ),
256 }
257
258 Ok(thesaurus)
259 }
260 Err(e) => {
261 let is_file_not_found =
264 e.to_string().contains("file not found")
265 || e.to_string().contains("not found:");
266
267 if is_file_not_found {
268 log::debug!(
269 "Failed to build thesaurus from local KG (optional file not found) for role {}: {:?}",
270 role_name,
271 e
272 );
273 } else {
274 log::error!(
275 "Failed to build thesaurus from local KG for role {}: {:?}",
276 role_name,
277 e
278 );
279 }
280 Err(ServiceError::Config(
281 "Failed to load or build thesaurus".into(),
282 ))
283 }
284 }
285 } else {
286 log::error!(
287 "No fallback available for role {}: no local KG path configured",
288 role_name
289 );
290 Err(ServiceError::Config(
291 "No automata path and no local KG available".into(),
292 ))
293 }
294 }
295 }
296 } else if let Some(kg_local) = &kg.knowledge_graph_local {
297 log::info!(
299 "Role {} has no automata_path, building thesaurus from local KG files at {:?}",
300 role_name,
301 kg_local.path
302 );
303 let logseq_builder = Logseq::default();
304 match logseq_builder
305 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
306 .await
307 {
308 Ok(mut thesaurus) => {
309 log::info!(
310 "Successfully built thesaurus from local KG for role {}",
311 role_name
312 );
313
314 match thesaurus.save().await {
316 Ok(_) => {
317 log::info!(
318 "Local KG thesaurus for role `{}` saved to persistence",
319 role_name
320 );
321 match thesaurus.load().await {
323 Ok(persisted_thesaurus) => {
324 log::info!(
325 "Reloaded local KG thesaurus from persistence: {} entries",
326 persisted_thesaurus.len()
327 );
328 thesaurus = persisted_thesaurus;
329 }
330 Err(e) => {
331 log::warn!(
332 "Failed to reload local KG thesaurus from persistence, using in-memory version: {:?}",
333 e
334 );
335 }
336 }
337 }
338 Err(e) => {
339 log::warn!(
340 "Failed to save local KG thesaurus to persistence: {:?}",
341 e
342 );
343 }
344 }
345
346 let rolegraph =
347 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
348 match rolegraph {
349 Ok(rolegraph) => {
350 let rolegraph_value = RoleGraphSync::from(rolegraph);
351 rolegraphs.insert(role_name.clone(), rolegraph_value);
352 }
353 Err(e) => {
354 log::error!("Failed to update role and thesaurus: {:?}", e)
355 }
356 }
357
358 Ok(thesaurus)
359 }
360 Err(e) => {
361 let is_file_not_found = e.to_string().contains("file not found");
364
365 if is_file_not_found {
366 log::debug!(
367 "Failed to build thesaurus from local KG (optional file not found) for role {}: {:?}",
368 role_name,
369 e
370 );
371 } else {
372 log::error!(
373 "Failed to build thesaurus from local KG for role {}: {:?}",
374 role_name,
375 e
376 );
377 }
378 Err(ServiceError::Config(format!(
379 "Failed to build thesaurus from local KG for role {}: {}",
380 role_name, e
381 )))
382 }
383 }
384 } else {
385 log::warn!(
386 "Role {} is configured for TerraphimGraph but has neither automata_path nor knowledge_graph_local defined.",
387 role_name
388 );
389 if let Some(kg_local) = &kg.knowledge_graph_local {
390 log::info!(
392 "Building thesaurus from local KG files for role {} at {:?}",
393 role_name,
394 kg_local.path
395 );
396 let logseq_builder = Logseq::default();
397 match logseq_builder
398 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
399 .await
400 {
401 Ok(mut thesaurus) => {
402 log::info!(
403 "Successfully built thesaurus from local KG for role {}",
404 role_name
405 );
406
407 match thesaurus.save().await {
409 Ok(_) => {
410 log::info!(
411 "No-automata thesaurus for role `{}` saved to persistence",
412 role_name
413 );
414 match thesaurus.load().await {
416 Ok(persisted_thesaurus) => {
417 thesaurus = persisted_thesaurus;
418 log::debug!(
419 "Reloaded no-automata thesaurus from persistence"
420 );
421 }
422 Err(e) => {
423 log::warn!(
424 "Failed to reload no-automata thesaurus from persistence, using in-memory version: {:?}",
425 e
426 );
427 }
428 }
429 }
430 Err(e) => {
431 log::warn!(
432 "Failed to save no-automata thesaurus to persistence: {:?}",
433 e
434 );
435 }
436 }
437
438 let rolegraph =
439 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
440 match rolegraph {
441 Ok(rolegraph) => {
442 let rolegraph_value = RoleGraphSync::from(rolegraph);
443 rolegraphs.insert(role_name.clone(), rolegraph_value);
444 }
445 Err(e) => {
446 let is_file_not_found =
449 e.to_string().contains("file not found");
450
451 if is_file_not_found {
452 log::debug!(
453 "Failed to update role and thesaurus (optional file not found): {:?}",
454 e
455 );
456 } else {
457 log::error!(
458 "Failed to update role and thesaurus: {:?}",
459 e
460 );
461 }
462 }
463 }
464
465 Ok(thesaurus)
466 }
467 Err(e) => {
468 log::error!(
469 "Failed to build thesaurus from local KG for role {}: {:?}",
470 role_name,
471 e
472 );
473 Err(ServiceError::Config(
474 "Failed to build thesaurus from local KG".into(),
475 ))
476 }
477 }
478 } else {
479 Err(ServiceError::Config(
480 "No local knowledge graph path available".into(),
481 ))
482 }
483 }
484 } else {
485 Err(ServiceError::Config(
486 "Knowledge graph not configured".into(),
487 ))
488 }
489 }
490
491 log::debug!("Loading thesaurus for role: {}", role_name);
492 log::debug!("Role keys {:?}", self.config_state.roles.keys());
493
494 if let Some(rolegraph_value) = self.config_state.roles.get(role_name) {
495 let thesaurus_result = rolegraph_value.lock().await.thesaurus.clone().load().await;
496 match thesaurus_result {
497 Ok(thesaurus) => {
498 log::debug!("Thesaurus loaded: {:?}", thesaurus);
499 log::info!("Rolegraph loaded: for role name {:?}", role_name);
500 Ok(thesaurus)
501 }
502 Err(e) => {
503 let is_file_not_found = e.to_string().contains("file not found")
506 || e.to_string().contains("not found:");
507
508 if is_file_not_found {
509 log::debug!("Thesaurus file not found (optional): {:?}", e);
510 } else {
511 log::error!("Failed to load thesaurus: {:?}", e);
512 }
513 let mut rolegraphs = self.config_state.roles.clone();
515 let result = load_thesaurus_from_automata_path(
516 &self.config_state,
517 role_name,
518 &mut rolegraphs,
519 )
520 .await;
521
522 if result.is_ok() {
524 if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
525 self.config_state
526 .roles
527 .insert(role_name.clone(), updated_rolegraph.clone());
528 log::info!(
529 "Updated config_state with new rolegraph for role: {}",
530 role_name
531 );
532 }
533 }
534
535 result
536 }
537 }
538 } else {
539 let mut rolegraphs = self.config_state.roles.clone();
541 let result =
542 load_thesaurus_from_automata_path(&self.config_state, role_name, &mut rolegraphs)
543 .await;
544
545 if result.is_ok() {
547 if let Some(new_rolegraph) = rolegraphs.get(role_name) {
548 self.config_state
549 .roles
550 .insert(role_name.clone(), new_rolegraph.clone());
551 log::info!(
552 "Added new rolegraph to config_state for role: {}",
553 role_name
554 );
555 }
556 }
557
558 result
559 }
560 }
561
562 pub async fn preprocess_document_content(
568 &mut self,
569 mut document: Document,
570 role: &Role,
571 ) -> Result<Document> {
572 if !role.terraphim_it {
574 log::info!(
575 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing",
576 role.name
577 );
578 return Ok(document);
579 }
580
581 let Some(_kg) = &role.kg else {
582 log::info!(
583 "⚠️ No KG configured for role '{}', skipping KG preprocessing",
584 role.name
585 );
586 return Ok(document);
587 };
588
589 log::info!(
590 "🧠 Starting KG preprocessing for document '{}' in role '{}' (terraphim_it enabled)",
591 document.title,
592 role.name
593 );
594 log::debug!(
595 "📄 Document preview: {} characters starting with: {}",
596 document.body.len(),
597 &document.body.chars().take(100).collect::<String>()
598 );
599
600 let thesaurus = match self.ensure_thesaurus_loaded(&role.name).await {
602 Ok(thesaurus) => thesaurus,
603 Err(e) => {
604 log::warn!("Failed to load thesaurus for role {}: {:?}", role.name, e);
605 return Ok(document); }
607 };
608
609 let mut kg_thesaurus = Thesaurus::new(format!("kg_links_{}", role.name));
611
612 let important_kg_terms = [
615 "graph",
616 "haystack",
617 "service",
618 "terraphim",
619 "knowledge",
620 "embedding",
621 "search",
622 "automata",
623 "thesaurus",
624 "rolegraph",
625 ];
626
627 let excluded_common_terms = [
629 "system",
630 "config",
631 "configuration",
632 "type",
633 "method",
634 "function",
635 "class",
636 "component",
637 "module",
638 "library",
639 "framework",
640 "interface",
641 "api",
642 "data",
643 "file",
644 "path",
645 "url",
646 "string",
647 "number",
648 "value",
649 "option",
650 "parameter",
651 "field",
652 "property",
653 "attribute",
654 "element",
655 "item",
656 "object",
657 "array",
658 "list",
659 "map",
660 "set",
661 "collection",
662 "server",
663 "client",
664 "request",
665 "response",
666 "error",
667 "result",
668 "success",
669 "failure",
670 "true",
671 "false",
672 "null",
673 "undefined",
674 "empty",
675 "full",
676 "start",
677 "end",
678 "begin",
679 "finish",
680 "create",
681 "delete",
682 "update",
683 "read",
684 "write",
685 "load",
686 "save",
687 "process",
688 "handle",
689 "manage",
690 "control",
691 "execute",
692 "run",
693 "call",
694 "invoke",
695 "trigger",
696 "event",
697 "action",
698 "command",
699 "query",
700 "search",
701 "filter",
702 "sort",
703 "order",
704 "group",
705 "match",
706 "find",
707 "replace",
708 "insert",
709 "remove",
710 "add",
711 "set",
712 "get",
713 "put",
714 "post",
715 "head",
716 "patch",
717 "delete",
718 ];
719
720 let mut sorted_terms: Vec<_> = (&thesaurus)
721 .into_iter()
722 .filter(|(key, _)| {
723 let term = key.as_str();
724
725 if term.is_empty() || term.len() < 3 {
727 return false;
728 }
729
730 if important_kg_terms.contains(&term) {
732 return true;
733 }
734
735 if excluded_common_terms.contains(&term) {
737 return false;
738 }
739
740 term.len() > 5
746 || term.contains('-')
747 || term.contains('_')
748 || term.chars().next().is_some_and(|c| c.is_uppercase())
749 })
750 .collect();
751
752 sorted_terms.sort_by(|a, b| {
754 let a_important = important_kg_terms.contains(&a.0.as_str());
755 let b_important = important_kg_terms.contains(&b.0.as_str());
756
757 match (a_important, b_important) {
758 (true, false) => std::cmp::Ordering::Less, (false, true) => std::cmp::Ordering::Greater, _ => b.1.id.cmp(&a.1.id), }
762 });
763
764 let max_kg_terms = 8;
766 for (key, value) in sorted_terms.into_iter().take(max_kg_terms) {
767 let mut kg_value = value.clone();
768 kg_value.value = key.clone(); kg_value.url = Some(format!("kg:{}", value.value)); kg_thesaurus.insert(key.clone(), kg_value);
774 }
775
776 let kg_terms_count = kg_thesaurus.len();
777 log::info!(
778 "📋 KG thesaurus filtering: {} → {} terms (prioritizing: {}, filters: len>5, hyphenated, or important KG terms)",
779 thesaurus.len(),
780 kg_terms_count,
781 important_kg_terms.join(", ")
782 );
783
784 if kg_terms_count > 0 {
786 let terms: Vec<String> = (&kg_thesaurus)
787 .into_iter()
788 .map(|(k, v)| format!("'{}' → kg:{}", k, v.value))
789 .collect();
790 log::info!("🔍 KG terms selected for linking: {}", terms.join(", "));
791 } else {
792 log::info!(
793 "⚠️ No KG terms passed filtering criteria - document '{}' will have no KG links",
794 document.title
795 );
796 }
797
798 if !kg_thesaurus.is_empty() {
800 let debug_thesaurus: Vec<String> = (&kg_thesaurus)
802 .into_iter()
803 .map(|(k, v)| format!("'{}' -> '{}' (url: {:?})", k, v.value, v.url))
804 .take(3) .collect();
806 log::info!(
807 "🔧 Passing to replace_matches: {} (total terms: {})",
808 debug_thesaurus.join(", "),
809 kg_thesaurus.len()
810 );
811 let preview = if document.body.chars().count() > 200 {
812 document.body.chars().take(200).collect::<String>() + "..."
813 } else {
814 document.body.clone()
815 };
816 log::info!("📝 Document body preview (first 200 chars): {}", preview);
817
818 match replace_matches(&document.body, kg_thesaurus, LinkType::MarkdownLinks) {
819 Ok(processed_bytes) => {
820 match String::from_utf8(processed_bytes) {
821 Ok(processed_content) => {
822 log::info!(
823 "✅ Successfully preprocessed document '{}' with {} KG terms → created [term](kg:concept) links",
824 document.title,
825 kg_terms_count
826 );
827
828 let content_changed = processed_content != document.body;
830 log::info!(
831 "🔄 Content changed: {} (original: {} chars, processed: {} chars)",
832 content_changed,
833 document.body.len(),
834 processed_content.len()
835 );
836
837 let kg_links: Vec<&str> = processed_content
839 .split("[")
840 .filter_map(|s| s.find("](kg:").map(|closing| &s[..closing]))
841 .collect();
842
843 if !kg_links.is_empty() {
844 log::info!(
845 "🔗 Found KG links in processed content: [{}](kg:...)",
846 kg_links.join("], [")
847 );
848
849 if let Some(first_link_pos) = processed_content.find("](kg:") {
851 let start = first_link_pos.saturating_sub(50);
852 let end = (first_link_pos + 100).min(processed_content.len());
853 log::info!(
854 "📄 Content snippet with KG link: ...{}...",
855 &processed_content[start..end]
856 );
857 }
858 } else {
859 log::warn!(
860 "⚠️ No KG links found in processed content despite successful replacement"
861 );
862 }
863
864 document.body = processed_content;
865 }
866 Err(e) => {
867 log::warn!(
868 "Failed to convert processed content to UTF-8 for document '{}': {:?}",
869 document.title,
870 e
871 );
872 }
873 }
874 }
875 Err(e) => {
876 log::warn!(
877 "Failed to replace KG terms in document '{}': {:?}",
878 document.title,
879 e
880 );
881 }
882 }
883 } else {
884 log::info!(
885 "💭 No specific KG terms found for document '{}' (filters excluded generic terms)",
886 document.title
887 );
888 }
889
890 Ok(document)
891 }
892
893 pub async fn preprocess_document_content_with_search(
895 &mut self,
896 document: Document,
897 role: &Role,
898 search_query: Option<&SearchQuery>,
899 ) -> Result<Document> {
900 let mut processed_doc = self.preprocess_document_content(document, role).await?;
902
903 if let Some(query) = search_query {
905 log::debug!(
906 "Applying search term highlighting to document '{}'",
907 processed_doc.title
908 );
909 processed_doc.body = Self::highlight_search_terms(&processed_doc.body, query);
910 }
911
912 Ok(processed_doc)
913 }
914
915 pub async fn create_document(&mut self, document: Document) -> Result<Document> {
917 document.save().await?;
920
921 self.config_state.add_to_roles(&document).await?;
924
925 use terraphim_config::ServiceType;
929 use terraphim_middleware::indexer::RipgrepIndexer;
930
931 let ripgrep = RipgrepIndexer::default();
932 let config_snapshot = { self.config_state.config.lock().await.clone() };
933
934 for role in config_snapshot.roles.values() {
935 for haystack in &role.haystacks {
936 if haystack.service == ServiceType::Ripgrep && !haystack.read_only {
937 if let Err(e) = ripgrep.update_document(&document).await {
938 log::warn!(
939 "Failed to write document {} to haystack {:?}: {:?}",
940 document.id,
941 haystack.location,
942 e
943 );
944 }
945 }
946 }
947 }
948
949 Ok(document)
950 }
951
952 pub async fn get_document_by_id(&mut self, document_id: &str) -> Result<Option<Document>> {
958 log::debug!("Getting document by ID: '{}'", document_id);
959
960 if document_id.trim().is_empty() {
962 log::warn!("Empty or whitespace-only document_id provided");
963 return Ok(None);
964 }
965
966 let mut placeholder = Document {
968 id: document_id.to_string(),
969 ..Default::default()
970 };
971 match placeholder.load().await {
972 Ok(doc) => {
973 log::debug!("Found document '{}' with direct ID lookup", document_id);
974 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
975 }
976 Err(e) => {
977 log::debug!(
978 "Document '{}' not found with direct lookup: {:?}",
979 document_id,
980 e
981 );
982 }
983 }
984
985 if document_id.contains('.') || document_id.contains('-') || document_id.contains('_') {
987 let normalized_id = normalize_filename_to_id(document_id);
988 log::debug!(
989 "Trying normalized ID '{}' for filename '{}'",
990 normalized_id,
991 document_id
992 );
993
994 let mut normalized_placeholder = Document {
995 id: normalized_id.clone(),
996 ..Default::default()
997 };
998 match normalized_placeholder.load().await {
999 Ok(doc) => {
1000 log::debug!(
1001 "Found document '{}' with normalized ID '{}'",
1002 document_id,
1003 normalized_id
1004 );
1005 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1006 }
1007 Err(e) => {
1008 log::debug!(
1009 "Document '{}' not found with normalized ID '{}': {:?}",
1010 document_id,
1011 normalized_id,
1012 e
1013 );
1014 }
1015 }
1016 }
1017
1018 log::debug!("Falling back to search for document '{}'", document_id);
1020 let search_query = SearchQuery {
1021 search_term: NormalizedTermValue::new(document_id.to_string()),
1022 search_terms: None,
1023 operator: None,
1024 limit: Some(5), skip: None,
1026 role: None,
1027 layer: Layer::default(),
1028 include_pinned: false,
1029 };
1030
1031 let documents = self.search(&search_query).await?;
1032
1033 for doc in documents {
1035 if doc.title == document_id || doc.id == document_id {
1036 log::debug!("Found document '{}' via search fallback", document_id);
1037 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1038 }
1039 }
1040
1041 log::debug!("Document '{}' not found anywhere", document_id);
1042 Ok(None)
1043 }
1044
1045 async fn apply_kg_preprocessing_if_needed(&mut self, document: Document) -> Result<Document> {
1051 log::debug!(
1052 "🔍 [KG-DEBUG] apply_kg_preprocessing_if_needed called for document: '{}'",
1053 document.title
1054 );
1055 log::debug!(
1056 "🔍 [KG-DEBUG] Document body preview: {}",
1057 document.body.chars().take(100).collect::<String>()
1058 );
1059
1060 let role = {
1061 let config = self.config_state.config.lock().await;
1062 let selected_role = &config.selected_role;
1063
1064 log::debug!("🔍 [KG-DEBUG] Selected role: '{}'", selected_role);
1065
1066 match config.roles.get(selected_role) {
1067 Some(role) => {
1068 log::debug!(
1069 "🔍 [KG-DEBUG] Role found: '{}', terraphim_it: {}",
1070 role.name,
1071 role.terraphim_it
1072 );
1073 role.clone() }
1075 None => {
1076 log::warn!(
1077 "❌ [KG-DEBUG] Selected role '{}' not found in config, skipping KG preprocessing",
1078 selected_role
1079 );
1080 return Ok(document);
1081 }
1082 }
1083 }; if !role.terraphim_it {
1087 log::info!(
1088 "🔍 [KG-DEBUG] terraphim_it disabled for role '{}', skipping KG preprocessing",
1089 role.name
1090 );
1091 return Ok(document);
1092 }
1093
1094 let has_existing_kg_links = document.body.contains("](kg:");
1096 log::debug!(
1097 "🔍 [KG-DEBUG] Document already has KG links: {}",
1098 has_existing_kg_links
1099 );
1100 if has_existing_kg_links {
1101 log::info!(
1102 "🔍 [KG-DEBUG] Document '{}' already has KG links, skipping preprocessing to prevent double processing",
1103 document.title
1104 );
1105 return Ok(document);
1106 }
1107
1108 log::info!(
1109 "🧠 [KG-DEBUG] Starting KG preprocessing for document '{}' with role '{}' (terraphim_it enabled)",
1110 document.title,
1111 role.name
1112 );
1113
1114 let document_title = document.title.clone(); let processed_doc = match self.preprocess_document_content(document, &role).await {
1117 Ok(doc) => {
1118 let links_added = doc.body.contains("](kg:");
1119 log::info!(
1120 "✅ [KG-DEBUG] KG preprocessing completed for document '{}'. Links added: {}",
1121 doc.title,
1122 links_added
1123 );
1124 if links_added {
1125 log::debug!(
1126 "🔍 [KG-DEBUG] Processed body preview: {}",
1127 doc.body.chars().take(200).collect::<String>()
1128 );
1129 }
1130 doc
1131 }
1132 Err(e) => {
1133 log::error!(
1134 "❌ [KG-DEBUG] KG preprocessing failed for document '{}': {:?}",
1135 document_title,
1136 e
1137 );
1138 return Err(e);
1139 }
1140 };
1141
1142 Ok(processed_doc)
1143 }
1144
1145 #[allow(dead_code)] async fn enhance_descriptions_with_ai(
1151 &self,
1152 mut documents: Vec<Document>,
1153 role: &Role,
1154 ) -> Result<Vec<Document>> {
1155 use crate::llm::{SummarizeOptions, build_llm_from_role};
1156
1157 eprintln!("🤖 Attempting to build LLM client for role: {}", role.name);
1158 let llm = match build_llm_from_role(role) {
1159 Some(client) => {
1160 eprintln!("✅ LLM client successfully created: {}", client.name());
1161 client
1162 }
1163 None => {
1164 eprintln!("❌ No LLM client available for role: {}", role.name);
1165 return Ok(documents);
1166 }
1167 };
1168
1169 log::info!(
1170 "Enhancing {} document descriptions with LLM provider: {}",
1171 documents.len(),
1172 llm.name()
1173 );
1174
1175 let mut enhanced_count = 0;
1176 let mut error_count = 0;
1177
1178 for document in &mut documents {
1179 if self.should_generate_ai_summary(document) {
1180 let summary_length = 250;
1181 match llm
1182 .summarize(
1183 &document.body,
1184 SummarizeOptions {
1185 max_length: summary_length,
1186 },
1187 )
1188 .await
1189 {
1190 Ok(ai_summary) => {
1191 log::debug!(
1192 "Generated AI summary for '{}': {} characters",
1193 document.title,
1194 ai_summary.len()
1195 );
1196 document.description = Some(ai_summary);
1197 enhanced_count += 1;
1198 }
1199 Err(e) => {
1200 log::warn!(
1201 "Failed to generate AI summary for '{}': {}",
1202 document.title,
1203 e
1204 );
1205 error_count += 1;
1206 }
1207 }
1208 }
1209 }
1210
1211 log::info!(
1212 "LLM enhancement complete: {} enhanced, {} errors, {} skipped",
1213 enhanced_count,
1214 error_count,
1215 documents.len() - enhanced_count - error_count
1216 );
1217
1218 Ok(documents)
1219 }
1220
1221 #[allow(dead_code)] fn should_generate_ai_summary(&self, document: &Document) -> bool {
1227 if document.body.trim().len() < 200 {
1229 return false;
1230 }
1231
1232 if let Some(ref description) = document.description {
1234 if description.len() > 100 && !description.ends_with("...") {
1236 return false;
1237 }
1238 }
1239
1240 if document.body.len() > 8000 {
1242 return false;
1243 }
1244
1245 true
1247 }
1248
1249 async fn get_search_role(&self, search_query: &SearchQuery) -> Result<Role> {
1251 let search_role = match &search_query.role {
1252 Some(role) => role.clone(),
1253 None => self.config_state.get_default_role().await,
1254 };
1255
1256 log::debug!("Searching for role: {:?}", search_role);
1257 let Some(role) = self.config_state.get_role(&search_role).await else {
1258 return Err(ServiceError::Config(format!(
1259 "Role `{}` not found in config",
1260 search_role
1261 )));
1262 };
1263 Ok(role)
1264 }
1265
/// Returns `true` when `c` separates words, i.e. it is neither alphanumeric
/// nor an underscore.
fn is_word_boundary_char(c: char) -> bool {
    !(c == '_' || c.is_alphanumeric())
}
1271
1272 fn is_at_word_boundary(text: &str, start: usize, end: usize) -> bool {
1276 let before_ok = if start == 0 {
1277 true
1278 } else {
1279 text[..start]
1280 .chars()
1281 .last()
1282 .map(Self::is_word_boundary_char)
1283 .unwrap_or(true)
1284 };
1285
1286 let after_ok = if end >= text.len() {
1287 true
1288 } else {
1289 text[end..]
1290 .chars()
1291 .next()
1292 .map(Self::is_word_boundary_char)
1293 .unwrap_or(true)
1294 };
1295
1296 before_ok && after_ok
1297 }
1298
1299 fn term_matches_with_word_boundaries(term: &str, text: &str) -> bool {
1303 let mut start = 0;
1305 while let Some(pos) = text[start..].find(term) {
1306 let abs_start = start + pos;
1307 let abs_end = abs_start + term.len();
1308
1309 if Self::is_at_word_boundary(text, abs_start, abs_end) {
1310 return true;
1311 }
1312 start = abs_end;
1313 }
1314 false
1315 }
1316
1317 pub async fn apply_logical_operators_to_documents(
1319 &mut self,
1320 search_query: &SearchQuery,
1321 documents: Vec<Document>,
1322 ) -> Result<Vec<Document>> {
1323 use terraphim_types::LogicalOperator;
1324
1325 let all_terms = search_query.get_all_terms();
1326 let operator = search_query.get_operator();
1327
1328 let initial_doc_count = documents.len();
1329
1330 log::debug!(
1331 "Applying {:?} operator to {} documents with {} search terms",
1332 operator,
1333 initial_doc_count,
1334 all_terms.len()
1335 );
1336
1337 let terms_lower: Vec<String> = all_terms
1339 .iter()
1340 .map(|t| t.as_str().to_lowercase())
1341 .collect();
1342
1343 let filtered_docs: Vec<Document> = documents
1344 .into_iter()
1345 .filter(|doc| {
1346 let searchable_text = format!(
1348 "{} {} {}",
1349 doc.title.to_lowercase(),
1350 doc.body.to_lowercase(),
1351 doc.description
1352 .as_ref()
1353 .unwrap_or(&String::new())
1354 .to_lowercase()
1355 );
1356
1357 match operator {
1358 LogicalOperator::And => {
1359 terms_lower.iter().all(|term| {
1361 Self::term_matches_with_word_boundaries(term, &searchable_text)
1362 })
1363 }
1364 LogicalOperator::Or => {
1365 terms_lower.iter().any(|term| {
1367 Self::term_matches_with_word_boundaries(term, &searchable_text)
1368 })
1369 }
1370 }
1371 })
1372 .collect();
1373
1374 log::debug!(
1375 "Logical operator filtering: {} -> {} documents",
1376 initial_doc_count,
1377 filtered_docs.len()
1378 );
1379
1380 let combined_query_string = terms_lower.join(" ");
1382 let query = Query::new(&combined_query_string);
1383 let sorted_docs = score::sort_documents(&query, filtered_docs);
1384
1385 Ok(sorted_docs)
1386 }
1387
1388 pub async fn search_documents_selected_role(
1391 &mut self,
1392 search_term: &NormalizedTermValue,
1393 ) -> Result<Vec<Document>> {
1394 let role = self.config_state.get_selected_role().await;
1395 let documents = self
1396 .search(&SearchQuery {
1397 search_term: search_term.clone(),
1398 search_terms: None,
1399 operator: None,
1400 role: Some(role),
1401 skip: None,
1402 limit: None,
1403 layer: Layer::default(),
1404 include_pinned: false,
1405 })
1406 .await?;
1407 Ok(documents)
1408 }
1409
1410 pub async fn search(&mut self, search_query: &SearchQuery) -> Result<Vec<Document>> {
1412 log::debug!("Role for searching: {:?}", search_query.role);
1414 let role = self.get_search_role(search_query).await?;
1415
1416 log::trace!("Building index for search query: {:?}", search_query);
1417 let index: Index =
1418 terraphim_middleware::search_haystacks(self.config_state.clone(), search_query.clone())
1419 .await?;
1420
1421 match role.relevance_function {
1422 RelevanceFunction::TitleScorer => {
1423 log::debug!("Searching haystack with title scorer");
1424
1425 let documents = index.get_all_documents();
1426
1427 log::debug!("Sorting documents by relevance");
1428
1429 let documents = if search_query.is_multi_term_query() {
1430 self.apply_logical_operators_to_documents(search_query, documents)
1432 .await?
1433 } else {
1434 let query = Query::new(&search_query.search_term.to_string());
1436 score::sort_documents(&query, documents)
1437 };
1438 let total_length = documents.len();
1439 let mut docs_ranked = Vec::new();
1440 for (idx, doc) in documents.iter().enumerate() {
1441 let mut document: terraphim_types::Document = doc.clone();
1442 let rank = (total_length - idx).try_into().unwrap();
1443 document.rank = Some(rank);
1444
1445 if document.id.starts_with("http://") || document.id.starts_with("https://") {
1447 log::debug!(
1449 "Processing Atomic Data document '{}' (URL: {})",
1450 document.title,
1451 document.id
1452 );
1453
1454 let mut placeholder = Document {
1456 id: document.id.clone(),
1457 ..Default::default()
1458 };
1459 match placeholder.load().await {
1460 Ok(persisted_doc) => {
1461 log::debug!(
1463 "Found cached Atomic Data document '{}' in persistence",
1464 document.title
1465 );
1466 if let Some(better_description) = persisted_doc.description {
1467 document.description = Some(better_description);
1468 }
1469 if !persisted_doc.body.is_empty() && !role.terraphim_it {
1473 log::debug!(
1474 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
1475 document.title,
1476 role.name,
1477 role.terraphim_it
1478 );
1479 document.body = persisted_doc.body;
1480 } else if role.terraphim_it {
1481 log::debug!(
1482 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
1483 document.title,
1484 role.name
1485 );
1486 }
1487 }
1488 Err(_) => {
1489 log::debug!(
1491 "Caching Atomic Data document '{}' to persistence for future queries",
1492 document.title
1493 );
1494
1495 let doc_to_save = document.clone();
1497 tokio::spawn(async move {
1498 if let Err(e) = doc_to_save.save().await {
1499 log::warn!(
1500 "Failed to cache Atomic Data document '{}': {}",
1501 doc_to_save.title,
1502 e
1503 );
1504 } else {
1505 log::debug!(
1506 "Successfully cached Atomic Data document '{}'",
1507 doc_to_save.title
1508 );
1509 }
1510 });
1511 }
1512 }
1513 } else {
1514 let should_lookup_persistence = document
1516 .get_source_haystack()
1517 .and_then(|source| {
1518 role.haystacks
1519 .iter()
1520 .find(|haystack| haystack.location == *source)
1521 })
1522 .map(|haystack| haystack.fetch_content)
1523 .unwrap_or(true);
1524
1525 if !should_lookup_persistence {
1526 log::trace!(
1527 "Skipping persistence lookup for '{}' (haystack fetch_content=false)",
1528 document.title
1529 );
1530 } else {
1531 let mut placeholder = Document {
1532 id: document.id.clone(),
1533 ..Default::default()
1534 };
1535 if let Ok(persisted_doc) = placeholder.load().await {
1536 if let Some(better_description) = persisted_doc.description {
1537 log::debug!(
1538 "Replaced ripgrep description for '{}' with persistence description",
1539 document.title
1540 );
1541 document.description = Some(better_description);
1542 }
1543 } else {
1544 let normalized_id = normalize_filename_to_id(&document.title);
1547
1548 let mut normalized_placeholder = Document {
1549 id: normalized_id.clone(),
1550 ..Default::default()
1551 };
1552 if let Ok(persisted_doc) = normalized_placeholder.load().await {
1553 if let Some(better_description) = persisted_doc.description {
1554 log::debug!(
1555 "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
1556 document.title,
1557 normalized_id
1558 );
1559 document.description = Some(better_description);
1560 }
1561 } else {
1562 let normalized_id_with_md = format!("{}md", normalized_id);
1564 let mut md_placeholder = Document {
1565 id: normalized_id_with_md.clone(),
1566 ..Default::default()
1567 };
1568 if let Ok(persisted_doc) = md_placeholder.load().await {
1569 if let Some(better_description) = persisted_doc.description
1570 {
1571 log::debug!(
1572 "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
1573 document.title,
1574 normalized_id_with_md
1575 );
1576 document.description = Some(better_description);
1577 }
1578 } else {
1579 log::debug!(
1580 "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
1581 document.title,
1582 document.id,
1583 normalized_id,
1584 normalized_id_with_md
1585 );
1586 }
1587 }
1588 }
1589 }
1590 }
1591
1592 docs_ranked.push(document);
1593 }
1594
1595 #[cfg(feature = "openrouter")]
1598 if role.has_llm_config() && role.llm_auto_summarize {
1599 log::debug!(
1600 "Applying OpenRouter AI summarization to {} search results for role '{}'",
1601 docs_ranked.len(),
1602 role.name
1603 );
1604 docs_ranked = self
1605 .enhance_descriptions_with_ai(docs_ranked, &role)
1606 .await?;
1607 } else {
1608 eprintln!(
1610 "📋 Entering LLM AI summarization branch for role: {}",
1611 role.name
1612 );
1613 log::debug!(
1614 "Applying LLM AI summarization to {} search results for role '{}'",
1615 docs_ranked.len(),
1616 role.name
1617 );
1618 docs_ranked = self
1619 .enhance_descriptions_with_ai(docs_ranked, &role)
1620 .await?;
1621 }
1622
1623 if role.terraphim_it {
1625 log::info!(
1626 "🧠 Applying KG preprocessing to {} TerraphimGraph search results for role '{}'",
1627 docs_ranked.len(),
1628 role.name
1629 );
1630 let mut processed_docs = Vec::new();
1631 let mut total_kg_terms = 0;
1632 let mut docs_with_kg_links = 0;
1633
1634 for document in docs_ranked {
1635 let original_body_len = document.body.len();
1636 let processed_doc =
1637 self.preprocess_document_content(document, &role).await?;
1638
1639 let new_body_len = processed_doc.body.len();
1641 if new_body_len > original_body_len {
1642 docs_with_kg_links += 1;
1643 let estimated_links = (new_body_len - original_body_len) / 17;
1645 total_kg_terms += estimated_links;
1646 }
1647
1648 processed_docs.push(processed_doc);
1649 }
1650
1651 log::info!(
1652 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1653 processed_docs.len(),
1654 docs_with_kg_links,
1655 total_kg_terms
1656 );
1657 Ok(processed_docs)
1658 } else {
1659 Ok(docs_ranked)
1660 }
1661 }
1662 RelevanceFunction::BM25 => {
1663 log::debug!("Searching haystack with BM25 scorer");
1664
1665 let documents = index.get_all_documents();
1666
1667 log::debug!("Sorting documents by BM25 relevance");
1668
1669 let documents = if search_query.is_multi_term_query() {
1670 let filtered_docs = self
1672 .apply_logical_operators_to_documents(search_query, documents)
1673 .await?;
1674 let combined_query_string = search_query
1676 .get_all_terms()
1677 .iter()
1678 .map(|t| t.as_str())
1679 .collect::<Vec<_>>()
1680 .join(" ");
1681 let query =
1682 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25);
1683 score::sort_documents(&query, filtered_docs)
1684 } else {
1685 let query = Query::new(&search_query.search_term.to_string())
1687 .name_scorer(score::QueryScorer::BM25);
1688 score::sort_documents(&query, documents)
1689 };
1690 let total_length = documents.len();
1691 let mut docs_ranked = Vec::new();
1692 for (idx, doc) in documents.iter().enumerate() {
1693 let mut document: terraphim_types::Document = doc.clone();
1694 let rank = (total_length - idx).try_into().unwrap();
1695 document.rank = Some(rank);
1696 docs_ranked.push(document);
1697 }
1698
1699 #[cfg(feature = "openrouter")]
1701 if role.has_llm_config() && role.llm_auto_summarize {
1702 log::debug!(
1703 "Applying OpenRouter AI summarization to {} BM25 search results for role '{}'",
1704 docs_ranked.len(),
1705 role.name
1706 );
1707 docs_ranked = self
1708 .enhance_descriptions_with_ai(docs_ranked, &role)
1709 .await?;
1710 } else {
1711 log::debug!(
1713 "Applying LLM AI summarization to {} BM25 search results for role '{}'",
1714 docs_ranked.len(),
1715 role.name
1716 );
1717 docs_ranked = self
1718 .enhance_descriptions_with_ai(docs_ranked, &role)
1719 .await?;
1720 }
1721
1722 if role.terraphim_it {
1724 log::info!(
1725 "🧠 Applying KG preprocessing to {} BM25 search results for role '{}'",
1726 docs_ranked.len(),
1727 role.name
1728 );
1729 let mut processed_docs = Vec::new();
1730 let mut total_kg_terms = 0;
1731 let mut docs_with_kg_links = 0;
1732
1733 for document in docs_ranked {
1734 let original_body_len = document.body.len();
1735 let processed_doc =
1736 self.preprocess_document_content(document, &role).await?;
1737
1738 let new_body_len = processed_doc.body.len();
1740 if new_body_len > original_body_len {
1741 docs_with_kg_links += 1;
1742 let estimated_links = (new_body_len - original_body_len) / 17;
1743 total_kg_terms += estimated_links;
1744 }
1745
1746 processed_docs.push(processed_doc);
1747 }
1748
1749 log::info!(
1750 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1751 processed_docs.len(),
1752 docs_with_kg_links,
1753 total_kg_terms
1754 );
1755 Ok(processed_docs)
1756 } else {
1757 Ok(docs_ranked)
1758 }
1759 }
1760 RelevanceFunction::BM25F => {
1761 log::debug!("Searching haystack with BM25F scorer");
1762
1763 let documents = index.get_all_documents();
1764
1765 log::debug!("Sorting documents by BM25F relevance");
1766
1767 let documents = if search_query.is_multi_term_query() {
1768 let filtered_docs = self
1770 .apply_logical_operators_to_documents(search_query, documents)
1771 .await?;
1772 let combined_query_string = search_query
1774 .get_all_terms()
1775 .iter()
1776 .map(|t| t.as_str())
1777 .collect::<Vec<_>>()
1778 .join(" ");
1779 let query =
1780 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25F);
1781 score::sort_documents(&query, filtered_docs)
1782 } else {
1783 let query = Query::new(&search_query.search_term.to_string())
1785 .name_scorer(score::QueryScorer::BM25F);
1786 score::sort_documents(&query, documents)
1787 };
1788 let total_length = documents.len();
1789 let mut docs_ranked = Vec::new();
1790 for (idx, doc) in documents.iter().enumerate() {
1791 let mut document: terraphim_types::Document = doc.clone();
1792 let rank = (total_length - idx).try_into().unwrap();
1793 document.rank = Some(rank);
1794 docs_ranked.push(document);
1795 }
1796
1797 #[cfg(feature = "openrouter")]
1799 if role.has_llm_config() && role.llm_auto_summarize {
1800 log::debug!(
1801 "Applying OpenRouter AI summarization to {} BM25F search results for role '{}'",
1802 docs_ranked.len(),
1803 role.name
1804 );
1805 docs_ranked = self
1806 .enhance_descriptions_with_ai(docs_ranked, &role)
1807 .await?;
1808 } else {
1809 log::debug!(
1811 "Applying LLM AI summarization to {} BM25F search results for role '{}'",
1812 docs_ranked.len(),
1813 role.name
1814 );
1815 docs_ranked = self
1816 .enhance_descriptions_with_ai(docs_ranked, &role)
1817 .await?;
1818 }
1819
1820 if role.terraphim_it {
1822 log::info!(
1823 "🧠 Applying KG preprocessing to {} BM25F search results for role '{}'",
1824 docs_ranked.len(),
1825 role.name
1826 );
1827 let mut processed_docs = Vec::new();
1828 let mut total_kg_terms = 0;
1829 let mut docs_with_kg_links = 0;
1830
1831 for document in docs_ranked {
1832 let original_body_len = document.body.len();
1833 let processed_doc =
1834 self.preprocess_document_content(document, &role).await?;
1835
1836 let new_body_len = processed_doc.body.len();
1838 if new_body_len > original_body_len {
1839 docs_with_kg_links += 1;
1840 let estimated_links = (new_body_len - original_body_len) / 17;
1841 total_kg_terms += estimated_links;
1842 }
1843
1844 processed_docs.push(processed_doc);
1845 }
1846
1847 log::info!(
1848 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1849 processed_docs.len(),
1850 docs_with_kg_links,
1851 total_kg_terms
1852 );
1853 Ok(processed_docs)
1854 } else {
1855 Ok(docs_ranked)
1856 }
1857 }
1858 RelevanceFunction::BM25Plus => {
1859 log::debug!("Searching haystack with BM25Plus scorer");
1860
1861 let documents = index.get_all_documents();
1862
1863 log::debug!("Sorting documents by BM25Plus relevance");
1864
1865 let documents = if search_query.is_multi_term_query() {
1866 let filtered_docs = self
1868 .apply_logical_operators_to_documents(search_query, documents)
1869 .await?;
1870 let combined_query_string = search_query
1872 .get_all_terms()
1873 .iter()
1874 .map(|t| t.as_str())
1875 .collect::<Vec<_>>()
1876 .join(" ");
1877 let query = Query::new(&combined_query_string)
1878 .name_scorer(score::QueryScorer::BM25Plus);
1879 score::sort_documents(&query, filtered_docs)
1880 } else {
1881 let query = Query::new(&search_query.search_term.to_string())
1883 .name_scorer(score::QueryScorer::BM25Plus);
1884 score::sort_documents(&query, documents)
1885 };
1886 let total_length = documents.len();
1887 let mut docs_ranked = Vec::new();
1888 for (idx, doc) in documents.iter().enumerate() {
1889 let mut document: terraphim_types::Document = doc.clone();
1890 let rank = (total_length - idx).try_into().unwrap();
1891 document.rank = Some(rank);
1892 docs_ranked.push(document);
1893 }
1894
1895 #[cfg(feature = "openrouter")]
1897 if role.has_llm_config() && role.llm_auto_summarize {
1898 log::debug!(
1899 "Applying OpenRouter AI summarization to {} BM25Plus search results for role '{}'",
1900 docs_ranked.len(),
1901 role.name
1902 );
1903 docs_ranked = self
1904 .enhance_descriptions_with_ai(docs_ranked, &role)
1905 .await?;
1906 }
1907
1908 if role.terraphim_it {
1910 log::info!(
1911 "🧠 Applying KG preprocessing to {} BM25Plus search results for role '{}'",
1912 docs_ranked.len(),
1913 role.name
1914 );
1915 let mut processed_docs = Vec::new();
1916 let mut total_kg_terms = 0;
1917 let mut docs_with_kg_links = 0;
1918
1919 for document in docs_ranked {
1920 let original_body_len = document.body.len();
1921 let processed_doc =
1922 self.preprocess_document_content(document, &role).await?;
1923
1924 let new_body_len = processed_doc.body.len();
1926 if new_body_len > original_body_len {
1927 docs_with_kg_links += 1;
1928 let estimated_links = (new_body_len - original_body_len) / 17;
1929 total_kg_terms += estimated_links;
1930 }
1931
1932 processed_docs.push(processed_doc);
1933 }
1934
1935 log::info!(
1936 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1937 processed_docs.len(),
1938 docs_with_kg_links,
1939 total_kg_terms
1940 );
1941 Ok(processed_docs)
1942 } else {
1943 Ok(docs_ranked)
1944 }
1945 }
1946 RelevanceFunction::TerraphimGraph => {
1947 log::debug!("TerraphimGraph search initiated for role: {}", role.name);
1948 self.build_thesaurus(search_query).await?;
1949 let _thesaurus = self.ensure_thesaurus_loaded(&role.name).await?;
1950 let scored_index_docs: Vec<IndexedDocument> = self
1951 .config_state
1952 .search_indexed_documents(search_query, &role)
1953 .await;
1954
1955 log::debug!(
1956 "TerraphimGraph search found {} indexed documents",
1957 scored_index_docs.len()
1958 );
1959
1960 log::debug!("Ranking documents with thesaurus");
1963 let mut documents = index.get_documents(scored_index_docs.clone());
1964
1965 let all_haystack_docs = index.get_all_documents();
1968 log::debug!(
1969 "Found {} total documents from haystacks, checking which need indexing",
1970 all_haystack_docs.len()
1971 );
1972 let mut need_reindexing = false;
1973
1974 if let Some(rolegraph_sync) = self.config_state.roles.get(&role.name) {
1975 let mut rolegraph = rolegraph_sync.lock().await;
1976 let mut newly_indexed = 0;
1977
1978 for doc in &all_haystack_docs {
1979 if !rolegraph.has_document(&doc.id) && !doc.body.is_empty() {
1981 log::debug!(
1982 "Indexing new document '{}' into rolegraph for TerraphimGraph search",
1983 doc.id
1984 );
1985 rolegraph.insert_document(&doc.id, doc.clone());
1986
1987 drop(rolegraph);
1990 if let Err(e) = doc.save().await {
1991 log::warn!(
1992 "Failed to save document '{}' to persistence: {}",
1993 doc.id,
1994 e
1995 );
1996 } else {
1997 log::debug!(
1998 "Successfully saved document '{}' to persistence",
1999 doc.id
2000 );
2001 }
2002 rolegraph = rolegraph_sync.lock().await;
2004
2005 newly_indexed += 1;
2006 }
2007 }
2008
2009 if newly_indexed > 0 {
2010 log::info!(
2011 "✅ Indexed {} new documents into rolegraph for role '{}'",
2012 newly_indexed,
2013 role.name
2014 );
2015 log::debug!(
2016 "RoleGraph now has {} nodes, {} edges, {} documents",
2017 rolegraph.get_node_count(),
2018 rolegraph.get_edge_count(),
2019 rolegraph.get_document_count()
2020 );
2021 need_reindexing = true; }
2023 }
2024
2025 let mut documents_with_content = Vec::new();
2028
2029 for mut document in documents {
2030 if document.body.is_empty() {
2032 log::debug!(
2033 "Document '{}' has empty body, attempting to load from persistence",
2034 document.id
2035 );
2036
2037 let mut full_doc = Document::new(document.id.clone());
2039 match full_doc.load().await {
2040 Ok(loaded_doc) => {
2041 if !loaded_doc.body.is_empty() {
2042 log::info!(
2043 "✅ Loaded body content for document '{}' from persistence",
2044 document.id
2045 );
2046 document.body = loaded_doc.body.clone();
2047 if loaded_doc.description.is_some() {
2048 document.description = loaded_doc.description.clone();
2049 }
2050
2051 if let Some(rolegraph_sync) =
2053 self.config_state.roles.get(&role.name)
2054 {
2055 let mut rolegraph = rolegraph_sync.lock().await;
2056 rolegraph.insert_document(&document.id, loaded_doc);
2057 need_reindexing = true;
2058 log::debug!(
2059 "Re-indexed document '{}' into rolegraph with content",
2060 document.id
2061 );
2062 }
2063 } else {
2064 log::warn!(
2065 "Document '{}' still has empty body after loading from persistence",
2066 document.id
2067 );
2068 }
2069 }
2070 Err(e) => {
2071 log::warn!(
2072 "Failed to load document '{}' from persistence: {}",
2073 document.id,
2074 e
2075 );
2076
2077 if document.url.starts_with('/')
2079 || document.url.starts_with("docs/")
2080 {
2081 match tokio::fs::read_to_string(&document.url).await {
2082 Ok(content) => {
2083 log::info!(
2084 "✅ Loaded content for '{}' from file: {}",
2085 document.id,
2086 document.url
2087 );
2088 document.body = content.clone();
2089
2090 let full_doc = Document {
2092 id: document.id.clone(),
2093 title: document.title.clone(),
2094 body: content,
2095 url: document.url.clone(),
2096 description: document.description.clone(),
2097 summarization: document.summarization.clone(),
2098 stub: None,
2099 tags: document.tags.clone(),
2100 rank: document.rank,
2101 source_haystack: document.source_haystack.clone(),
2102 doc_type: terraphim_types::DocumentType::KgEntry,
2103 synonyms: None,
2104 route: None,
2105 priority: None,
2106 };
2107
2108 if let Err(e) = full_doc.save().await {
2110 log::warn!(
2111 "Failed to save document '{}' to persistence: {}",
2112 document.id,
2113 e
2114 );
2115 }
2116
2117 if let Some(rolegraph_sync) =
2119 self.config_state.roles.get(&role.name)
2120 {
2121 let mut rolegraph = rolegraph_sync.lock().await;
2122 rolegraph.insert_document(&document.id, full_doc);
2123 need_reindexing = true;
2124 log::debug!(
2125 "Re-indexed document '{}' into rolegraph from file",
2126 document.id
2127 );
2128 }
2129 }
2130 Err(file_e) => {
2131 log::warn!(
2132 "Failed to read file '{}' for document '{}': {}",
2133 document.url,
2134 document.id,
2135 file_e
2136 );
2137 }
2138 }
2139 }
2140 }
2141 }
2142 }
2143 documents_with_content.push(document);
2144 }
2145
2146 documents = documents_with_content;
2147
2148 if need_reindexing {
2149 log::info!("🔄 Re-running TerraphimGraph search after indexing new documents");
2150
2151 let updated_scored_docs: Vec<IndexedDocument> = self
2153 .config_state
2154 .search_indexed_documents(search_query, &role)
2155 .await;
2156
2157 if !updated_scored_docs.is_empty() {
2158 log::debug!(
2159 "✅ Updated rolegraph search found {} documents",
2160 updated_scored_docs.len()
2161 );
2162 let updated_documents = index.get_documents(updated_scored_docs);
2164 if !updated_documents.is_empty() {
2165 documents = updated_documents;
2166 }
2167 }
2168 }
2169
2170 if documents.is_empty() && !all_haystack_docs.is_empty() {
2171 log::info!(
2172 "TerraphimGraph returned no results for role '{}'; falling back to lexical haystack ranking",
2173 role.name
2174 );
2175 documents = if search_query.is_multi_term_query() {
2176 let filtered_docs = self
2177 .apply_logical_operators_to_documents(
2178 search_query,
2179 all_haystack_docs.clone(),
2180 )
2181 .await?;
2182 let combined_query_string = search_query
2183 .get_all_terms()
2184 .iter()
2185 .map(|t| t.as_str())
2186 .collect::<Vec<_>>()
2187 .join(" ");
2188 let query = Query::new(&combined_query_string);
2189 score::sort_documents(&query, filtered_docs)
2190 } else {
2191 let query = Query::new(&search_query.search_term.to_string());
2192 score::sort_documents(&query, all_haystack_docs.clone())
2193 };
2194 }
2195
2196 if !documents.is_empty() {
2198 log::debug!(
2199 "Applying TF-IDF scoring to {} documents for enhanced ranking",
2200 documents.len()
2201 );
2202
2203 use crate::score::bm25_additional::TFIDFScorer;
2204 let mut tfidf_scorer = TFIDFScorer::new();
2205 tfidf_scorer.initialize(&documents);
2206
2207 let query_text = &search_query.search_term.to_string();
2209 for document in &mut documents {
2210 let tfidf_score = tfidf_scorer.score(query_text, document);
2211 if let Some(rank) = document.rank {
2213 document.rank = Some(rank + (tfidf_score * 0.3) as u64);
2214 } else {
2216 document.rank = Some((tfidf_score * 10.0) as u64); }
2218 }
2219
2220 documents.sort_by(|a, b| b.rank.unwrap_or(0).cmp(&a.rank.unwrap_or(0)));
2222
2223 log::debug!("TF-IDF scoring applied successfully");
2224 }
2225
2226 for document in &mut documents {
2228 if document.id.starts_with("http://") || document.id.starts_with("https://") {
2229 log::debug!(
2231 "Processing Atomic Data document '{}' (URL: {})",
2232 document.title,
2233 document.id
2234 );
2235
2236 let mut placeholder = Document {
2238 id: document.id.clone(),
2239 ..Default::default()
2240 };
2241 match placeholder.load().await {
2242 Ok(persisted_doc) => {
2243 log::debug!(
2245 "Found cached Atomic Data document '{}' in persistence",
2246 document.title
2247 );
2248 if let Some(better_description) = persisted_doc.description {
2249 document.description = Some(better_description);
2250 }
2251 if !persisted_doc.body.is_empty() && !role.terraphim_it {
2255 log::debug!(
2256 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
2257 document.title,
2258 role.name,
2259 role.terraphim_it
2260 );
2261 document.body = persisted_doc.body;
2262 } else if role.terraphim_it {
2263 log::debug!(
2264 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
2265 document.title,
2266 role.name
2267 );
2268 }
2269 }
2270 Err(_) => {
2271 log::debug!(
2273 "Caching Atomic Data document '{}' to persistence for future queries",
2274 document.title
2275 );
2276
2277 let doc_to_save = document.clone();
2279 tokio::spawn(async move {
2280 if let Err(e) = doc_to_save.save().await {
2281 log::warn!(
2282 "Failed to cache Atomic Data document '{}': {}",
2283 doc_to_save.title,
2284 e
2285 );
2286 } else {
2287 log::debug!(
2288 "Successfully cached Atomic Data document '{}'",
2289 doc_to_save.title
2290 );
2291 }
2292 });
2293 }
2294 }
2295 } else {
2296 let mut placeholder = Document {
2298 id: document.id.clone(),
2299 ..Default::default()
2300 };
2301 if let Ok(persisted_doc) = placeholder.load().await {
2302 if let Some(better_description) = persisted_doc.description {
2303 log::debug!(
2304 "Replaced ripgrep description for '{}' with persistence description",
2305 document.title
2306 );
2307 document.description = Some(better_description);
2308 }
2309 } else {
2310 let normalized_id = normalize_filename_to_id(&document.title);
2313
2314 let mut normalized_placeholder = Document {
2315 id: normalized_id.clone(),
2316 ..Default::default()
2317 };
2318 if let Ok(persisted_doc) = normalized_placeholder.load().await {
2319 if let Some(better_description) = persisted_doc.description {
2320 log::debug!(
2321 "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
2322 document.title,
2323 normalized_id
2324 );
2325 document.description = Some(better_description);
2326 }
2327 } else {
2328 let normalized_id_with_md = format!("{}md", normalized_id);
2330 let mut md_placeholder = Document {
2331 id: normalized_id_with_md.clone(),
2332 ..Default::default()
2333 };
2334 if let Ok(persisted_doc) = md_placeholder.load().await {
2335 if let Some(better_description) = persisted_doc.description {
2336 log::debug!(
2337 "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
2338 document.title,
2339 normalized_id_with_md
2340 );
2341 document.description = Some(better_description);
2342 }
2343 } else {
2344 log::debug!(
2345 "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
2346 document.title,
2347 document.id,
2348 normalized_id,
2349 normalized_id_with_md
2350 );
2351 }
2352 }
2353 }
2354 }
2355 }
2356
2357 #[cfg(feature = "openrouter")]
2359 if role.has_llm_config() {
2360 log::debug!(
2361 "Applying OpenRouter AI summarization to {} search results for role '{}'",
2362 documents.len(),
2363 role.name
2364 );
2365 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2366 } else {
2367 log::debug!(
2369 "Applying LLM AI summarization to {} search results for role '{}'",
2370 documents.len(),
2371 role.name
2372 );
2373 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2374 }
2375
2376 if role.terraphim_it {
2378 log::debug!(
2379 "Applying KG preprocessing to {} search results for role '{}'",
2380 documents.len(),
2381 role.name
2382 );
2383 let mut processed_docs = Vec::new();
2384 for document in documents {
2385 let processed_doc =
2386 self.preprocess_document_content(document, &role).await?;
2387 processed_docs.push(processed_doc);
2388 }
2389 Ok(processed_docs)
2390 } else {
2391 Ok(documents)
2392 }
2393 }
2394 }
2395 }
2396
/// Returns `true` when `id` is exactly 16 bytes long and every byte is an
/// ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hash_based_id(id: &str) -> bool {
    const HASH_ID_LEN: usize = 16;

    let raw = id.as_bytes();
    if raw.len() != HASH_ID_LEN {
        return false;
    }
    // Any non-ASCII byte fails the hex check, so multi-byte characters are rejected.
    raw.iter().all(u8::is_ascii_hexdigit)
}
2401
2402 pub async fn find_documents_for_kg_term(
2413 &mut self,
2414 role_name: &RoleName,
2415 term: &str,
2416 ) -> Result<Vec<Document>> {
2417 log::debug!(
2418 "Finding documents for KG term '{}' in role '{}'",
2419 term,
2420 role_name
2421 );
2422
2423 let thesaurus = self.ensure_thesaurus_loaded(role_name).await?;
2425
2426 let role = self.config_state.get_role(role_name).await.ok_or_else(|| {
2428 ServiceError::Config(format!("Role '{}' not found in config", role_name))
2429 })?;
2430
2431 let mut documents = Vec::new();
2432
2433 if let Some(kg_config) = &role.kg {
2437 log::debug!("Found KG config for role");
2438 if let Some(kg_local) = &kg_config.knowledge_graph_local {
2439 let mut potential_concepts = vec![term.to_string()];
2440
2441 log::debug!("Checking thesaurus for term '{}'", term);
2443
2444 let normalized_search_term =
2446 terraphim_types::NormalizedTermValue::new(term.to_string());
2447
2448 if let Some(root_concept) = thesaurus.get(&normalized_search_term) {
2450 log::debug!("Found root concept for '{}': {:?}", term, root_concept);
2451
2452 let root_concept_name = root_concept.value.as_str();
2454
2455 let concept_name = if let Some(url) = &root_concept.url {
2457 url.split('/')
2458 .next_back()
2459 .and_then(|s| s.strip_suffix(".md"))
2460 .unwrap_or(root_concept_name)
2461 } else {
2462 root_concept_name
2463 };
2464
2465 if !potential_concepts.contains(&concept_name.to_string()) {
2466 potential_concepts.push(concept_name.to_string());
2467 log::debug!(
2468 "Added concept from thesaurus: {} (root: {})",
2469 concept_name,
2470 root_concept_name
2471 );
2472 }
2473 } else {
2474 log::debug!("No direct mapping found for '{}' in thesaurus", term);
2475 }
2476
2477 log::debug!(
2478 "Trying {} potential concepts: {:?}",
2479 potential_concepts.len(),
2480 potential_concepts
2481 );
2482
2483 for concept in potential_concepts {
2485 let potential_kg_file = kg_local.path.join(format!("{}.md", concept));
2486 log::debug!("Looking for KG definition file: {:?}", potential_kg_file);
2487
2488 if potential_kg_file.exists() {
2489 log::info!("Found KG definition file: {:?}", potential_kg_file);
2490
2491 let file_path = potential_kg_file.to_string_lossy().to_string();
2493 if documents.iter().any(|d: &Document| d.url == file_path) {
2494 log::debug!("Skipping duplicate KG document: {}", file_path);
2495 continue;
2496 }
2497
2498 match std::fs::read_to_string(&potential_kg_file) {
2501 Ok(content) => {
2502 let mut kg_doc =
2503 Document::new(potential_kg_file.to_string_lossy().to_string());
2504 kg_doc.url = potential_kg_file.to_string_lossy().to_string();
2505 kg_doc.body = content.clone();
2506
2507 let title = content
2509 .lines()
2510 .find(|line| line.starts_with("# "))
2511 .map(|line| line.trim_start_matches("# ").trim())
2512 .unwrap_or(&concept)
2513 .to_string();
2514 kg_doc.title = title;
2515
2516 log::debug!(
2517 "Successfully loaded KG definition document: {}",
2518 kg_doc.title
2519 );
2520 documents.push(kg_doc);
2521
2522 break;
2524 }
2525 Err(e) => {
2526 log::warn!(
2527 "Failed to read KG definition file '{}': {}",
2528 potential_kg_file.display(),
2529 e
2530 );
2531 }
2532 }
2533 } else {
2534 log::debug!("KG definition file not found: {:?}", potential_kg_file);
2535 }
2536 }
2537 } else {
2538 log::debug!("No KG local config found");
2539 }
2540 } else {
2541 log::debug!("No KG config found for role");
2542 }
2543
2544 let rolegraph_sync = self
2546 .config_state
2547 .roles
2548 .get(role_name)
2549 .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))?;
2550
2551 let rolegraph = rolegraph_sync.lock().await;
2552 let document_ids = rolegraph.find_document_ids_for_term(term);
2553 drop(rolegraph); log::debug!(
2556 "Found {} document IDs from rolegraph for term '{}'",
2557 document_ids.len(),
2558 term
2559 );
2560
2561 for doc_id in &document_ids {
2563 if documents
2565 .iter()
2566 .any(|d| d.id == *doc_id || d.url == *doc_id)
2567 {
2568 log::debug!("Skipping duplicate document from rolegraph: {}", doc_id);
2569 continue;
2570 }
2571
2572 if doc_id.starts_with("http://") || doc_id.starts_with("https://") {
2575 log::debug!("Loading Atomic Data document '{}' from persistence", doc_id);
2577 let mut placeholder = Document {
2578 id: doc_id.clone(),
2579 ..Default::default()
2580 };
2581 match placeholder.load().await {
2582 Ok(loaded_doc) => {
2583 log::debug!(
2584 "Found cached Atomic Data document '{}' in persistence",
2585 doc_id
2586 );
2587 documents.push(loaded_doc);
2588 }
2589 Err(_) => {
2590 log::warn!(
2591 "Atomic Data document '{}' not found in persistence - this may indicate the document hasn't been cached yet",
2592 doc_id
2593 );
2594 }
2597 }
2598 } else {
2599 let mut doc = Document::new(doc_id.clone());
2601 match doc.load().await {
2602 Ok(loaded_doc) => {
2603 documents.push(loaded_doc);
2604 log::trace!("Successfully loaded local document: {}", doc_id);
2605 }
2606 Err(e) => {
2607 log::warn!("Failed to load local document '{}': {}", doc_id, e);
2608
2609 if Self::is_hash_based_id(doc_id) {
2611 log::debug!(
2612 "Document ID '{}' appears to be hash-based (legacy document), skipping for now",
2613 doc_id
2614 );
2615 log::info!(
2616 "💡 Hash-based document IDs are deprecated. This document will be re-indexed with normalized IDs on next haystack search."
2617 );
2618 }
2621
2622 }
2624 }
2625 }
2626 }
2627
2628 if role.terraphim_it {
2630 log::info!(
2631 "🧠 Applying KG preprocessing to {} KG term documents for role '{}' (terraphim_it enabled)",
2632 documents.len(),
2633 role_name
2634 );
2635 let mut processed_documents = Vec::new();
2636 let mut total_kg_terms = 0;
2637 let mut docs_with_kg_links = 0;
2638
2639 for document in documents {
2640 let original_body_len = document.body.len();
2641 let processed_doc = self.preprocess_document_content(document, &role).await?;
2642
2643 let new_body_len = processed_doc.body.len();
2645 if new_body_len > original_body_len {
2646 docs_with_kg_links += 1;
2647 let estimated_links = (new_body_len - original_body_len) / 17;
2648 total_kg_terms += estimated_links;
2649 }
2650
2651 processed_documents.push(processed_doc);
2652 }
2653
2654 log::info!(
2655 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2656 processed_documents.len(),
2657 docs_with_kg_links,
2658 total_kg_terms
2659 );
2660 documents = processed_documents;
2661 } else {
2662 log::info!(
2663 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing for {} documents",
2664 role_name,
2665 documents.len()
2666 );
2667 }
2668
2669 let total_length = documents.len();
2672 for (idx, doc) in documents.iter_mut().enumerate() {
2673 let rank = (total_length - idx) as u64;
2674 doc.rank = Some(rank);
2675 log::trace!("Assigned rank {} to document '{}'", rank, doc.title);
2676 }
2677
2678 log::debug!(
2679 "Successfully loaded and processed {} documents for term '{}', ranks assigned from {} to 1",
2680 documents.len(),
2681 term,
2682 total_length
2683 );
2684 Ok(documents)
2685 }
2686
#[cfg(feature = "openrouter")]
/// Generates a summary of `document`'s body through the OpenRouter service.
///
/// Only compiled when the `openrouter` feature is enabled.
///
/// # Arguments
/// * `document` - Document whose `body` is summarized.
/// * `api_key` - Key passed to `OpenRouterService::new`.
/// * `model` - Model name passed to `OpenRouterService::new`.
/// * `max_length` - Limit forwarded unchanged to `generate_summary`.
///
/// # Errors
/// * `ServiceError::OpenRouter` if the client cannot be constructed or the
///   summary request fails.
/// * `ServiceError::Config` if the document body is empty or whitespace-only.
pub async fn generate_document_summary(
    &self,
    document: &Document,
    api_key: &str,
    model: &str,
    max_length: usize,
) -> Result<String> {
    use crate::openrouter::OpenRouterService;

    log::debug!(
        "Generating summary for document '{}' using model '{}'",
        document.id,
        model
    );

    // The client is constructed before the body is inspected, so a
    // construction error takes precedence over the empty-body error.
    let client = match OpenRouterService::new(api_key, model) {
        Ok(client) => client,
        Err(err) => return Err(ServiceError::OpenRouter(err)),
    };

    let body = &document.body;
    if body.trim().is_empty() {
        return Err(ServiceError::Config(
            "Document body is empty, cannot generate summary".to_string(),
        ));
    }

    let summary = match client.generate_summary(body, max_length).await {
        Ok(text) => text,
        Err(err) => return Err(ServiceError::OpenRouter(err)),
    };

    log::info!(
        "Generated {}-character summary for document '{}' using model '{}'",
        summary.len(),
        document.id,
        model
    );

    Ok(summary)
}
2747
2748 #[cfg(not(feature = "openrouter"))]
2750 pub async fn generate_document_summary(
2751 &self,
2752 _document: &Document,
2753 _api_key: &str,
2754 _model: &str,
2755 _max_length: usize,
2756 ) -> Result<String> {
2757 Err(ServiceError::Config(
2758 "OpenRouter feature not enabled during compilation".to_string(),
2759 ))
2760 }
2761
2762 pub async fn fetch_config(&self) -> terraphim_config::Config {
2764 let current_config = self.config_state.config.lock().await;
2765 current_config.clone()
2766 }
2767
#[cfg(test)]
/// Test-only helper: returns a clone of the role stored under `role_name`.
///
/// # Errors
/// Returns `ServiceError::Config` when the configuration has no such role.
pub async fn get_role(&self, role_name: &RoleName) -> Result<Role> {
    let guard = self.config_state.config.lock().await;
    match guard.roles.get(role_name) {
        Some(role) => Ok(role.clone()),
        None => Err(ServiceError::Config(format!(
            "Role '{}' not found",
            role_name
        ))),
    }
}
2778
2779 pub async fn update_config(
2784 &self,
2785 config: terraphim_config::Config,
2786 ) -> Result<terraphim_config::Config> {
2787 let mut current_config = self.config_state.config.lock().await;
2788 *current_config = config.clone();
2789 current_config.save().await?;
2790 log::info!("Config updated");
2791 Ok(config)
2792 }
2793
2794 pub async fn update_selected_role(
2797 &self,
2798 role_name: terraphim_types::RoleName,
2799 ) -> Result<terraphim_config::Config> {
2800 let mut current_config = self.config_state.config.lock().await;
2801
2802 if !current_config.roles.contains_key(&role_name) {
2804 return Err(ServiceError::Config(format!(
2805 "Role `{}` not found in config",
2806 role_name
2807 )));
2808 }
2809
2810 current_config.selected_role = role_name.clone();
2811 current_config.save().await?;
2812
2813 if let Some(role) = current_config.roles.get(&role_name) {
2815 if role.terraphim_it {
2816 log::info!(
2817 "🎯 Selected role '{}' → terraphim_it: ✅ ENABLED (KG preprocessing will be applied)",
2818 role_name
2819 );
2820 if role.kg.is_some() {
2821 log::info!("📚 KG configuration: Available for role '{}'", role_name);
2822 } else {
2823 log::warn!(
2824 "⚠️ KG configuration: Missing for role '{}' (terraphim_it enabled but no KG)",
2825 role_name
2826 );
2827 }
2828 } else {
2829 log::info!(
2830 "🎯 Selected role '{}' → terraphim_it: ❌ DISABLED (KG preprocessing skipped)",
2831 role_name
2832 );
2833 }
2834 } else {
2835 log::info!("🎯 Selected role updated to '{}'", role_name);
2836 }
2837
2838 Ok(current_config.clone())
2839 }
2840
2841 fn highlight_search_terms(content: &str, search_query: &SearchQuery) -> String {
2846 let mut highlighted_content = content.to_string();
2847
2848 let terms = search_query.get_all_terms();
2850
2851 let mut sorted_terms: Vec<&str> = terms.iter().map(|t| t.as_str()).collect();
2853 sorted_terms.sort_by_key(|term| std::cmp::Reverse(term.len()));
2854
2855 for term in sorted_terms {
2856 if term.trim().is_empty() {
2857 continue;
2858 }
2859
2860 let escaped_term = regex::escape(term);
2863
2864 if let Ok(regex) = regex::RegexBuilder::new(&escaped_term)
2865 .case_insensitive(true)
2866 .build()
2867 {
2868 let highlight_open = "<mark class=\"search-highlight\">";
2871 let highlight_close = "</mark>";
2872
2873 highlighted_content = regex
2874 .replace_all(
2875 &highlighted_content,
2876 format!("{}{}{}", highlight_open, "$0", highlight_close),
2877 )
2878 .to_string();
2879 }
2880 }
2881
2882 highlighted_content
2883 }
2884}
2885
2886#[cfg(test)]
2887mod tests {
2888 use super::*;
2889 use std::path::PathBuf;
2890 use terraphim_config::ConfigBuilder;
2891 use terraphim_types::NormalizedTermValue;
2892
2893 #[tokio::test]
2894 async fn test_get_config() {
2895 let mut config = ConfigBuilder::new()
2896 .build_default_desktop()
2897 .build()
2898 .unwrap();
2899 let config_state = ConfigState::new(&mut config).await.unwrap();
2900 let service = TerraphimService::new(config_state);
2901 let fetched_config = service.fetch_config().await;
2902 assert_eq!(fetched_config.id, terraphim_config::ConfigId::Desktop);
2903 }
2904
2905 #[tokio::test]
2906 async fn test_search_documents_selected_role() {
2907 let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
2909 let kg_path = project_root.join("docs/src/kg");
2910 if !kg_path.exists() {
2911 println!("Skipping test: KG directory not found at {:?}", kg_path);
2912 return;
2913 }
2914
2915 let mut config = ConfigBuilder::new()
2916 .build_default_desktop()
2917 .build()
2918 .unwrap();
2919 let config_state = match ConfigState::new(&mut config).await {
2920 Ok(state) => state,
2921 Err(e) => {
2922 println!("Skipping test: Failed to create config state: {:?}", e);
2923 return;
2924 }
2925 };
2926 let mut service = TerraphimService::new(config_state);
2927 let search_term = NormalizedTermValue::new("terraphim".to_string());
2928 let documents = match service.search_documents_selected_role(&search_term).await {
2929 Ok(docs) => docs,
2930 Err(e) => {
2931 println!(
2932 "Skipping test: Search failed (expected in some environments): {:?}",
2933 e
2934 );
2935 return;
2936 }
2937 };
2938 assert!(documents.is_empty() || !documents.is_empty()); }
2940
2941 #[tokio::test]
2942 async fn test_ensure_thesaurus_loaded_terraphim_engineer() {
2943 let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
2945 let kg_path = project_root.join("docs/src/kg");
2946
2947 if !kg_path.exists() {
2949 println!("⚠️ KG directory not found at {:?}, skipping test", kg_path);
2950 return;
2951 }
2952
2953 let mut config = ConfigBuilder::new()
2954 .build_default_desktop()
2955 .build()
2956 .unwrap();
2957
2958 if let Some(terr_eng_role) = config.roles.get_mut(&"Terraphim Engineer".into()) {
2960 if let Some(kg) = &mut terr_eng_role.kg {
2961 if let Some(kg_local) = &mut kg.knowledge_graph_local {
2962 kg_local.path = kg_path;
2963 }
2964 }
2965 }
2966
2967 let config_state = ConfigState::new(&mut config).await.unwrap();
2968 let mut service = TerraphimService::new(config_state);
2969
2970 let role_name = RoleName::new("Terraphim Engineer");
2971 let thesaurus_result = service.ensure_thesaurus_loaded(&role_name).await;
2972
2973 match thesaurus_result {
2974 Ok(thesaurus) => {
2975 println!(
2976 "✅ Successfully loaded thesaurus with {} entries",
2977 thesaurus.len()
2978 );
2979 assert!(!thesaurus.is_empty(), "Thesaurus should not be empty");
2981
2982 let has_terraphim = (&thesaurus)
2984 .into_iter()
2985 .any(|(term, _)| term.as_str().to_lowercase().contains("terraphim"));
2986 let has_graph = (&thesaurus)
2987 .into_iter()
2988 .any(|(term, _)| term.as_str().to_lowercase().contains("graph"));
2989
2990 println!(" Contains 'terraphim': {}", has_terraphim);
2991 println!(" Contains 'graph': {}", has_graph);
2992
2993 assert!(
2995 has_terraphim || has_graph,
2996 "Thesaurus should contain expected terms"
2997 );
2998 }
2999 Err(e) => {
3000 println!("❌ Failed to load thesaurus: {:?}", e);
3001 }
3004 }
3005 }
3006
3007 #[tokio::test]
3008 #[ignore = "Requires local KG fixtures at ~/.terraphim/kg"]
3009 async fn test_config_building_with_local_kg() {
3010 let mut config = ConfigBuilder::new()
3012 .build_default_desktop()
3013 .build()
3014 .unwrap();
3015 let config_state_result = ConfigState::new(&mut config).await;
3016
3017 match config_state_result {
3018 Ok(config_state) => {
3019 println!("✅ Successfully built config state");
3020 assert!(
3022 !config_state.roles.is_empty(),
3023 "Config state should have roles"
3024 );
3025
3026 let terraphim_engineer_role = RoleName::new("Terraphim Engineer");
3028 let has_terraphim_engineer =
3029 config_state.roles.contains_key(&terraphim_engineer_role);
3030 println!(" Has Terraphim Engineer role: {}", has_terraphim_engineer);
3031
3032 assert!(
3034 has_terraphim_engineer,
3035 "Terraphim Engineer role should exist"
3036 );
3037 }
3038 Err(e) => {
3039 println!("❌ Failed to build config state: {:?}", e);
3040 }
3043 }
3044 }
3045
3046 #[tokio::test]
3047 async fn test_atomic_data_persistence_skip() {
3048 use ahash::AHashMap;
3049 use terraphim_config::{Config, Haystack, Role, ServiceType};
3050 use terraphim_persistence::DeviceStorage;
3051 use terraphim_types::{NormalizedTermValue, RoleName, SearchQuery};
3052
3053 DeviceStorage::init_memory_only().await.unwrap();
3055
3056 let mut config = Config::default();
3058 let role_name = RoleName::new("test_role");
3059 let role = Role {
3060 shortname: None,
3061 name: "test_role".into(),
3062 haystacks: vec![Haystack {
3063 location: "test".to_string(),
3064 service: ServiceType::Ripgrep,
3065 read_only: false,
3066 atomic_server_secret: None,
3067 extra_parameters: std::collections::HashMap::new(),
3068 fetch_content: false,
3069 }],
3070 kg: None,
3071 terraphim_it: false,
3072 theme: "default".to_string(),
3073 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3074 llm_enabled: false,
3075 llm_api_key: None,
3076 llm_model: None,
3077 llm_auto_summarize: false,
3078 llm_chat_enabled: false,
3079 llm_chat_system_prompt: None,
3080 llm_chat_model: None,
3081 llm_context_window: None,
3082 extra: AHashMap::new(),
3083 llm_router_enabled: false,
3084 llm_router_config: None,
3085 };
3086 config.roles.insert(role_name.clone(), role);
3087
3088 let config_state = ConfigState::new(&mut config).await.unwrap();
3089 let mut service = TerraphimService::new(config_state);
3090
3091 let search_query = SearchQuery {
3093 search_term: NormalizedTermValue::new("test".to_string()),
3094 search_terms: None,
3095 operator: None,
3096 limit: Some(10),
3097 skip: None,
3098 role: Some(role_name),
3099 layer: Layer::default(),
3100 include_pinned: false,
3101 };
3102
3103 let result = service.search(&search_query).await;
3106
3107 assert!(result.is_ok(), "Search should complete without errors");
3110 }
3111
3112 #[tokio::test]
3113 async fn test_atomic_data_caching() {
3114 use ahash::AHashMap;
3115 use terraphim_config::{Config, Haystack, Role, ServiceType};
3116 use terraphim_persistence::DeviceStorage;
3117 use terraphim_types::{Document, NormalizedTermValue, RoleName, SearchQuery};
3118
3119 DeviceStorage::init_memory_only().await.unwrap();
3121
3122 let mut config = Config::default();
3124 let role_name = RoleName::new("test_role");
3125 let role = Role {
3126 shortname: None,
3127 name: "test_role".into(),
3128 haystacks: vec![Haystack {
3129 location: "test".to_string(),
3130 service: ServiceType::Ripgrep,
3131 read_only: false,
3132 atomic_server_secret: None,
3133 extra_parameters: std::collections::HashMap::new(),
3134 fetch_content: false,
3135 }],
3136 kg: None,
3137 terraphim_it: false,
3138 theme: "default".to_string(),
3139 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3140 llm_enabled: false,
3141 llm_api_key: None,
3142 llm_model: None,
3143 llm_auto_summarize: false,
3144 llm_chat_enabled: false,
3145 llm_chat_system_prompt: None,
3146 llm_chat_model: None,
3147 llm_context_window: None,
3148 extra: AHashMap::new(),
3149 llm_router_enabled: false,
3150 llm_router_config: None,
3151 };
3152 config.roles.insert(role_name.clone(), role);
3153
3154 let config_state = ConfigState::new(&mut config).await.unwrap();
3155 let mut service = TerraphimService::new(config_state);
3156
3157 let atomic_doc = Document {
3159 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3160 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3161 title: "Requested Loan Amount ($)".to_string(),
3162 body: "Form field for Requested Loan Amount ($)".to_string(),
3163 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3164 summarization: None,
3165 stub: None,
3166 tags: None,
3167 rank: None,
3168 source_haystack: None,
3169 doc_type: terraphim_types::DocumentType::KgEntry,
3170 synonyms: None,
3171 route: None,
3172 priority: None,
3173 };
3174
3175 log::info!("Testing Atomic Data document caching...");
3177 match atomic_doc.save().await {
3178 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3179 Err(e) => {
3180 log::error!("❌ Failed to save Atomic Data document: {}", e);
3181 panic!("Atomic Data document save failed");
3182 }
3183 }
3184
3185 let mut placeholder = Document {
3187 id: atomic_doc.id.clone(),
3188 ..Default::default()
3189 };
3190 match placeholder.load().await {
3191 Ok(loaded_doc) => {
3192 log::info!("✅ Successfully loaded Atomic Data document from persistence");
3193 assert_eq!(loaded_doc.title, atomic_doc.title);
3194 assert_eq!(loaded_doc.body, atomic_doc.body);
3195 assert_eq!(loaded_doc.description, atomic_doc.description);
3196 }
3197 Err(e) => {
3198 log::error!(
3199 "❌ Failed to load Atomic Data document from persistence: {}",
3200 e
3201 );
3202 panic!("Atomic Data document load failed");
3203 }
3204 }
3205
3206 let search_query = SearchQuery {
3208 search_term: NormalizedTermValue::new("test".to_string()),
3209 search_terms: None,
3210 operator: None,
3211 limit: Some(10),
3212 skip: None,
3213 role: Some(role_name),
3214 layer: Layer::default(),
3215 include_pinned: false,
3216 };
3217
3218 let result = service.search(&search_query).await;
3219 assert!(result.is_ok(), "Search should complete without errors");
3220
3221 log::info!("✅ All Atomic Data caching tests passed!");
3222 }
3223
3224 #[tokio::test]
3225 #[ignore = "Requires local KG fixtures at 'test' directory"]
3226 async fn test_kg_term_search_with_atomic_data() {
3227 use ahash::AHashMap;
3228 use std::path::PathBuf;
3229 use terraphim_config::{
3230 Config, Haystack, KnowledgeGraph, KnowledgeGraphLocal, Role, ServiceType,
3231 };
3232 use terraphim_persistence::DeviceStorage;
3233 use terraphim_types::{Document, KnowledgeGraphInputType, RoleName};
3234
3235 DeviceStorage::init_memory_only().await.unwrap();
3237
3238 let mut config = Config::default();
3240 let role_name = RoleName::new("test_kg_role");
3241 let role = Role {
3242 shortname: None,
3243 name: "test_kg_role".into(),
3244 haystacks: vec![Haystack {
3245 location: "test".to_string(),
3246 service: ServiceType::Ripgrep,
3247 read_only: false,
3248 atomic_server_secret: None,
3249 extra_parameters: std::collections::HashMap::new(),
3250 fetch_content: false,
3251 }],
3252 kg: Some(KnowledgeGraph {
3253 automata_path: None,
3254 knowledge_graph_local: Some(KnowledgeGraphLocal {
3255 input_type: KnowledgeGraphInputType::Markdown,
3256 path: PathBuf::from("test"),
3257 }),
3258 public: true,
3259 publish: true,
3260 }),
3261 terraphim_it: true,
3262 theme: "default".to_string(),
3263 relevance_function: terraphim_types::RelevanceFunction::TerraphimGraph,
3264 llm_enabled: false,
3265 llm_api_key: None,
3266 llm_model: None,
3267 llm_auto_summarize: false,
3268 llm_chat_enabled: false,
3269 llm_chat_system_prompt: None,
3270 llm_chat_model: None,
3271 llm_context_window: None,
3272 extra: AHashMap::new(),
3273 llm_router_enabled: false,
3274 llm_router_config: None,
3275 };
3276 config.roles.insert(role_name.clone(), role);
3277
3278 let config_state = ConfigState::new(&mut config).await.unwrap();
3279 let mut service = TerraphimService::new(config_state);
3280
3281 let atomic_doc = Document {
3283 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3284 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3285 title: "Requested Loan Amount ($)".to_string(),
3286 body: "Form field for Requested Loan Amount ($)".to_string(),
3287 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3288 summarization: None,
3289 stub: None,
3290 tags: None,
3291 rank: None,
3292 source_haystack: None,
3293 doc_type: terraphim_types::DocumentType::KgEntry,
3294 synonyms: None,
3295 route: None,
3296 priority: None,
3297 };
3298
3299 log::info!("Testing KG term search with Atomic Data documents...");
3301 match atomic_doc.save().await {
3302 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3303 Err(e) => {
3304 log::error!("❌ Failed to save Atomic Data document: {}", e);
3305 panic!("Atomic Data document save failed");
3306 }
3307 }
3308
3309 let result = service.find_documents_for_kg_term(&role_name, "test").await;
3313
3314 assert!(
3317 result.is_ok(),
3318 "find_documents_for_kg_term should complete without errors"
3319 );
3320
3321 let documents = result.unwrap();
3322 log::info!(
3323 "✅ KG term search completed successfully, found {} documents",
3324 documents.len()
3325 );
3326
3327 let atomic_doc_id = "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount";
3330 let mut placeholder = Document {
3331 id: atomic_doc_id.to_string(),
3332 ..Default::default()
3333 };
3334
3335 match placeholder.load().await {
3336 Ok(loaded_doc) => {
3337 log::info!(
3338 "✅ Successfully loaded Atomic Data document from persistence in KG term search context"
3339 );
3340 assert_eq!(loaded_doc.title, atomic_doc.title);
3341 assert_eq!(loaded_doc.body, atomic_doc.body);
3342 }
3343 Err(e) => {
3344 log::error!(
3345 "❌ Failed to load Atomic Data document in KG term search context: {}",
3346 e
3347 );
3348 panic!("Atomic Data document load failed in KG term search context");
3349 }
3350 }
3351
3352 log::info!("✅ All KG term search with Atomic Data tests passed!");
3353 }
3354
3355 #[tokio::test]
3356 async fn test_kg_term_search_rank_assignment() -> Result<()> {
3357 use ahash::AHashMap;
3358 use terraphim_config::{Config, Haystack, Role, ServiceType};
3359 use terraphim_persistence::DeviceStorage;
3360 use terraphim_types::{Document, RoleName};
3361
3362 DeviceStorage::init_memory_only().await.unwrap();
3364
3365 let mut config = Config::default();
3367 let role_name = RoleName::new("Test KG Role");
3368 let role = Role {
3369 shortname: Some("test-kg".to_string()),
3370 name: role_name.clone(),
3371 haystacks: vec![Haystack {
3372 location: "test".to_string(),
3373 service: ServiceType::Ripgrep,
3374 read_only: false,
3375 atomic_server_secret: None,
3376 extra_parameters: std::collections::HashMap::new(),
3377 fetch_content: false,
3378 }],
3379 kg: Some(terraphim_config::KnowledgeGraph {
3380 automata_path: Some(terraphim_automata::AutomataPath::local_example()),
3381 knowledge_graph_local: None,
3382 public: false,
3383 publish: false,
3384 }),
3385 terraphim_it: false,
3386 theme: "default".to_string(),
3387 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3388 llm_enabled: false,
3389 llm_api_key: None,
3390 llm_model: None,
3391 llm_auto_summarize: false,
3392 llm_chat_enabled: false,
3393 llm_chat_system_prompt: None,
3394 llm_chat_model: None,
3395 llm_context_window: None,
3396 extra: AHashMap::new(),
3397 llm_router_enabled: false,
3398 llm_router_config: None,
3399 };
3400 config.roles.insert(role_name.clone(), role);
3401
3402 let config_state = ConfigState::new(&mut config).await.unwrap();
3403 let _service = TerraphimService::new(config_state);
3404
3405 let test_documents = vec![
3407 Document {
3408 id: "test-doc-1".to_string(),
3409 title: "First Test Document".to_string(),
3410 body: "This is the first test document body".to_string(),
3411 url: "test://doc1".to_string(),
3412 description: Some("First document description".to_string()),
3413 summarization: None,
3414 stub: None,
3415 tags: Some(vec!["test".to_string(), "first".to_string()]),
3416 rank: None, source_haystack: None,
3418 doc_type: terraphim_types::DocumentType::KgEntry,
3419 synonyms: None,
3420 route: None,
3421 priority: None,
3422 },
3423 Document {
3424 id: "test-doc-2".to_string(),
3425 title: "Second Test Document".to_string(),
3426 body: "This is the second test document body".to_string(),
3427 url: "test://doc2".to_string(),
3428 description: Some("Second document description".to_string()),
3429 summarization: None,
3430 stub: None,
3431 tags: Some(vec!["test".to_string(), "second".to_string()]),
3432 rank: None, source_haystack: None,
3434 doc_type: terraphim_types::DocumentType::KgEntry,
3435 synonyms: None,
3436 route: None,
3437 priority: None,
3438 },
3439 Document {
3440 id: "test-doc-3".to_string(),
3441 title: "Third Test Document".to_string(),
3442 body: "This is the third test document body".to_string(),
3443 url: "test://doc3".to_string(),
3444 description: Some("Third document description".to_string()),
3445 summarization: None,
3446 stub: None,
3447 tags: Some(vec!["test".to_string(), "third".to_string()]),
3448 rank: None, source_haystack: None,
3450 doc_type: terraphim_types::DocumentType::KgEntry,
3451 synonyms: None,
3452 route: None,
3453 priority: None,
3454 },
3455 ];
3456
3457 for doc in &test_documents {
3459 doc.save().await.expect("Failed to save test document");
3460 }
3461
3462 let mut simulated_documents = test_documents.clone();
3468
3469 let total_length = simulated_documents.len();
3471 for (idx, doc) in simulated_documents.iter_mut().enumerate() {
3472 let rank = (total_length - idx) as u64;
3473 doc.rank = Some(rank);
3474 }
3475
3476 assert_eq!(simulated_documents.len(), 3, "Should have 3 test documents");
3478
3479 for doc in &simulated_documents {
3481 assert!(
3482 doc.rank.is_some(),
3483 "Document '{}' should have a rank assigned",
3484 doc.title
3485 );
3486 assert!(
3487 doc.rank.unwrap() > 0,
3488 "Document '{}' should have a positive rank",
3489 doc.title
3490 );
3491 }
3492
3493 assert_eq!(
3495 simulated_documents[0].rank,
3496 Some(3),
3497 "First document should have highest rank (3)"
3498 );
3499 assert_eq!(
3500 simulated_documents[1].rank,
3501 Some(2),
3502 "Second document should have rank 2"
3503 );
3504 assert_eq!(
3505 simulated_documents[2].rank,
3506 Some(1),
3507 "Third document should have rank 1"
3508 );
3509
3510 let mut ranks: Vec<u64> = simulated_documents
3512 .iter()
3513 .map(|doc| doc.rank.unwrap())
3514 .collect();
3515 ranks.sort_by(|a, b| b.cmp(a)); assert_eq!(
3517 ranks,
3518 vec![3, 2, 1],
3519 "Ranks should be unique and in descending order"
3520 );
3521
3522 log::info!("✅ KG term search rank assignment test completed successfully!");
3523 Ok(())
3524 }
3525}