1use ahash::AHashMap;
2use regex::Regex;
3use terraphim_automata::builder::{Logseq, ThesaurusBuilder};
4use terraphim_automata::load_thesaurus;
5use terraphim_automata::{replace_matches, LinkType};
6use terraphim_config::{ConfigState, Role};
7use terraphim_middleware::thesaurus::build_thesaurus_from_haystack;
8use terraphim_persistence::Persistable;
9use terraphim_rolegraph::{RoleGraph, RoleGraphSync};
10use terraphim_types::{
11 Document, Index, IndexedDocument, NormalizedTermValue, RelevanceFunction, RoleName,
12 SearchQuery, Thesaurus,
13};
14mod score;
15use crate::score::Query;
16
17#[cfg(feature = "openrouter")]
18pub mod openrouter;
19
20pub mod llm;
22
23pub mod llm_proxy;
29
30pub mod http_client;
34
35pub mod logging;
37
38pub mod conversation_service;
40pub mod rate_limiter;
41pub mod summarization_manager;
42pub mod summarization_queue;
43pub mod summarization_worker;
44
45pub mod error;
47
48pub mod context;
50
51#[cfg(test)]
52mod context_tests;
53
/// Normalizes a filename into a document ID.
///
/// Every character that is not an ASCII letter or digit is dropped and the
/// remainder is lowercased, e.g. `"My-File_Name.md"` becomes `"myfilenamemd"`.
///
/// This yields the same result as deleting all matches of `[^a-zA-Z0-9]+` and
/// lowercasing, but uses a plain character filter so that no regex has to be
/// compiled on each call and there is no panic path.
fn normalize_filename_to_id(filename: &str) -> String {
    filename
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
61
/// Errors produced by the service layer.
///
/// Variants carrying `#[from]` get a `From` conversion generated by
/// `thiserror`, so their source errors can be propagated with `?`.
#[derive(thiserror::Error, Debug)]
pub enum ServiceError {
    /// An error propagated from `terraphim_middleware`.
    #[error("Middleware error: {0}")]
    Middleware(#[from] terraphim_middleware::Error),

    /// An OpenDAL error, stored boxed.
    #[error("OpenDal error: {0}")]
    OpenDal(Box<opendal::Error>),

    /// An error propagated from `terraphim_persistence`.
    #[error("Persistence error: {0}")]
    Persistence(#[from] terraphim_persistence::Error),

    /// A configuration problem described by a free-form message
    /// (e.g. a role that is missing from the config).
    #[error("Config error: {0}")]
    Config(String),

    /// An OpenRouter error; only compiled with the `openrouter` feature.
    #[cfg(feature = "openrouter")]
    #[error("OpenRouter error: {0}")]
    OpenRouter(#[from] crate::openrouter::OpenRouterError),

    /// An error from this crate's shared `error::CommonError` type.
    #[error("Common error: {0}")]
    Common(#[from] crate::error::CommonError),
}
83
84impl From<opendal::Error> for ServiceError {
85 fn from(err: opendal::Error) -> Self {
86 ServiceError::OpenDal(Box::new(err))
87 }
88}
89
90impl crate::error::TerraphimError for ServiceError {
91 fn category(&self) -> crate::error::ErrorCategory {
92 use crate::error::ErrorCategory;
93 match self {
94 ServiceError::Middleware(_) => ErrorCategory::Integration,
95 ServiceError::OpenDal(_) => ErrorCategory::Storage,
96 ServiceError::Persistence(_) => ErrorCategory::Storage,
97 ServiceError::Config(_) => ErrorCategory::Configuration,
98 #[cfg(feature = "openrouter")]
99 ServiceError::OpenRouter(_) => ErrorCategory::Integration,
100 ServiceError::Common(err) => err.category(),
101 }
102 }
103
104 fn is_recoverable(&self) -> bool {
105 match self {
106 ServiceError::Middleware(_) => true,
107 ServiceError::OpenDal(_) => false,
108 ServiceError::Persistence(_) => false,
109 ServiceError::Config(_) => false,
110 #[cfg(feature = "openrouter")]
111 ServiceError::OpenRouter(_) => true,
112 ServiceError::Common(err) => err.is_recoverable(),
113 }
114 }
115}
116
/// Result alias used throughout this crate, with [`ServiceError`] as the error type.
pub type Result<T> = std::result::Result<T, ServiceError>;
118
/// Service object that wraps a [`ConfigState`] and exposes thesaurus,
/// document and search operations on top of it.
pub struct TerraphimService {
    /// Configuration and per-role rolegraphs the service operates on.
    config_state: ConfigState,
}
122
123impl TerraphimService {
124 pub fn new(config_state: ConfigState) -> Self {
126 Self { config_state }
127 }
128
129 async fn build_thesaurus(&mut self, search_query: &SearchQuery) -> Result<()> {
131 Ok(build_thesaurus_from_haystack(&mut self.config_state, search_query).await?)
132 }
133 pub async fn ensure_thesaurus_loaded(&mut self, role_name: &RoleName) -> Result<Thesaurus> {
135 async fn load_thesaurus_from_automata_path(
136 config_state: &ConfigState,
137 role_name: &RoleName,
138 rolegraphs: &mut AHashMap<RoleName, RoleGraphSync>,
139 ) -> Result<Thesaurus> {
140 let config = config_state.config.lock().await;
141 let Some(role) = config.roles.get(role_name).cloned() else {
142 return Err(ServiceError::Config(format!(
143 "Role '{}' not found in config",
144 role_name
145 )));
146 };
147 if let Some(kg) = &role.kg {
148 if let Some(automata_path) = &kg.automata_path {
149 log::info!("Loading Role `{}` - URL: {:?}", role_name, automata_path);
150
151 match load_thesaurus(automata_path).await {
153 Ok(mut thesaurus) => {
154 log::info!("Successfully loaded thesaurus from automata path");
155
156 match thesaurus.save().await {
158 Ok(_) => {
159 log::info!(
160 "Thesaurus for role `{}` saved to persistence",
161 role_name
162 );
163 match thesaurus.load().await {
165 Ok(persisted_thesaurus) => {
166 thesaurus = persisted_thesaurus;
167 log::debug!("Reloaded thesaurus from persistence");
168 }
169 Err(e) => {
170 log::warn!(
171 "Failed to reload thesaurus from persistence, using in-memory version: {:?}",
172 e
173 );
174 }
175 }
176 }
177 Err(e) => {
178 log::warn!("Failed to save thesaurus to persistence: {:?}", e);
179 }
180 }
181
182 let rolegraph =
183 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
184 match rolegraph {
185 Ok(rolegraph) => {
186 let rolegraph_value = RoleGraphSync::from(rolegraph);
187 rolegraphs.insert(role_name.clone(), rolegraph_value);
188 }
189 Err(e) => {
190 log::error!("Failed to update role and thesaurus: {:?}", e)
191 }
192 }
193 Ok(thesaurus)
194 }
195 Err(e) => {
196 log::warn!("Failed to load thesaurus from automata path: {:?}", e);
197 if let Some(kg_local) = &kg.knowledge_graph_local {
199 log::info!(
200 "Fallback: building thesaurus from local KG for role {}",
201 role_name
202 );
203 let logseq_builder = Logseq::default();
204 match logseq_builder
205 .build(
206 role_name.as_lowercase().to_string(),
207 kg_local.path.clone(),
208 )
209 .await
210 {
211 Ok(mut thesaurus) => {
212 match thesaurus.save().await {
214 Ok(_) => {
215 log::info!(
216 "Fallback thesaurus for role `{}` saved to persistence",
217 role_name
218 );
219 match thesaurus.load().await {
221 Ok(persisted_thesaurus) => {
222 thesaurus = persisted_thesaurus;
223 log::debug!(
224 "Reloaded fallback thesaurus from persistence"
225 );
226 }
227 Err(e) => {
228 log::warn!(
229 "Failed to reload fallback thesaurus from persistence, using in-memory version: {:?}",
230 e
231 );
232 }
233 }
234 }
235 Err(e) => {
236 log::warn!(
237 "Failed to save fallback thesaurus to persistence: {:?}",
238 e
239 );
240 }
241 }
242
243 let rolegraph =
244 RoleGraph::new(role_name.clone(), thesaurus.clone())
245 .await;
246 match rolegraph {
247 Ok(rolegraph) => {
248 let rolegraph_value =
249 RoleGraphSync::from(rolegraph);
250 rolegraphs
251 .insert(role_name.clone(), rolegraph_value);
252 }
253 Err(e) => log::error!(
254 "Failed to update role and thesaurus: {:?}",
255 e
256 ),
257 }
258
259 Ok(thesaurus)
260 }
261 Err(e) => {
262 log::error!(
263 "Failed to build thesaurus from local KG for role {}: {:?}",
264 role_name,
265 e
266 );
267 Err(ServiceError::Config(
268 "Failed to load or build thesaurus".into(),
269 ))
270 }
271 }
272 } else {
273 log::error!(
274 "No fallback available for role {}: no local KG path configured",
275 role_name
276 );
277 Err(ServiceError::Config(
278 "No automata path and no local KG available".into(),
279 ))
280 }
281 }
282 }
283 } else if let Some(kg_local) = &kg.knowledge_graph_local {
284 log::info!(
286 "Role {} has no automata_path, building thesaurus from local KG files at {:?}",
287 role_name,
288 kg_local.path
289 );
290 let logseq_builder = Logseq::default();
291 match logseq_builder
292 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
293 .await
294 {
295 Ok(mut thesaurus) => {
296 log::info!(
297 "Successfully built thesaurus from local KG for role {}",
298 role_name
299 );
300
301 match thesaurus.save().await {
303 Ok(_) => {
304 log::info!(
305 "Local KG thesaurus for role `{}` saved to persistence",
306 role_name
307 );
308 match thesaurus.load().await {
310 Ok(persisted_thesaurus) => {
311 log::info!(
312 "Reloaded local KG thesaurus from persistence: {} entries",
313 persisted_thesaurus.len()
314 );
315 thesaurus = persisted_thesaurus;
316 }
317 Err(e) => {
318 log::warn!(
319 "Failed to reload local KG thesaurus from persistence, using in-memory version: {:?}",
320 e
321 );
322 }
323 }
324 }
325 Err(e) => {
326 log::warn!(
327 "Failed to save local KG thesaurus to persistence: {:?}",
328 e
329 );
330 }
331 }
332
333 let rolegraph =
334 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
335 match rolegraph {
336 Ok(rolegraph) => {
337 let rolegraph_value = RoleGraphSync::from(rolegraph);
338 rolegraphs.insert(role_name.clone(), rolegraph_value);
339 }
340 Err(e) => {
341 log::error!("Failed to update role and thesaurus: {:?}", e)
342 }
343 }
344
345 Ok(thesaurus)
346 }
347 Err(e) => {
348 log::error!(
349 "Failed to build thesaurus from local KG for role {}: {:?}",
350 role_name,
351 e
352 );
353 Err(ServiceError::Config(
354 "Failed to build thesaurus from local KG".into(),
355 ))
356 }
357 }
358 } else {
359 log::warn!(
360 "Role {} is configured for TerraphimGraph but has neither automata_path nor knowledge_graph_local defined.",
361 role_name
362 );
363 if let Some(kg_local) = &kg.knowledge_graph_local {
364 log::info!(
366 "Building thesaurus from local KG files for role {} at {:?}",
367 role_name,
368 kg_local.path
369 );
370 let logseq_builder = Logseq::default();
371 match logseq_builder
372 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
373 .await
374 {
375 Ok(mut thesaurus) => {
376 log::info!(
377 "Successfully built thesaurus from local KG for role {}",
378 role_name
379 );
380
381 match thesaurus.save().await {
383 Ok(_) => {
384 log::info!(
385 "No-automata thesaurus for role `{}` saved to persistence",
386 role_name
387 );
388 match thesaurus.load().await {
390 Ok(persisted_thesaurus) => {
391 thesaurus = persisted_thesaurus;
392 log::debug!(
393 "Reloaded no-automata thesaurus from persistence"
394 );
395 }
396 Err(e) => {
397 log::warn!(
398 "Failed to reload no-automata thesaurus from persistence, using in-memory version: {:?}",
399 e
400 );
401 }
402 }
403 }
404 Err(e) => {
405 log::warn!(
406 "Failed to save no-automata thesaurus to persistence: {:?}",
407 e
408 );
409 }
410 }
411
412 let rolegraph =
413 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
414 match rolegraph {
415 Ok(rolegraph) => {
416 let rolegraph_value = RoleGraphSync::from(rolegraph);
417 rolegraphs.insert(role_name.clone(), rolegraph_value);
418 }
419 Err(e) => {
420 log::error!("Failed to update role and thesaurus: {:?}", e)
421 }
422 }
423
424 Ok(thesaurus)
425 }
426 Err(e) => {
427 log::error!(
428 "Failed to build thesaurus from local KG for role {}: {:?}",
429 role_name,
430 e
431 );
432 Err(ServiceError::Config(
433 "Failed to build thesaurus from local KG".into(),
434 ))
435 }
436 }
437 } else {
438 Err(ServiceError::Config(
439 "No local knowledge graph path available".into(),
440 ))
441 }
442 }
443 } else {
444 Err(ServiceError::Config(
445 "Knowledge graph not configured".into(),
446 ))
447 }
448 }
449
450 log::debug!("Loading thesaurus for role: {}", role_name);
451 log::debug!("Role keys {:?}", self.config_state.roles.keys());
452
453 if let Some(rolegraph_value) = self.config_state.roles.get(role_name) {
454 let thesaurus_result = rolegraph_value.lock().await.thesaurus.clone().load().await;
455 match thesaurus_result {
456 Ok(thesaurus) => {
457 log::debug!("Thesaurus loaded: {:?}", thesaurus);
458 log::info!("Rolegraph loaded: for role name {:?}", role_name);
459 Ok(thesaurus)
460 }
461 Err(e) => {
462 log::error!("Failed to load thesaurus: {:?}", e);
463 let mut rolegraphs = self.config_state.roles.clone();
465 let result = load_thesaurus_from_automata_path(
466 &self.config_state,
467 role_name,
468 &mut rolegraphs,
469 )
470 .await;
471
472 if result.is_ok() {
474 if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
475 self.config_state
476 .roles
477 .insert(role_name.clone(), updated_rolegraph.clone());
478 log::info!(
479 "Updated config_state with new rolegraph for role: {}",
480 role_name
481 );
482 }
483 }
484
485 result
486 }
487 }
488 } else {
489 let mut rolegraphs = self.config_state.roles.clone();
491 let result =
492 load_thesaurus_from_automata_path(&self.config_state, role_name, &mut rolegraphs)
493 .await;
494
495 if result.is_ok() {
497 if let Some(new_rolegraph) = rolegraphs.get(role_name) {
498 self.config_state
499 .roles
500 .insert(role_name.clone(), new_rolegraph.clone());
501 log::info!(
502 "Added new rolegraph to config_state for role: {}",
503 role_name
504 );
505 }
506 }
507
508 result
509 }
510 }
511
512 pub async fn preprocess_document_content(
518 &mut self,
519 mut document: Document,
520 role: &Role,
521 ) -> Result<Document> {
522 if !role.terraphim_it {
524 log::info!(
525 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing",
526 role.name
527 );
528 return Ok(document);
529 }
530
531 let Some(_kg) = &role.kg else {
532 log::info!(
533 "⚠️ No KG configured for role '{}', skipping KG preprocessing",
534 role.name
535 );
536 return Ok(document);
537 };
538
539 log::info!(
540 "🧠 Starting KG preprocessing for document '{}' in role '{}' (terraphim_it enabled)",
541 document.title,
542 role.name
543 );
544 log::debug!(
545 "📄 Document preview: {} characters starting with: {}",
546 document.body.len(),
547 &document.body.chars().take(100).collect::<String>()
548 );
549
550 let thesaurus = match self.ensure_thesaurus_loaded(&role.name).await {
552 Ok(thesaurus) => thesaurus,
553 Err(e) => {
554 log::warn!("Failed to load thesaurus for role {}: {:?}", role.name, e);
555 return Ok(document); }
557 };
558
559 let mut kg_thesaurus = Thesaurus::new(format!("kg_links_{}", role.name));
561
562 let important_kg_terms = [
565 "graph",
566 "haystack",
567 "service",
568 "terraphim",
569 "knowledge",
570 "embedding",
571 "search",
572 "automata",
573 "thesaurus",
574 "rolegraph",
575 ];
576
577 let excluded_common_terms = [
579 "system",
580 "config",
581 "configuration",
582 "type",
583 "method",
584 "function",
585 "class",
586 "component",
587 "module",
588 "library",
589 "framework",
590 "interface",
591 "api",
592 "data",
593 "file",
594 "path",
595 "url",
596 "string",
597 "number",
598 "value",
599 "option",
600 "parameter",
601 "field",
602 "property",
603 "attribute",
604 "element",
605 "item",
606 "object",
607 "array",
608 "list",
609 "map",
610 "set",
611 "collection",
612 "server",
613 "client",
614 "request",
615 "response",
616 "error",
617 "result",
618 "success",
619 "failure",
620 "true",
621 "false",
622 "null",
623 "undefined",
624 "empty",
625 "full",
626 "start",
627 "end",
628 "begin",
629 "finish",
630 "create",
631 "delete",
632 "update",
633 "read",
634 "write",
635 "load",
636 "save",
637 "process",
638 "handle",
639 "manage",
640 "control",
641 "execute",
642 "run",
643 "call",
644 "invoke",
645 "trigger",
646 "event",
647 "action",
648 "command",
649 "query",
650 "search",
651 "filter",
652 "sort",
653 "order",
654 "group",
655 "match",
656 "find",
657 "replace",
658 "insert",
659 "remove",
660 "add",
661 "set",
662 "get",
663 "put",
664 "post",
665 "head",
666 "patch",
667 "delete",
668 ];
669
670 let mut sorted_terms: Vec<_> = (&thesaurus)
671 .into_iter()
672 .filter(|(key, _)| {
673 let term = key.as_str();
674
675 if term.is_empty() || term.len() < 3 {
677 return false;
678 }
679
680 if important_kg_terms.contains(&term) {
682 return true;
683 }
684
685 if excluded_common_terms.contains(&term) {
687 return false;
688 }
689
690 term.len() > 5
696 || term.contains('-')
697 || term.contains('_')
698 || term.chars().next().is_some_and(|c| c.is_uppercase())
699 })
700 .collect();
701
702 sorted_terms.sort_by(|a, b| {
704 let a_important = important_kg_terms.contains(&a.0.as_str());
705 let b_important = important_kg_terms.contains(&b.0.as_str());
706
707 match (a_important, b_important) {
708 (true, false) => std::cmp::Ordering::Less, (false, true) => std::cmp::Ordering::Greater, _ => b.1.id.cmp(&a.1.id), }
712 });
713
714 let max_kg_terms = 8;
716 for (key, value) in sorted_terms.into_iter().take(max_kg_terms) {
717 let mut kg_value = value.clone();
718 kg_value.value = key.clone(); kg_value.url = Some(format!("kg:{}", value.value)); kg_thesaurus.insert(key.clone(), kg_value);
724 }
725
726 let kg_terms_count = kg_thesaurus.len();
727 log::info!(
728 "📋 KG thesaurus filtering: {} → {} terms (prioritizing: {}, filters: len>5, hyphenated, or important KG terms)",
729 thesaurus.len(),
730 kg_terms_count,
731 important_kg_terms.join(", ")
732 );
733
734 if kg_terms_count > 0 {
736 let terms: Vec<String> = (&kg_thesaurus)
737 .into_iter()
738 .map(|(k, v)| format!("'{}' → kg:{}", k, v.value))
739 .collect();
740 log::info!("🔍 KG terms selected for linking: {}", terms.join(", "));
741 } else {
742 log::info!(
743 "⚠️ No KG terms passed filtering criteria - document '{}' will have no KG links",
744 document.title
745 );
746 }
747
748 if !kg_thesaurus.is_empty() {
750 let debug_thesaurus: Vec<String> = (&kg_thesaurus)
752 .into_iter()
753 .map(|(k, v)| format!("'{}' -> '{}' (url: {:?})", k, v.value, v.url))
754 .take(3) .collect();
756 log::info!(
757 "🔧 Passing to replace_matches: {} (total terms: {})",
758 debug_thesaurus.join(", "),
759 kg_thesaurus.len()
760 );
761 let preview = if document.body.chars().count() > 200 {
762 document.body.chars().take(200).collect::<String>() + "..."
763 } else {
764 document.body.clone()
765 };
766 log::info!("📝 Document body preview (first 200 chars): {}", preview);
767
768 match replace_matches(&document.body, kg_thesaurus, LinkType::MarkdownLinks) {
769 Ok(processed_bytes) => {
770 match String::from_utf8(processed_bytes) {
771 Ok(processed_content) => {
772 log::info!(
773 "✅ Successfully preprocessed document '{}' with {} KG terms → created [term](kg:concept) links",
774 document.title,
775 kg_terms_count
776 );
777
778 let content_changed = processed_content != document.body;
780 log::info!(
781 "🔄 Content changed: {} (original: {} chars, processed: {} chars)",
782 content_changed,
783 document.body.len(),
784 processed_content.len()
785 );
786
787 let kg_links: Vec<&str> = processed_content
789 .split("[")
790 .filter_map(|s| s.find("](kg:").map(|closing| &s[..closing]))
791 .collect();
792
793 if !kg_links.is_empty() {
794 log::info!(
795 "🔗 Found KG links in processed content: [{}](kg:...)",
796 kg_links.join("], [")
797 );
798
799 if let Some(first_link_pos) = processed_content.find("](kg:") {
801 let start = first_link_pos.saturating_sub(50);
802 let end = (first_link_pos + 100).min(processed_content.len());
803 log::info!(
804 "📄 Content snippet with KG link: ...{}...",
805 &processed_content[start..end]
806 );
807 }
808 } else {
809 log::warn!(
810 "⚠️ No KG links found in processed content despite successful replacement"
811 );
812 }
813
814 document.body = processed_content;
815 }
816 Err(e) => {
817 log::warn!(
818 "Failed to convert processed content to UTF-8 for document '{}': {:?}",
819 document.title,
820 e
821 );
822 }
823 }
824 }
825 Err(e) => {
826 log::warn!(
827 "Failed to replace KG terms in document '{}': {:?}",
828 document.title,
829 e
830 );
831 }
832 }
833 } else {
834 log::info!(
835 "💭 No specific KG terms found for document '{}' (filters excluded generic terms)",
836 document.title
837 );
838 }
839
840 Ok(document)
841 }
842
843 pub async fn preprocess_document_content_with_search(
845 &mut self,
846 document: Document,
847 role: &Role,
848 search_query: Option<&SearchQuery>,
849 ) -> Result<Document> {
850 let mut processed_doc = self.preprocess_document_content(document, role).await?;
852
853 if let Some(query) = search_query {
855 log::debug!(
856 "Applying search term highlighting to document '{}'",
857 processed_doc.title
858 );
859 processed_doc.body = Self::highlight_search_terms(&processed_doc.body, query);
860 }
861
862 Ok(processed_doc)
863 }
864
865 pub async fn create_document(&mut self, document: Document) -> Result<Document> {
867 document.save().await?;
870
871 self.config_state.add_to_roles(&document).await?;
874
875 use terraphim_config::ServiceType;
879 use terraphim_middleware::indexer::RipgrepIndexer;
880
881 let ripgrep = RipgrepIndexer::default();
882 let config_snapshot = { self.config_state.config.lock().await.clone() };
883
884 for role in config_snapshot.roles.values() {
885 for haystack in &role.haystacks {
886 if haystack.service == ServiceType::Ripgrep && !haystack.read_only {
887 if let Err(e) = ripgrep.update_document(&document).await {
888 log::warn!(
889 "Failed to write document {} to haystack {:?}: {:?}",
890 document.id,
891 haystack.location,
892 e
893 );
894 }
895 }
896 }
897 }
898
899 Ok(document)
900 }
901
902 pub async fn get_document_by_id(&mut self, document_id: &str) -> Result<Option<Document>> {
908 log::debug!("Getting document by ID: '{}'", document_id);
909
910 if document_id.trim().is_empty() {
912 log::warn!("Empty or whitespace-only document_id provided");
913 return Ok(None);
914 }
915
916 let mut placeholder = Document {
918 id: document_id.to_string(),
919 ..Default::default()
920 };
921 match placeholder.load().await {
922 Ok(doc) => {
923 log::debug!("Found document '{}' with direct ID lookup", document_id);
924 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
925 }
926 Err(e) => {
927 log::debug!(
928 "Document '{}' not found with direct lookup: {:?}",
929 document_id,
930 e
931 );
932 }
933 }
934
935 if document_id.contains('.') || document_id.contains('-') || document_id.contains('_') {
937 let normalized_id = normalize_filename_to_id(document_id);
938 log::debug!(
939 "Trying normalized ID '{}' for filename '{}'",
940 normalized_id,
941 document_id
942 );
943
944 let mut normalized_placeholder = Document {
945 id: normalized_id.clone(),
946 ..Default::default()
947 };
948 match normalized_placeholder.load().await {
949 Ok(doc) => {
950 log::debug!(
951 "Found document '{}' with normalized ID '{}'",
952 document_id,
953 normalized_id
954 );
955 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
956 }
957 Err(e) => {
958 log::debug!(
959 "Document '{}' not found with normalized ID '{}': {:?}",
960 document_id,
961 normalized_id,
962 e
963 );
964 }
965 }
966 }
967
968 log::debug!("Falling back to search for document '{}'", document_id);
970 let search_query = SearchQuery {
971 search_term: NormalizedTermValue::new(document_id.to_string()),
972 search_terms: None,
973 operator: None,
974 limit: Some(5), skip: None,
976 role: None,
977 };
978
979 let documents = self.search(&search_query).await?;
980
981 for doc in documents {
983 if doc.title == document_id || doc.id == document_id {
984 log::debug!("Found document '{}' via search fallback", document_id);
985 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
986 }
987 }
988
989 log::debug!("Document '{}' not found anywhere", document_id);
990 Ok(None)
991 }
992
993 async fn apply_kg_preprocessing_if_needed(&mut self, document: Document) -> Result<Document> {
999 log::debug!(
1000 "🔍 [KG-DEBUG] apply_kg_preprocessing_if_needed called for document: '{}'",
1001 document.title
1002 );
1003 log::debug!(
1004 "🔍 [KG-DEBUG] Document body preview: {}",
1005 document.body.chars().take(100).collect::<String>()
1006 );
1007
1008 let role = {
1009 let config = self.config_state.config.lock().await;
1010 let selected_role = &config.selected_role;
1011
1012 log::debug!("🔍 [KG-DEBUG] Selected role: '{}'", selected_role);
1013
1014 match config.roles.get(selected_role) {
1015 Some(role) => {
1016 log::debug!(
1017 "🔍 [KG-DEBUG] Role found: '{}', terraphim_it: {}",
1018 role.name,
1019 role.terraphim_it
1020 );
1021 role.clone() }
1023 None => {
1024 log::warn!(
1025 "❌ [KG-DEBUG] Selected role '{}' not found in config, skipping KG preprocessing",
1026 selected_role
1027 );
1028 return Ok(document);
1029 }
1030 }
1031 }; if !role.terraphim_it {
1035 log::info!(
1036 "🔍 [KG-DEBUG] terraphim_it disabled for role '{}', skipping KG preprocessing",
1037 role.name
1038 );
1039 return Ok(document);
1040 }
1041
1042 let has_existing_kg_links = document.body.contains("](kg:");
1044 log::debug!(
1045 "🔍 [KG-DEBUG] Document already has KG links: {}",
1046 has_existing_kg_links
1047 );
1048 if has_existing_kg_links {
1049 log::info!(
1050 "🔍 [KG-DEBUG] Document '{}' already has KG links, skipping preprocessing to prevent double processing",
1051 document.title
1052 );
1053 return Ok(document);
1054 }
1055
1056 log::info!(
1057 "🧠 [KG-DEBUG] Starting KG preprocessing for document '{}' with role '{}' (terraphim_it enabled)",
1058 document.title,
1059 role.name
1060 );
1061
1062 let document_title = document.title.clone(); let processed_doc = match self.preprocess_document_content(document, &role).await {
1065 Ok(doc) => {
1066 let links_added = doc.body.contains("](kg:");
1067 log::info!(
1068 "✅ [KG-DEBUG] KG preprocessing completed for document '{}'. Links added: {}",
1069 doc.title,
1070 links_added
1071 );
1072 if links_added {
1073 log::debug!(
1074 "🔍 [KG-DEBUG] Processed body preview: {}",
1075 doc.body.chars().take(200).collect::<String>()
1076 );
1077 }
1078 doc
1079 }
1080 Err(e) => {
1081 log::error!(
1082 "❌ [KG-DEBUG] KG preprocessing failed for document '{}': {:?}",
1083 document_title,
1084 e
1085 );
1086 return Err(e);
1087 }
1088 };
1089
1090 Ok(processed_doc)
1091 }
1092
1093 #[allow(dead_code)] async fn enhance_descriptions_with_ai(
1099 &self,
1100 mut documents: Vec<Document>,
1101 role: &Role,
1102 ) -> Result<Vec<Document>> {
1103 use crate::llm::{build_llm_from_role, SummarizeOptions};
1104
1105 eprintln!("🤖 Attempting to build LLM client for role: {}", role.name);
1106 let llm = match build_llm_from_role(role) {
1107 Some(client) => {
1108 eprintln!("✅ LLM client successfully created: {}", client.name());
1109 client
1110 }
1111 None => {
1112 eprintln!("❌ No LLM client available for role: {}", role.name);
1113 return Ok(documents);
1114 }
1115 };
1116
1117 log::info!(
1118 "Enhancing {} document descriptions with LLM provider: {}",
1119 documents.len(),
1120 llm.name()
1121 );
1122
1123 let mut enhanced_count = 0;
1124 let mut error_count = 0;
1125
1126 for document in &mut documents {
1127 if self.should_generate_ai_summary(document) {
1128 let summary_length = 250;
1129 match llm
1130 .summarize(
1131 &document.body,
1132 SummarizeOptions {
1133 max_length: summary_length,
1134 },
1135 )
1136 .await
1137 {
1138 Ok(ai_summary) => {
1139 log::debug!(
1140 "Generated AI summary for '{}': {} characters",
1141 document.title,
1142 ai_summary.len()
1143 );
1144 document.description = Some(ai_summary);
1145 enhanced_count += 1;
1146 }
1147 Err(e) => {
1148 log::warn!(
1149 "Failed to generate AI summary for '{}': {}",
1150 document.title,
1151 e
1152 );
1153 error_count += 1;
1154 }
1155 }
1156 }
1157 }
1158
1159 log::info!(
1160 "LLM enhancement complete: {} enhanced, {} errors, {} skipped",
1161 enhanced_count,
1162 error_count,
1163 documents.len() - enhanced_count - error_count
1164 );
1165
1166 Ok(documents)
1167 }
1168
1169 #[allow(dead_code)] fn should_generate_ai_summary(&self, document: &Document) -> bool {
1175 if document.body.trim().len() < 200 {
1177 return false;
1178 }
1179
1180 if let Some(ref description) = document.description {
1182 if description.len() > 100 && !description.ends_with("...") {
1184 return false;
1185 }
1186 }
1187
1188 if document.body.len() > 8000 {
1190 return false;
1191 }
1192
1193 true
1195 }
1196
1197 async fn get_search_role(&self, search_query: &SearchQuery) -> Result<Role> {
1199 let search_role = match &search_query.role {
1200 Some(role) => role.clone(),
1201 None => self.config_state.get_default_role().await,
1202 };
1203
1204 log::debug!("Searching for role: {:?}", search_role);
1205 let Some(role) = self.config_state.get_role(&search_role).await else {
1206 return Err(ServiceError::Config(format!(
1207 "Role `{}` not found in config",
1208 search_role
1209 )));
1210 };
1211 Ok(role)
1212 }
1213
1214 fn term_matches_with_word_boundaries(term: &str, text: &str) -> bool {
1216 if let Ok(regex) = Regex::new(&format!(r"\b{}\b", regex::escape(term))) {
1218 regex.is_match(text)
1219 } else {
1220 text.contains(term)
1222 }
1223 }
1224
1225 pub async fn apply_logical_operators_to_documents(
1227 &mut self,
1228 search_query: &SearchQuery,
1229 documents: Vec<Document>,
1230 ) -> Result<Vec<Document>> {
1231 use terraphim_types::LogicalOperator;
1232
1233 let all_terms = search_query.get_all_terms();
1234 let operator = search_query.get_operator();
1235
1236 let initial_doc_count = documents.len();
1237
1238 log::debug!(
1239 "Applying {:?} operator to {} documents with {} search terms",
1240 operator,
1241 initial_doc_count,
1242 all_terms.len()
1243 );
1244
1245 let filtered_docs: Vec<Document> = documents
1246 .into_iter()
1247 .filter(|doc| {
1248 let searchable_text = format!(
1250 "{} {} {}",
1251 doc.title.to_lowercase(),
1252 doc.body.to_lowercase(),
1253 doc.description
1254 .as_ref()
1255 .unwrap_or(&String::new())
1256 .to_lowercase()
1257 );
1258
1259 match operator {
1260 LogicalOperator::And => {
1261 all_terms.iter().all(|term| {
1263 Self::term_matches_with_word_boundaries(
1264 &term.as_str().to_lowercase(),
1265 &searchable_text,
1266 )
1267 })
1268 }
1269 LogicalOperator::Or => {
1270 all_terms.iter().any(|term| {
1272 Self::term_matches_with_word_boundaries(
1273 &term.as_str().to_lowercase(),
1274 &searchable_text,
1275 )
1276 })
1277 }
1278 }
1279 })
1280 .collect();
1281
1282 log::debug!(
1283 "Logical operator filtering: {} -> {} documents",
1284 initial_doc_count,
1285 filtered_docs.len()
1286 );
1287
1288 let combined_query_string = all_terms
1290 .iter()
1291 .map(|t| t.as_str())
1292 .collect::<Vec<_>>()
1293 .join(" ");
1294 let query = Query::new(&combined_query_string);
1295 let sorted_docs = score::sort_documents(&query, filtered_docs);
1296
1297 Ok(sorted_docs)
1298 }
1299
1300 pub async fn search_documents_selected_role(
1303 &mut self,
1304 search_term: &NormalizedTermValue,
1305 ) -> Result<Vec<Document>> {
1306 let role = self.config_state.get_selected_role().await;
1307 let documents = self
1308 .search(&SearchQuery {
1309 search_term: search_term.clone(),
1310 search_terms: None,
1311 operator: None,
1312 role: Some(role),
1313 skip: None,
1314 limit: None,
1315 })
1316 .await?;
1317 Ok(documents)
1318 }
1319
1320 pub async fn search(&mut self, search_query: &SearchQuery) -> Result<Vec<Document>> {
1322 log::debug!("Role for searching: {:?}", search_query.role);
1324 let role = self.get_search_role(search_query).await?;
1325
1326 log::trace!("Building index for search query: {:?}", search_query);
1327 let index: Index =
1328 terraphim_middleware::search_haystacks(self.config_state.clone(), search_query.clone())
1329 .await?;
1330
1331 match role.relevance_function {
1332 RelevanceFunction::TitleScorer => {
1333 log::debug!("Searching haystack with title scorer");
1334
1335 let documents = index.get_all_documents();
1336
1337 log::debug!("Sorting documents by relevance");
1338
1339 let documents = if search_query.is_multi_term_query() {
1340 self.apply_logical_operators_to_documents(search_query, documents)
1342 .await?
1343 } else {
1344 let query = Query::new(&search_query.search_term.to_string());
1346 score::sort_documents(&query, documents)
1347 };
1348 let total_length = documents.len();
1349 let mut docs_ranked = Vec::new();
1350 for (idx, doc) in documents.iter().enumerate() {
1351 let mut document: terraphim_types::Document = doc.clone();
1352 let rank = (total_length - idx).try_into().unwrap();
1353 document.rank = Some(rank);
1354
1355 if document.id.starts_with("http://") || document.id.starts_with("https://") {
1357 log::debug!(
1359 "Processing Atomic Data document '{}' (URL: {})",
1360 document.title,
1361 document.id
1362 );
1363
1364 let mut placeholder = Document {
1366 id: document.id.clone(),
1367 ..Default::default()
1368 };
1369 match placeholder.load().await {
1370 Ok(persisted_doc) => {
1371 log::debug!(
1373 "Found cached Atomic Data document '{}' in persistence",
1374 document.title
1375 );
1376 if let Some(better_description) = persisted_doc.description {
1377 document.description = Some(better_description);
1378 }
1379 if !persisted_doc.body.is_empty() && !role.terraphim_it {
1383 log::debug!(
1384 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
1385 document.title,
1386 role.name,
1387 role.terraphim_it
1388 );
1389 document.body = persisted_doc.body;
1390 } else if role.terraphim_it {
1391 log::debug!(
1392 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
1393 document.title,
1394 role.name
1395 );
1396 }
1397 }
1398 Err(_) => {
1399 log::debug!(
1401 "Caching Atomic Data document '{}' to persistence for future queries",
1402 document.title
1403 );
1404
1405 let doc_to_save = document.clone();
1407 tokio::spawn(async move {
1408 if let Err(e) = doc_to_save.save().await {
1409 log::warn!(
1410 "Failed to cache Atomic Data document '{}': {}",
1411 doc_to_save.title,
1412 e
1413 );
1414 } else {
1415 log::debug!(
1416 "Successfully cached Atomic Data document '{}'",
1417 doc_to_save.title
1418 );
1419 }
1420 });
1421 }
1422 }
1423 } else {
1424 let should_lookup_persistence = document
1426 .get_source_haystack()
1427 .and_then(|source| {
1428 role.haystacks
1429 .iter()
1430 .find(|haystack| haystack.location == *source)
1431 })
1432 .map(|haystack| haystack.fetch_content)
1433 .unwrap_or(true);
1434
1435 if !should_lookup_persistence {
1436 log::trace!(
1437 "Skipping persistence lookup for '{}' (haystack fetch_content=false)",
1438 document.title
1439 );
1440 } else {
1441 let mut placeholder = Document {
1442 id: document.id.clone(),
1443 ..Default::default()
1444 };
1445 if let Ok(persisted_doc) = placeholder.load().await {
1446 if let Some(better_description) = persisted_doc.description {
1447 log::debug!(
1448 "Replaced ripgrep description for '{}' with persistence description",
1449 document.title
1450 );
1451 document.description = Some(better_description);
1452 }
1453 } else {
1454 let normalized_id = normalize_filename_to_id(&document.title);
1457
1458 let mut normalized_placeholder = Document {
1459 id: normalized_id.clone(),
1460 ..Default::default()
1461 };
1462 if let Ok(persisted_doc) = normalized_placeholder.load().await {
1463 if let Some(better_description) = persisted_doc.description {
1464 log::debug!(
1465 "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
1466 document.title,
1467 normalized_id
1468 );
1469 document.description = Some(better_description);
1470 }
1471 } else {
1472 let normalized_id_with_md = format!("{}md", normalized_id);
1474 let mut md_placeholder = Document {
1475 id: normalized_id_with_md.clone(),
1476 ..Default::default()
1477 };
1478 if let Ok(persisted_doc) = md_placeholder.load().await {
1479 if let Some(better_description) = persisted_doc.description
1480 {
1481 log::debug!(
1482 "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
1483 document.title,
1484 normalized_id_with_md
1485 );
1486 document.description = Some(better_description);
1487 }
1488 } else {
1489 log::debug!(
1490 "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
1491 document.title,
1492 document.id,
1493 normalized_id,
1494 normalized_id_with_md
1495 );
1496 }
1497 }
1498 }
1499 }
1500 }
1501
1502 docs_ranked.push(document);
1503 }
1504
1505 #[cfg(feature = "openrouter")]
1508 if role.has_llm_config() && role.llm_auto_summarize {
1509 log::debug!(
1510 "Applying OpenRouter AI summarization to {} search results for role '{}'",
1511 docs_ranked.len(),
1512 role.name
1513 );
1514 docs_ranked = self
1515 .enhance_descriptions_with_ai(docs_ranked, &role)
1516 .await?;
1517 } else {
1518 eprintln!(
1520 "📋 Entering LLM AI summarization branch for role: {}",
1521 role.name
1522 );
1523 log::debug!(
1524 "Applying LLM AI summarization to {} search results for role '{}'",
1525 docs_ranked.len(),
1526 role.name
1527 );
1528 docs_ranked = self
1529 .enhance_descriptions_with_ai(docs_ranked, &role)
1530 .await?;
1531 }
1532
1533 if role.terraphim_it {
1535 log::info!(
1536 "🧠 Applying KG preprocessing to {} TerraphimGraph search results for role '{}'",
1537 docs_ranked.len(),
1538 role.name
1539 );
1540 let mut processed_docs = Vec::new();
1541 let mut total_kg_terms = 0;
1542 let mut docs_with_kg_links = 0;
1543
1544 for document in docs_ranked {
1545 let original_body_len = document.body.len();
1546 let processed_doc =
1547 self.preprocess_document_content(document, &role).await?;
1548
1549 let new_body_len = processed_doc.body.len();
1551 if new_body_len > original_body_len {
1552 docs_with_kg_links += 1;
1553 let estimated_links = (new_body_len - original_body_len) / 17;
1555 total_kg_terms += estimated_links;
1556 }
1557
1558 processed_docs.push(processed_doc);
1559 }
1560
1561 log::info!(
1562 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1563 processed_docs.len(),
1564 docs_with_kg_links,
1565 total_kg_terms
1566 );
1567 Ok(processed_docs)
1568 } else {
1569 Ok(docs_ranked)
1570 }
1571 }
1572 RelevanceFunction::BM25 => {
1573 log::debug!("Searching haystack with BM25 scorer");
1574
1575 let documents = index.get_all_documents();
1576
1577 log::debug!("Sorting documents by BM25 relevance");
1578
1579 let documents = if search_query.is_multi_term_query() {
1580 let filtered_docs = self
1582 .apply_logical_operators_to_documents(search_query, documents)
1583 .await?;
1584 let combined_query_string = search_query
1586 .get_all_terms()
1587 .iter()
1588 .map(|t| t.as_str())
1589 .collect::<Vec<_>>()
1590 .join(" ");
1591 let query =
1592 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25);
1593 score::sort_documents(&query, filtered_docs)
1594 } else {
1595 let query = Query::new(&search_query.search_term.to_string())
1597 .name_scorer(score::QueryScorer::BM25);
1598 score::sort_documents(&query, documents)
1599 };
1600 let total_length = documents.len();
1601 let mut docs_ranked = Vec::new();
1602 for (idx, doc) in documents.iter().enumerate() {
1603 let mut document: terraphim_types::Document = doc.clone();
1604 let rank = (total_length - idx).try_into().unwrap();
1605 document.rank = Some(rank);
1606 docs_ranked.push(document);
1607 }
1608
1609 #[cfg(feature = "openrouter")]
1611 if role.has_llm_config() && role.llm_auto_summarize {
1612 log::debug!(
1613 "Applying OpenRouter AI summarization to {} BM25 search results for role '{}'",
1614 docs_ranked.len(),
1615 role.name
1616 );
1617 docs_ranked = self
1618 .enhance_descriptions_with_ai(docs_ranked, &role)
1619 .await?;
1620 } else {
1621 log::debug!(
1623 "Applying LLM AI summarization to {} BM25 search results for role '{}'",
1624 docs_ranked.len(),
1625 role.name
1626 );
1627 docs_ranked = self
1628 .enhance_descriptions_with_ai(docs_ranked, &role)
1629 .await?;
1630 }
1631
1632 if role.terraphim_it {
1634 log::info!(
1635 "🧠 Applying KG preprocessing to {} BM25 search results for role '{}'",
1636 docs_ranked.len(),
1637 role.name
1638 );
1639 let mut processed_docs = Vec::new();
1640 let mut total_kg_terms = 0;
1641 let mut docs_with_kg_links = 0;
1642
1643 for document in docs_ranked {
1644 let original_body_len = document.body.len();
1645 let processed_doc =
1646 self.preprocess_document_content(document, &role).await?;
1647
1648 let new_body_len = processed_doc.body.len();
1650 if new_body_len > original_body_len {
1651 docs_with_kg_links += 1;
1652 let estimated_links = (new_body_len - original_body_len) / 17;
1653 total_kg_terms += estimated_links;
1654 }
1655
1656 processed_docs.push(processed_doc);
1657 }
1658
1659 log::info!(
1660 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1661 processed_docs.len(),
1662 docs_with_kg_links,
1663 total_kg_terms
1664 );
1665 Ok(processed_docs)
1666 } else {
1667 Ok(docs_ranked)
1668 }
1669 }
1670 RelevanceFunction::BM25F => {
1671 log::debug!("Searching haystack with BM25F scorer");
1672
1673 let documents = index.get_all_documents();
1674
1675 log::debug!("Sorting documents by BM25F relevance");
1676
1677 let documents = if search_query.is_multi_term_query() {
1678 let filtered_docs = self
1680 .apply_logical_operators_to_documents(search_query, documents)
1681 .await?;
1682 let combined_query_string = search_query
1684 .get_all_terms()
1685 .iter()
1686 .map(|t| t.as_str())
1687 .collect::<Vec<_>>()
1688 .join(" ");
1689 let query =
1690 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25F);
1691 score::sort_documents(&query, filtered_docs)
1692 } else {
1693 let query = Query::new(&search_query.search_term.to_string())
1695 .name_scorer(score::QueryScorer::BM25F);
1696 score::sort_documents(&query, documents)
1697 };
1698 let total_length = documents.len();
1699 let mut docs_ranked = Vec::new();
1700 for (idx, doc) in documents.iter().enumerate() {
1701 let mut document: terraphim_types::Document = doc.clone();
1702 let rank = (total_length - idx).try_into().unwrap();
1703 document.rank = Some(rank);
1704 docs_ranked.push(document);
1705 }
1706
1707 #[cfg(feature = "openrouter")]
1709 if role.has_llm_config() && role.llm_auto_summarize {
1710 log::debug!(
1711 "Applying OpenRouter AI summarization to {} BM25F search results for role '{}'",
1712 docs_ranked.len(),
1713 role.name
1714 );
1715 docs_ranked = self
1716 .enhance_descriptions_with_ai(docs_ranked, &role)
1717 .await?;
1718 } else {
1719 log::debug!(
1721 "Applying LLM AI summarization to {} BM25F search results for role '{}'",
1722 docs_ranked.len(),
1723 role.name
1724 );
1725 docs_ranked = self
1726 .enhance_descriptions_with_ai(docs_ranked, &role)
1727 .await?;
1728 }
1729
1730 if role.terraphim_it {
1732 log::info!(
1733 "🧠 Applying KG preprocessing to {} BM25F search results for role '{}'",
1734 docs_ranked.len(),
1735 role.name
1736 );
1737 let mut processed_docs = Vec::new();
1738 let mut total_kg_terms = 0;
1739 let mut docs_with_kg_links = 0;
1740
1741 for document in docs_ranked {
1742 let original_body_len = document.body.len();
1743 let processed_doc =
1744 self.preprocess_document_content(document, &role).await?;
1745
1746 let new_body_len = processed_doc.body.len();
1748 if new_body_len > original_body_len {
1749 docs_with_kg_links += 1;
1750 let estimated_links = (new_body_len - original_body_len) / 17;
1751 total_kg_terms += estimated_links;
1752 }
1753
1754 processed_docs.push(processed_doc);
1755 }
1756
1757 log::info!(
1758 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1759 processed_docs.len(),
1760 docs_with_kg_links,
1761 total_kg_terms
1762 );
1763 Ok(processed_docs)
1764 } else {
1765 Ok(docs_ranked)
1766 }
1767 }
1768 RelevanceFunction::BM25Plus => {
1769 log::debug!("Searching haystack with BM25Plus scorer");
1770
1771 let documents = index.get_all_documents();
1772
1773 log::debug!("Sorting documents by BM25Plus relevance");
1774
1775 let documents = if search_query.is_multi_term_query() {
1776 let filtered_docs = self
1778 .apply_logical_operators_to_documents(search_query, documents)
1779 .await?;
1780 let combined_query_string = search_query
1782 .get_all_terms()
1783 .iter()
1784 .map(|t| t.as_str())
1785 .collect::<Vec<_>>()
1786 .join(" ");
1787 let query = Query::new(&combined_query_string)
1788 .name_scorer(score::QueryScorer::BM25Plus);
1789 score::sort_documents(&query, filtered_docs)
1790 } else {
1791 let query = Query::new(&search_query.search_term.to_string())
1793 .name_scorer(score::QueryScorer::BM25Plus);
1794 score::sort_documents(&query, documents)
1795 };
1796 let total_length = documents.len();
1797 let mut docs_ranked = Vec::new();
1798 for (idx, doc) in documents.iter().enumerate() {
1799 let mut document: terraphim_types::Document = doc.clone();
1800 let rank = (total_length - idx).try_into().unwrap();
1801 document.rank = Some(rank);
1802 docs_ranked.push(document);
1803 }
1804
1805 #[cfg(feature = "openrouter")]
1807 if role.has_llm_config() && role.llm_auto_summarize {
1808 log::debug!(
1809 "Applying OpenRouter AI summarization to {} BM25Plus search results for role '{}'",
1810 docs_ranked.len(),
1811 role.name
1812 );
1813 docs_ranked = self
1814 .enhance_descriptions_with_ai(docs_ranked, &role)
1815 .await?;
1816 }
1817
1818 if role.terraphim_it {
1820 log::info!(
1821 "🧠 Applying KG preprocessing to {} BM25Plus search results for role '{}'",
1822 docs_ranked.len(),
1823 role.name
1824 );
1825 let mut processed_docs = Vec::new();
1826 let mut total_kg_terms = 0;
1827 let mut docs_with_kg_links = 0;
1828
1829 for document in docs_ranked {
1830 let original_body_len = document.body.len();
1831 let processed_doc =
1832 self.preprocess_document_content(document, &role).await?;
1833
1834 let new_body_len = processed_doc.body.len();
1836 if new_body_len > original_body_len {
1837 docs_with_kg_links += 1;
1838 let estimated_links = (new_body_len - original_body_len) / 17;
1839 total_kg_terms += estimated_links;
1840 }
1841
1842 processed_docs.push(processed_doc);
1843 }
1844
1845 log::info!(
1846 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1847 processed_docs.len(),
1848 docs_with_kg_links,
1849 total_kg_terms
1850 );
1851 Ok(processed_docs)
1852 } else {
1853 Ok(docs_ranked)
1854 }
1855 }
1856 RelevanceFunction::TerraphimGraph => {
1857 eprintln!("🧠 TerraphimGraph search initiated for role: {}", role.name);
1858 self.build_thesaurus(search_query).await?;
1859 let _thesaurus = self.ensure_thesaurus_loaded(&role.name).await?;
1860 let scored_index_docs: Vec<IndexedDocument> = self
1861 .config_state
1862 .search_indexed_documents(search_query, &role)
1863 .await;
1864
1865 log::debug!(
1866 "TerraphimGraph search found {} indexed documents",
1867 scored_index_docs.len()
1868 );
1869
1870 log::debug!("Ranking documents with thesaurus");
1873 let mut documents = index.get_documents(scored_index_docs.clone());
1874
1875 let all_haystack_docs = index.get_all_documents();
1878 log::debug!(
1879 "Found {} total documents from haystacks, checking which need indexing",
1880 all_haystack_docs.len()
1881 );
1882 let mut need_reindexing = false;
1883
1884 if let Some(rolegraph_sync) = self.config_state.roles.get(&role.name) {
1885 let mut rolegraph = rolegraph_sync.lock().await;
1886 let mut newly_indexed = 0;
1887
1888 for doc in &all_haystack_docs {
1889 if !rolegraph.has_document(&doc.id) && !doc.body.is_empty() {
1891 log::debug!(
1892 "Indexing new document '{}' into rolegraph for TerraphimGraph search",
1893 doc.id
1894 );
1895 rolegraph.insert_document(&doc.id, doc.clone());
1896
1897 drop(rolegraph);
1900 if let Err(e) = doc.save().await {
1901 log::warn!(
1902 "Failed to save document '{}' to persistence: {}",
1903 doc.id,
1904 e
1905 );
1906 } else {
1907 log::debug!(
1908 "Successfully saved document '{}' to persistence",
1909 doc.id
1910 );
1911 }
1912 rolegraph = rolegraph_sync.lock().await;
1914
1915 newly_indexed += 1;
1916 }
1917 }
1918
1919 if newly_indexed > 0 {
1920 log::info!(
1921 "✅ Indexed {} new documents into rolegraph for role '{}'",
1922 newly_indexed,
1923 role.name
1924 );
1925 log::debug!(
1926 "RoleGraph now has {} nodes, {} edges, {} documents",
1927 rolegraph.get_node_count(),
1928 rolegraph.get_edge_count(),
1929 rolegraph.get_document_count()
1930 );
1931 need_reindexing = true; }
1933 }
1934
1935 let mut documents_with_content = Vec::new();
1938
1939 for mut document in documents {
1940 if document.body.is_empty() {
1942 log::debug!(
1943 "Document '{}' has empty body, attempting to load from persistence",
1944 document.id
1945 );
1946
1947 let mut full_doc = Document::new(document.id.clone());
1949 match full_doc.load().await {
1950 Ok(loaded_doc) => {
1951 if !loaded_doc.body.is_empty() {
1952 log::info!(
1953 "✅ Loaded body content for document '{}' from persistence",
1954 document.id
1955 );
1956 document.body = loaded_doc.body.clone();
1957 if loaded_doc.description.is_some() {
1958 document.description = loaded_doc.description.clone();
1959 }
1960
1961 if let Some(rolegraph_sync) =
1963 self.config_state.roles.get(&role.name)
1964 {
1965 let mut rolegraph = rolegraph_sync.lock().await;
1966 rolegraph.insert_document(&document.id, loaded_doc);
1967 need_reindexing = true;
1968 log::debug!(
1969 "Re-indexed document '{}' into rolegraph with content",
1970 document.id
1971 );
1972 }
1973 } else {
1974 log::warn!(
1975 "Document '{}' still has empty body after loading from persistence",
1976 document.id
1977 );
1978 }
1979 }
1980 Err(e) => {
1981 log::warn!(
1982 "Failed to load document '{}' from persistence: {}",
1983 document.id,
1984 e
1985 );
1986
1987 if document.url.starts_with('/')
1989 || document.url.starts_with("docs/")
1990 {
1991 match tokio::fs::read_to_string(&document.url).await {
1992 Ok(content) => {
1993 log::info!(
1994 "✅ Loaded content for '{}' from file: {}",
1995 document.id,
1996 document.url
1997 );
1998 document.body = content.clone();
1999
2000 let full_doc = Document {
2002 id: document.id.clone(),
2003 title: document.title.clone(),
2004 body: content,
2005 url: document.url.clone(),
2006 description: document.description.clone(),
2007 summarization: document.summarization.clone(),
2008 stub: None,
2009 tags: document.tags.clone(),
2010 rank: document.rank,
2011 source_haystack: document.source_haystack.clone(),
2012 };
2013
2014 if let Err(e) = full_doc.save().await {
2016 log::warn!(
2017 "Failed to save document '{}' to persistence: {}",
2018 document.id,
2019 e
2020 );
2021 }
2022
2023 if let Some(rolegraph_sync) =
2025 self.config_state.roles.get(&role.name)
2026 {
2027 let mut rolegraph = rolegraph_sync.lock().await;
2028 rolegraph.insert_document(&document.id, full_doc);
2029 need_reindexing = true;
2030 log::debug!(
2031 "Re-indexed document '{}' into rolegraph from file",
2032 document.id
2033 );
2034 }
2035 }
2036 Err(file_e) => {
2037 log::warn!(
2038 "Failed to read file '{}' for document '{}': {}",
2039 document.url,
2040 document.id,
2041 file_e
2042 );
2043 }
2044 }
2045 }
2046 }
2047 }
2048 }
2049 documents_with_content.push(document);
2050 }
2051
2052 documents = documents_with_content;
2053
2054 if need_reindexing {
2055 log::info!("🔄 Re-running TerraphimGraph search after indexing new documents");
2056
2057 let updated_scored_docs: Vec<IndexedDocument> = self
2059 .config_state
2060 .search_indexed_documents(search_query, &role)
2061 .await;
2062
2063 if !updated_scored_docs.is_empty() {
2064 log::debug!(
2065 "✅ Updated rolegraph search found {} documents",
2066 updated_scored_docs.len()
2067 );
2068 let updated_documents = index.get_documents(updated_scored_docs);
2070 if !updated_documents.is_empty() {
2071 documents = updated_documents;
2072 }
2073 }
2074 }
2075
2076 if !documents.is_empty() {
2078 log::debug!(
2079 "Applying TF-IDF scoring to {} documents for enhanced ranking",
2080 documents.len()
2081 );
2082
2083 use crate::score::bm25_additional::TFIDFScorer;
2084 let mut tfidf_scorer = TFIDFScorer::new();
2085 tfidf_scorer.initialize(&documents);
2086
2087 let query_text = &search_query.search_term.to_string();
2089 for document in &mut documents {
2090 let tfidf_score = tfidf_scorer.score(query_text, document);
2091 if let Some(rank) = document.rank {
2093 document.rank = Some(rank + (tfidf_score * 0.3) as u64);
2094 } else {
2096 document.rank = Some((tfidf_score * 10.0) as u64); }
2098 }
2099
2100 documents.sort_by(|a, b| b.rank.unwrap_or(0).cmp(&a.rank.unwrap_or(0)));
2102
2103 log::debug!("TF-IDF scoring applied successfully");
2104 }
2105
2106 for document in &mut documents {
2108 if document.id.starts_with("http://") || document.id.starts_with("https://") {
2109 log::debug!(
2111 "Processing Atomic Data document '{}' (URL: {})",
2112 document.title,
2113 document.id
2114 );
2115
2116 let mut placeholder = Document {
2118 id: document.id.clone(),
2119 ..Default::default()
2120 };
2121 match placeholder.load().await {
2122 Ok(persisted_doc) => {
2123 log::debug!(
2125 "Found cached Atomic Data document '{}' in persistence",
2126 document.title
2127 );
2128 if let Some(better_description) = persisted_doc.description {
2129 document.description = Some(better_description);
2130 }
2131 if !persisted_doc.body.is_empty() && !role.terraphim_it {
2135 log::debug!(
2136 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
2137 document.title,
2138 role.name,
2139 role.terraphim_it
2140 );
2141 document.body = persisted_doc.body;
2142 } else if role.terraphim_it {
2143 log::debug!(
2144 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
2145 document.title,
2146 role.name
2147 );
2148 }
2149 }
2150 Err(_) => {
2151 log::debug!(
2153 "Caching Atomic Data document '{}' to persistence for future queries",
2154 document.title
2155 );
2156
2157 let doc_to_save = document.clone();
2159 tokio::spawn(async move {
2160 if let Err(e) = doc_to_save.save().await {
2161 log::warn!(
2162 "Failed to cache Atomic Data document '{}': {}",
2163 doc_to_save.title,
2164 e
2165 );
2166 } else {
2167 log::debug!(
2168 "Successfully cached Atomic Data document '{}'",
2169 doc_to_save.title
2170 );
2171 }
2172 });
2173 }
2174 }
2175 } else {
2176 let mut placeholder = Document {
2178 id: document.id.clone(),
2179 ..Default::default()
2180 };
2181 if let Ok(persisted_doc) = placeholder.load().await {
2182 if let Some(better_description) = persisted_doc.description {
2183 log::debug!(
2184 "Replaced ripgrep description for '{}' with persistence description",
2185 document.title
2186 );
2187 document.description = Some(better_description);
2188 }
2189 } else {
2190 let normalized_id = normalize_filename_to_id(&document.title);
2193
2194 let mut normalized_placeholder = Document {
2195 id: normalized_id.clone(),
2196 ..Default::default()
2197 };
2198 if let Ok(persisted_doc) = normalized_placeholder.load().await {
2199 if let Some(better_description) = persisted_doc.description {
2200 log::debug!(
2201 "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
2202 document.title,
2203 normalized_id
2204 );
2205 document.description = Some(better_description);
2206 }
2207 } else {
2208 let normalized_id_with_md = format!("{}md", normalized_id);
2210 let mut md_placeholder = Document {
2211 id: normalized_id_with_md.clone(),
2212 ..Default::default()
2213 };
2214 if let Ok(persisted_doc) = md_placeholder.load().await {
2215 if let Some(better_description) = persisted_doc.description {
2216 log::debug!(
2217 "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
2218 document.title,
2219 normalized_id_with_md
2220 );
2221 document.description = Some(better_description);
2222 }
2223 } else {
2224 log::debug!(
2225 "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
2226 document.title,
2227 document.id,
2228 normalized_id,
2229 normalized_id_with_md
2230 );
2231 }
2232 }
2233 }
2234 }
2235 }
2236
2237 #[cfg(feature = "openrouter")]
2239 if role.has_llm_config() {
2240 log::debug!(
2241 "Applying OpenRouter AI summarization to {} search results for role '{}'",
2242 documents.len(),
2243 role.name
2244 );
2245 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2246 } else {
2247 log::debug!(
2249 "Applying LLM AI summarization to {} search results for role '{}'",
2250 documents.len(),
2251 role.name
2252 );
2253 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2254 }
2255
2256 if role.terraphim_it {
2258 log::debug!(
2259 "Applying KG preprocessing to {} search results for role '{}'",
2260 documents.len(),
2261 role.name
2262 );
2263 let mut processed_docs = Vec::new();
2264 for document in documents {
2265 let processed_doc =
2266 self.preprocess_document_content(document, &role).await?;
2267 processed_docs.push(processed_doc);
2268 }
2269 Ok(processed_docs)
2270 } else {
2271 Ok(documents)
2272 }
2273 }
2274 }
2275 }
2276
2277 fn is_hash_based_id(id: &str) -> bool {
2279 id.len() == 16 && id.chars().all(|c| c.is_ascii_hexdigit())
2280 }
2281
2282 pub async fn find_documents_for_kg_term(
2293 &mut self,
2294 role_name: &RoleName,
2295 term: &str,
2296 ) -> Result<Vec<Document>> {
2297 log::debug!(
2298 "Finding documents for KG term '{}' in role '{}'",
2299 term,
2300 role_name
2301 );
2302
2303 let thesaurus = self.ensure_thesaurus_loaded(role_name).await?;
2305
2306 let role = self.config_state.get_role(role_name).await.ok_or_else(|| {
2308 ServiceError::Config(format!("Role '{}' not found in config", role_name))
2309 })?;
2310
2311 let mut documents = Vec::new();
2312
2313 if let Some(kg_config) = &role.kg {
2317 log::debug!("Found KG config for role");
2318 if let Some(kg_local) = &kg_config.knowledge_graph_local {
2319 let mut potential_concepts = vec![term.to_string()];
2320
2321 log::debug!("Checking thesaurus for term '{}'", term);
2323
2324 let normalized_search_term =
2326 terraphim_types::NormalizedTermValue::new(term.to_string());
2327
2328 if let Some(root_concept) = thesaurus.get(&normalized_search_term) {
2330 log::debug!("Found root concept for '{}': {:?}", term, root_concept);
2331
2332 let root_concept_name = root_concept.value.as_str();
2334
2335 let concept_name = if let Some(url) = &root_concept.url {
2337 url.split('/')
2338 .next_back()
2339 .and_then(|s| s.strip_suffix(".md"))
2340 .unwrap_or(root_concept_name)
2341 } else {
2342 root_concept_name
2343 };
2344
2345 if !potential_concepts.contains(&concept_name.to_string()) {
2346 potential_concepts.push(concept_name.to_string());
2347 log::debug!(
2348 "Added concept from thesaurus: {} (root: {})",
2349 concept_name,
2350 root_concept_name
2351 );
2352 }
2353 } else {
2354 log::debug!("No direct mapping found for '{}' in thesaurus", term);
2355 }
2356
2357 log::debug!(
2358 "Trying {} potential concepts: {:?}",
2359 potential_concepts.len(),
2360 potential_concepts
2361 );
2362
2363 for concept in potential_concepts {
2365 let potential_kg_file = kg_local.path.join(format!("{}.md", concept));
2366 log::debug!("Looking for KG definition file: {:?}", potential_kg_file);
2367
2368 if potential_kg_file.exists() {
2369 log::info!("Found KG definition file: {:?}", potential_kg_file);
2370
2371 let file_path = potential_kg_file.to_string_lossy().to_string();
2373 if documents.iter().any(|d: &Document| d.url == file_path) {
2374 log::debug!("Skipping duplicate KG document: {}", file_path);
2375 continue;
2376 }
2377
2378 match std::fs::read_to_string(&potential_kg_file) {
2381 Ok(content) => {
2382 let mut kg_doc =
2383 Document::new(potential_kg_file.to_string_lossy().to_string());
2384 kg_doc.url = potential_kg_file.to_string_lossy().to_string();
2385 kg_doc.body = content.clone();
2386
2387 let title = content
2389 .lines()
2390 .find(|line| line.starts_with("# "))
2391 .map(|line| line.trim_start_matches("# ").trim())
2392 .unwrap_or(&concept)
2393 .to_string();
2394 kg_doc.title = title;
2395
2396 log::debug!(
2397 "Successfully loaded KG definition document: {}",
2398 kg_doc.title
2399 );
2400 documents.push(kg_doc);
2401
2402 break;
2404 }
2405 Err(e) => {
2406 log::warn!(
2407 "Failed to read KG definition file '{}': {}",
2408 potential_kg_file.display(),
2409 e
2410 );
2411 }
2412 }
2413 } else {
2414 log::debug!("KG definition file not found: {:?}", potential_kg_file);
2415 }
2416 }
2417 } else {
2418 log::debug!("No KG local config found");
2419 }
2420 } else {
2421 log::debug!("No KG config found for role");
2422 }
2423
2424 let rolegraph_sync = self
2426 .config_state
2427 .roles
2428 .get(role_name)
2429 .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))?;
2430
2431 let rolegraph = rolegraph_sync.lock().await;
2432 let document_ids = rolegraph.find_document_ids_for_term(term);
2433 drop(rolegraph); log::debug!(
2436 "Found {} document IDs from rolegraph for term '{}'",
2437 document_ids.len(),
2438 term
2439 );
2440
2441 for doc_id in &document_ids {
2443 if documents
2445 .iter()
2446 .any(|d| d.id == *doc_id || d.url == *doc_id)
2447 {
2448 log::debug!("Skipping duplicate document from rolegraph: {}", doc_id);
2449 continue;
2450 }
2451
2452 if doc_id.starts_with("http://") || doc_id.starts_with("https://") {
2455 log::debug!("Loading Atomic Data document '{}' from persistence", doc_id);
2457 let mut placeholder = Document {
2458 id: doc_id.clone(),
2459 ..Default::default()
2460 };
2461 match placeholder.load().await {
2462 Ok(loaded_doc) => {
2463 log::debug!(
2464 "Found cached Atomic Data document '{}' in persistence",
2465 doc_id
2466 );
2467 documents.push(loaded_doc);
2468 }
2469 Err(_) => {
2470 log::warn!(
2471 "Atomic Data document '{}' not found in persistence - this may indicate the document hasn't been cached yet",
2472 doc_id
2473 );
2474 }
2477 }
2478 } else {
2479 let mut doc = Document::new(doc_id.clone());
2481 match doc.load().await {
2482 Ok(loaded_doc) => {
2483 documents.push(loaded_doc);
2484 log::trace!("Successfully loaded local document: {}", doc_id);
2485 }
2486 Err(e) => {
2487 log::warn!("Failed to load local document '{}': {}", doc_id, e);
2488
2489 if Self::is_hash_based_id(doc_id) {
2491 log::debug!(
2492 "Document ID '{}' appears to be hash-based (legacy document), skipping for now",
2493 doc_id
2494 );
2495 log::info!(
2496 "💡 Hash-based document IDs are deprecated. This document will be re-indexed with normalized IDs on next haystack search."
2497 );
2498 }
2501
2502 }
2504 }
2505 }
2506 }
2507
2508 if role.terraphim_it {
2510 log::info!(
2511 "🧠 Applying KG preprocessing to {} KG term documents for role '{}' (terraphim_it enabled)",
2512 documents.len(),
2513 role_name
2514 );
2515 let mut processed_documents = Vec::new();
2516 let mut total_kg_terms = 0;
2517 let mut docs_with_kg_links = 0;
2518
2519 for document in documents {
2520 let original_body_len = document.body.len();
2521 let processed_doc = self.preprocess_document_content(document, &role).await?;
2522
2523 let new_body_len = processed_doc.body.len();
2525 if new_body_len > original_body_len {
2526 docs_with_kg_links += 1;
2527 let estimated_links = (new_body_len - original_body_len) / 17;
2528 total_kg_terms += estimated_links;
2529 }
2530
2531 processed_documents.push(processed_doc);
2532 }
2533
2534 log::info!(
2535 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2536 processed_documents.len(),
2537 docs_with_kg_links,
2538 total_kg_terms
2539 );
2540 documents = processed_documents;
2541 } else {
2542 log::info!(
2543 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing for {} documents",
2544 role_name,
2545 documents.len()
2546 );
2547 }
2548
2549 let total_length = documents.len();
2552 for (idx, doc) in documents.iter_mut().enumerate() {
2553 let rank = (total_length - idx) as u64;
2554 doc.rank = Some(rank);
2555 log::trace!("Assigned rank {} to document '{}'", rank, doc.title);
2556 }
2557
2558 log::debug!(
2559 "Successfully loaded and processed {} documents for term '{}', ranks assigned from {} to 1",
2560 documents.len(),
2561 term,
2562 total_length
2563 );
2564 Ok(documents)
2565 }
2566
2567 #[cfg(feature = "openrouter")]
2584 pub async fn generate_document_summary(
2585 &self,
2586 document: &Document,
2587 api_key: &str,
2588 model: &str,
2589 max_length: usize,
2590 ) -> Result<String> {
2591 use crate::openrouter::OpenRouterService;
2592
2593 log::debug!(
2594 "Generating summary for document '{}' using model '{}'",
2595 document.id,
2596 model
2597 );
2598
2599 let openrouter_service =
2601 OpenRouterService::new(api_key, model).map_err(ServiceError::OpenRouter)?;
2602
2603 let content = &document.body;
2605
2606 if content.trim().is_empty() {
2607 return Err(ServiceError::Config(
2608 "Document body is empty, cannot generate summary".to_string(),
2609 ));
2610 }
2611
2612 let summary = openrouter_service
2614 .generate_summary(content, max_length)
2615 .await
2616 .map_err(ServiceError::OpenRouter)?;
2617
2618 log::info!(
2619 "Generated {}-character summary for document '{}' using model '{}'",
2620 summary.len(),
2621 document.id,
2622 model
2623 );
2624
2625 Ok(summary)
2626 }
2627
2628 #[cfg(not(feature = "openrouter"))]
2630 pub async fn generate_document_summary(
2631 &self,
2632 _document: &Document,
2633 _api_key: &str,
2634 _model: &str,
2635 _max_length: usize,
2636 ) -> Result<String> {
2637 Err(ServiceError::Config(
2638 "OpenRouter feature not enabled during compilation".to_string(),
2639 ))
2640 }
2641
2642 pub async fn fetch_config(&self) -> terraphim_config::Config {
2644 let current_config = self.config_state.config.lock().await;
2645 current_config.clone()
2646 }
2647
2648 #[cfg(test)]
2650 pub async fn get_role(&self, role_name: &RoleName) -> Result<Role> {
2651 let config = self.config_state.config.lock().await;
2652 config
2653 .roles
2654 .get(role_name)
2655 .cloned()
2656 .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))
2657 }
2658
2659 pub async fn update_config(
2664 &self,
2665 config: terraphim_config::Config,
2666 ) -> Result<terraphim_config::Config> {
2667 let mut current_config = self.config_state.config.lock().await;
2668 *current_config = config.clone();
2669 current_config.save().await?;
2670 log::info!("Config updated");
2671 Ok(config)
2672 }
2673
2674 pub async fn update_selected_role(
2677 &self,
2678 role_name: terraphim_types::RoleName,
2679 ) -> Result<terraphim_config::Config> {
2680 let mut current_config = self.config_state.config.lock().await;
2681
2682 if !current_config.roles.contains_key(&role_name) {
2684 return Err(ServiceError::Config(format!(
2685 "Role `{}` not found in config",
2686 role_name
2687 )));
2688 }
2689
2690 current_config.selected_role = role_name.clone();
2691 current_config.save().await?;
2692
2693 if let Some(role) = current_config.roles.get(&role_name) {
2695 if role.terraphim_it {
2696 log::info!(
2697 "🎯 Selected role '{}' → terraphim_it: ✅ ENABLED (KG preprocessing will be applied)",
2698 role_name
2699 );
2700 if role.kg.is_some() {
2701 log::info!("📚 KG configuration: Available for role '{}'", role_name);
2702 } else {
2703 log::warn!(
2704 "⚠️ KG configuration: Missing for role '{}' (terraphim_it enabled but no KG)",
2705 role_name
2706 );
2707 }
2708 } else {
2709 log::info!(
2710 "🎯 Selected role '{}' → terraphim_it: ❌ DISABLED (KG preprocessing skipped)",
2711 role_name
2712 );
2713 }
2714 } else {
2715 log::info!("🎯 Selected role updated to '{}'", role_name);
2716 }
2717
2718 Ok(current_config.clone())
2719 }
2720
2721 fn highlight_search_terms(content: &str, search_query: &SearchQuery) -> String {
2726 let mut highlighted_content = content.to_string();
2727
2728 let terms = search_query.get_all_terms();
2730
2731 let mut sorted_terms: Vec<&str> = terms.iter().map(|t| t.as_str()).collect();
2733 sorted_terms.sort_by_key(|term| std::cmp::Reverse(term.len()));
2734
2735 for term in sorted_terms {
2736 if term.trim().is_empty() {
2737 continue;
2738 }
2739
2740 let escaped_term = regex::escape(term);
2743
2744 if let Ok(regex) = regex::RegexBuilder::new(&escaped_term)
2745 .case_insensitive(true)
2746 .build()
2747 {
2748 let highlight_open = "<mark class=\"search-highlight\">";
2751 let highlight_close = "</mark>";
2752
2753 highlighted_content = regex
2754 .replace_all(
2755 &highlighted_content,
2756 format!("{}{}{}", highlight_open, "$0", highlight_close),
2757 )
2758 .to_string();
2759 }
2760 }
2761
2762 highlighted_content
2763 }
2764}
2765
2766#[cfg(test)]
2767mod tests {
2768 use super::*;
2769 use std::path::PathBuf;
2770 use terraphim_config::ConfigBuilder;
2771 use terraphim_types::NormalizedTermValue;
2772
2773 #[tokio::test]
2774 async fn test_get_config() {
2775 let mut config = ConfigBuilder::new()
2776 .build_default_desktop()
2777 .build()
2778 .unwrap();
2779 let config_state = ConfigState::new(&mut config).await.unwrap();
2780 let service = TerraphimService::new(config_state);
2781 let fetched_config = service.fetch_config().await;
2782 assert_eq!(fetched_config.id, terraphim_config::ConfigId::Desktop);
2783 }
2784
2785 #[tokio::test]
2786 async fn test_search_documents_selected_role() {
2787 let mut config = ConfigBuilder::new()
2788 .build_default_desktop()
2789 .build()
2790 .unwrap();
2791 let config_state = ConfigState::new(&mut config).await.unwrap();
2792 let mut service = TerraphimService::new(config_state);
2793 let search_term = NormalizedTermValue::new("terraphim".to_string());
2794 let documents = service
2795 .search_documents_selected_role(&search_term)
2796 .await
2797 .unwrap();
2798 assert!(documents.is_empty() || !documents.is_empty()); }
2800
2801 #[tokio::test]
2802 async fn test_ensure_thesaurus_loaded_terraphim_engineer() {
2803 let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
2805 let kg_path = project_root.join("docs/src/kg");
2806
2807 if !kg_path.exists() {
2809 println!("⚠️ KG directory not found at {:?}, skipping test", kg_path);
2810 return;
2811 }
2812
2813 let mut config = ConfigBuilder::new()
2814 .build_default_desktop()
2815 .build()
2816 .unwrap();
2817
2818 if let Some(terr_eng_role) = config.roles.get_mut(&"Terraphim Engineer".into()) {
2820 if let Some(kg) = &mut terr_eng_role.kg {
2821 if let Some(kg_local) = &mut kg.knowledge_graph_local {
2822 kg_local.path = kg_path;
2823 }
2824 }
2825 }
2826
2827 let config_state = ConfigState::new(&mut config).await.unwrap();
2828 let mut service = TerraphimService::new(config_state);
2829
2830 let role_name = RoleName::new("Terraphim Engineer");
2831 let thesaurus_result = service.ensure_thesaurus_loaded(&role_name).await;
2832
2833 match thesaurus_result {
2834 Ok(thesaurus) => {
2835 println!(
2836 "✅ Successfully loaded thesaurus with {} entries",
2837 thesaurus.len()
2838 );
2839 assert!(!thesaurus.is_empty(), "Thesaurus should not be empty");
2841
2842 let has_terraphim = (&thesaurus)
2844 .into_iter()
2845 .any(|(term, _)| term.as_str().to_lowercase().contains("terraphim"));
2846 let has_graph = (&thesaurus)
2847 .into_iter()
2848 .any(|(term, _)| term.as_str().to_lowercase().contains("graph"));
2849
2850 println!(" Contains 'terraphim': {}", has_terraphim);
2851 println!(" Contains 'graph': {}", has_graph);
2852
2853 assert!(
2855 has_terraphim || has_graph,
2856 "Thesaurus should contain expected terms"
2857 );
2858 }
2859 Err(e) => {
2860 println!("❌ Failed to load thesaurus: {:?}", e);
2861 }
2864 }
2865 }
2866
2867 #[tokio::test]
2868 #[ignore = "Requires local KG fixtures at ~/.terraphim/kg"]
2869 async fn test_config_building_with_local_kg() {
2870 let mut config = ConfigBuilder::new()
2872 .build_default_desktop()
2873 .build()
2874 .unwrap();
2875 let config_state_result = ConfigState::new(&mut config).await;
2876
2877 match config_state_result {
2878 Ok(config_state) => {
2879 println!("✅ Successfully built config state");
2880 assert!(
2882 !config_state.roles.is_empty(),
2883 "Config state should have roles"
2884 );
2885
2886 let terraphim_engineer_role = RoleName::new("Terraphim Engineer");
2888 let has_terraphim_engineer =
2889 config_state.roles.contains_key(&terraphim_engineer_role);
2890 println!(" Has Terraphim Engineer role: {}", has_terraphim_engineer);
2891
2892 assert!(
2894 has_terraphim_engineer,
2895 "Terraphim Engineer role should exist"
2896 );
2897 }
2898 Err(e) => {
2899 println!("❌ Failed to build config state: {:?}", e);
2900 }
2903 }
2904 }
2905
2906 #[tokio::test]
2907 async fn test_atomic_data_persistence_skip() {
2908 use ahash::AHashMap;
2909 use terraphim_config::{Config, Haystack, Role, ServiceType};
2910 use terraphim_persistence::DeviceStorage;
2911 use terraphim_types::{NormalizedTermValue, RoleName, SearchQuery};
2912
2913 DeviceStorage::init_memory_only().await.unwrap();
2915
2916 let mut config = Config::default();
2918 let role_name = RoleName::new("test_role");
2919 let role = Role {
2920 shortname: None,
2921 name: "test_role".into(),
2922 haystacks: vec![Haystack {
2923 location: "test".to_string(),
2924 service: ServiceType::Ripgrep,
2925 read_only: false,
2926 atomic_server_secret: None,
2927 extra_parameters: std::collections::HashMap::new(),
2928 fetch_content: false,
2929 }],
2930 kg: None,
2931 terraphim_it: false,
2932 theme: "default".to_string(),
2933 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
2934 llm_enabled: false,
2935 llm_api_key: None,
2936 llm_model: None,
2937 llm_auto_summarize: false,
2938 llm_chat_enabled: false,
2939 llm_chat_system_prompt: None,
2940 llm_chat_model: None,
2941 llm_context_window: None,
2942 extra: AHashMap::new(),
2943 llm_router_enabled: false,
2944 llm_router_config: None,
2945 };
2946 config.roles.insert(role_name.clone(), role);
2947
2948 let config_state = ConfigState::new(&mut config).await.unwrap();
2949 let mut service = TerraphimService::new(config_state);
2950
2951 let search_query = SearchQuery {
2953 search_term: NormalizedTermValue::new("test".to_string()),
2954 search_terms: None,
2955 operator: None,
2956 limit: Some(10),
2957 skip: None,
2958 role: Some(role_name),
2959 };
2960
2961 let result = service.search(&search_query).await;
2964
2965 assert!(result.is_ok(), "Search should complete without errors");
2968 }
2969
2970 #[tokio::test]
2971 async fn test_atomic_data_caching() {
2972 use ahash::AHashMap;
2973 use terraphim_config::{Config, Haystack, Role, ServiceType};
2974 use terraphim_persistence::DeviceStorage;
2975 use terraphim_types::{Document, NormalizedTermValue, RoleName, SearchQuery};
2976
2977 DeviceStorage::init_memory_only().await.unwrap();
2979
2980 let mut config = Config::default();
2982 let role_name = RoleName::new("test_role");
2983 let role = Role {
2984 shortname: None,
2985 name: "test_role".into(),
2986 haystacks: vec![Haystack {
2987 location: "test".to_string(),
2988 service: ServiceType::Ripgrep,
2989 read_only: false,
2990 atomic_server_secret: None,
2991 extra_parameters: std::collections::HashMap::new(),
2992 fetch_content: false,
2993 }],
2994 kg: None,
2995 terraphim_it: false,
2996 theme: "default".to_string(),
2997 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
2998 llm_enabled: false,
2999 llm_api_key: None,
3000 llm_model: None,
3001 llm_auto_summarize: false,
3002 llm_chat_enabled: false,
3003 llm_chat_system_prompt: None,
3004 llm_chat_model: None,
3005 llm_context_window: None,
3006 extra: AHashMap::new(),
3007 llm_router_enabled: false,
3008 llm_router_config: None,
3009 };
3010 config.roles.insert(role_name.clone(), role);
3011
3012 let config_state = ConfigState::new(&mut config).await.unwrap();
3013 let mut service = TerraphimService::new(config_state);
3014
3015 let atomic_doc = Document {
3017 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3018 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3019 title: "Requested Loan Amount ($)".to_string(),
3020 body: "Form field for Requested Loan Amount ($)".to_string(),
3021 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3022 summarization: None,
3023 stub: None,
3024 tags: None,
3025 rank: None,
3026 source_haystack: None,
3027 };
3028
3029 log::info!("Testing Atomic Data document caching...");
3031 match atomic_doc.save().await {
3032 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3033 Err(e) => {
3034 log::error!("❌ Failed to save Atomic Data document: {}", e);
3035 panic!("Atomic Data document save failed");
3036 }
3037 }
3038
3039 let mut placeholder = Document {
3041 id: atomic_doc.id.clone(),
3042 ..Default::default()
3043 };
3044 match placeholder.load().await {
3045 Ok(loaded_doc) => {
3046 log::info!("✅ Successfully loaded Atomic Data document from persistence");
3047 assert_eq!(loaded_doc.title, atomic_doc.title);
3048 assert_eq!(loaded_doc.body, atomic_doc.body);
3049 assert_eq!(loaded_doc.description, atomic_doc.description);
3050 }
3051 Err(e) => {
3052 log::error!(
3053 "❌ Failed to load Atomic Data document from persistence: {}",
3054 e
3055 );
3056 panic!("Atomic Data document load failed");
3057 }
3058 }
3059
3060 let search_query = SearchQuery {
3062 search_term: NormalizedTermValue::new("test".to_string()),
3063 search_terms: None,
3064 operator: None,
3065 limit: Some(10),
3066 skip: None,
3067 role: Some(role_name),
3068 };
3069
3070 let result = service.search(&search_query).await;
3071 assert!(result.is_ok(), "Search should complete without errors");
3072
3073 log::info!("✅ All Atomic Data caching tests passed!");
3074 }
3075
3076 #[tokio::test]
3077 #[ignore = "Requires local KG fixtures at 'test' directory"]
3078 async fn test_kg_term_search_with_atomic_data() {
3079 use ahash::AHashMap;
3080 use std::path::PathBuf;
3081 use terraphim_config::{
3082 Config, Haystack, KnowledgeGraph, KnowledgeGraphLocal, Role, ServiceType,
3083 };
3084 use terraphim_persistence::DeviceStorage;
3085 use terraphim_types::{Document, KnowledgeGraphInputType, RoleName};
3086
3087 DeviceStorage::init_memory_only().await.unwrap();
3089
3090 let mut config = Config::default();
3092 let role_name = RoleName::new("test_kg_role");
3093 let role = Role {
3094 shortname: None,
3095 name: "test_kg_role".into(),
3096 haystacks: vec![Haystack {
3097 location: "test".to_string(),
3098 service: ServiceType::Ripgrep,
3099 read_only: false,
3100 atomic_server_secret: None,
3101 extra_parameters: std::collections::HashMap::new(),
3102 fetch_content: false,
3103 }],
3104 kg: Some(KnowledgeGraph {
3105 automata_path: None,
3106 knowledge_graph_local: Some(KnowledgeGraphLocal {
3107 input_type: KnowledgeGraphInputType::Markdown,
3108 path: PathBuf::from("test"),
3109 }),
3110 public: true,
3111 publish: true,
3112 }),
3113 terraphim_it: true,
3114 theme: "default".to_string(),
3115 relevance_function: terraphim_types::RelevanceFunction::TerraphimGraph,
3116 llm_enabled: false,
3117 llm_api_key: None,
3118 llm_model: None,
3119 llm_auto_summarize: false,
3120 llm_chat_enabled: false,
3121 llm_chat_system_prompt: None,
3122 llm_chat_model: None,
3123 llm_context_window: None,
3124 extra: AHashMap::new(),
3125 llm_router_enabled: false,
3126 llm_router_config: None,
3127 };
3128 config.roles.insert(role_name.clone(), role);
3129
3130 let config_state = ConfigState::new(&mut config).await.unwrap();
3131 let mut service = TerraphimService::new(config_state);
3132
3133 let atomic_doc = Document {
3135 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3136 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3137 title: "Requested Loan Amount ($)".to_string(),
3138 body: "Form field for Requested Loan Amount ($)".to_string(),
3139 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3140 summarization: None,
3141 stub: None,
3142 tags: None,
3143 rank: None,
3144 source_haystack: None,
3145 };
3146
3147 log::info!("Testing KG term search with Atomic Data documents...");
3149 match atomic_doc.save().await {
3150 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3151 Err(e) => {
3152 log::error!("❌ Failed to save Atomic Data document: {}", e);
3153 panic!("Atomic Data document save failed");
3154 }
3155 }
3156
3157 let result = service.find_documents_for_kg_term(&role_name, "test").await;
3161
3162 assert!(
3165 result.is_ok(),
3166 "find_documents_for_kg_term should complete without errors"
3167 );
3168
3169 let documents = result.unwrap();
3170 log::info!(
3171 "✅ KG term search completed successfully, found {} documents",
3172 documents.len()
3173 );
3174
3175 let atomic_doc_id = "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount";
3178 let mut placeholder = Document {
3179 id: atomic_doc_id.to_string(),
3180 ..Default::default()
3181 };
3182
3183 match placeholder.load().await {
3184 Ok(loaded_doc) => {
3185 log::info!(
3186 "✅ Successfully loaded Atomic Data document from persistence in KG term search context"
3187 );
3188 assert_eq!(loaded_doc.title, atomic_doc.title);
3189 assert_eq!(loaded_doc.body, atomic_doc.body);
3190 }
3191 Err(e) => {
3192 log::error!(
3193 "❌ Failed to load Atomic Data document in KG term search context: {}",
3194 e
3195 );
3196 panic!("Atomic Data document load failed in KG term search context");
3197 }
3198 }
3199
3200 log::info!("✅ All KG term search with Atomic Data tests passed!");
3201 }
3202
3203 #[tokio::test]
3204 async fn test_kg_term_search_rank_assignment() -> Result<()> {
3205 use ahash::AHashMap;
3206 use terraphim_config::{Config, Haystack, Role, ServiceType};
3207 use terraphim_persistence::DeviceStorage;
3208 use terraphim_types::{Document, RoleName};
3209
3210 DeviceStorage::init_memory_only().await.unwrap();
3212
3213 let mut config = Config::default();
3215 let role_name = RoleName::new("Test KG Role");
3216 let role = Role {
3217 shortname: Some("test-kg".to_string()),
3218 name: role_name.clone(),
3219 haystacks: vec![Haystack {
3220 location: "test".to_string(),
3221 service: ServiceType::Ripgrep,
3222 read_only: false,
3223 atomic_server_secret: None,
3224 extra_parameters: std::collections::HashMap::new(),
3225 fetch_content: false,
3226 }],
3227 kg: Some(terraphim_config::KnowledgeGraph {
3228 automata_path: Some(terraphim_automata::AutomataPath::local_example()),
3229 knowledge_graph_local: None,
3230 public: false,
3231 publish: false,
3232 }),
3233 terraphim_it: false,
3234 theme: "default".to_string(),
3235 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3236 llm_enabled: false,
3237 llm_api_key: None,
3238 llm_model: None,
3239 llm_auto_summarize: false,
3240 llm_chat_enabled: false,
3241 llm_chat_system_prompt: None,
3242 llm_chat_model: None,
3243 llm_context_window: None,
3244 extra: AHashMap::new(),
3245 llm_router_enabled: false,
3246 llm_router_config: None,
3247 };
3248 config.roles.insert(role_name.clone(), role);
3249
3250 let config_state = ConfigState::new(&mut config).await.unwrap();
3251 let _service = TerraphimService::new(config_state);
3252
3253 let test_documents = vec![
3255 Document {
3256 id: "test-doc-1".to_string(),
3257 title: "First Test Document".to_string(),
3258 body: "This is the first test document body".to_string(),
3259 url: "test://doc1".to_string(),
3260 description: Some("First document description".to_string()),
3261 summarization: None,
3262 stub: None,
3263 tags: Some(vec!["test".to_string(), "first".to_string()]),
3264 rank: None, source_haystack: None,
3266 },
3267 Document {
3268 id: "test-doc-2".to_string(),
3269 title: "Second Test Document".to_string(),
3270 body: "This is the second test document body".to_string(),
3271 url: "test://doc2".to_string(),
3272 description: Some("Second document description".to_string()),
3273 summarization: None,
3274 stub: None,
3275 tags: Some(vec!["test".to_string(), "second".to_string()]),
3276 rank: None, source_haystack: None,
3278 },
3279 Document {
3280 id: "test-doc-3".to_string(),
3281 title: "Third Test Document".to_string(),
3282 body: "This is the third test document body".to_string(),
3283 url: "test://doc3".to_string(),
3284 description: Some("Third document description".to_string()),
3285 summarization: None,
3286 stub: None,
3287 tags: Some(vec!["test".to_string(), "third".to_string()]),
3288 rank: None, source_haystack: None,
3290 },
3291 ];
3292
3293 for doc in &test_documents {
3295 doc.save().await.expect("Failed to save test document");
3296 }
3297
3298 let mut simulated_documents = test_documents.clone();
3304
3305 let total_length = simulated_documents.len();
3307 for (idx, doc) in simulated_documents.iter_mut().enumerate() {
3308 let rank = (total_length - idx) as u64;
3309 doc.rank = Some(rank);
3310 }
3311
3312 assert_eq!(simulated_documents.len(), 3, "Should have 3 test documents");
3314
3315 for doc in &simulated_documents {
3317 assert!(
3318 doc.rank.is_some(),
3319 "Document '{}' should have a rank assigned",
3320 doc.title
3321 );
3322 assert!(
3323 doc.rank.unwrap() > 0,
3324 "Document '{}' should have a positive rank",
3325 doc.title
3326 );
3327 }
3328
3329 assert_eq!(
3331 simulated_documents[0].rank,
3332 Some(3),
3333 "First document should have highest rank (3)"
3334 );
3335 assert_eq!(
3336 simulated_documents[1].rank,
3337 Some(2),
3338 "Second document should have rank 2"
3339 );
3340 assert_eq!(
3341 simulated_documents[2].rank,
3342 Some(1),
3343 "Third document should have rank 1"
3344 );
3345
3346 let mut ranks: Vec<u64> = simulated_documents
3348 .iter()
3349 .map(|doc| doc.rank.unwrap())
3350 .collect();
3351 ranks.sort_by(|a, b| b.cmp(a)); assert_eq!(
3353 ranks,
3354 vec![3, 2, 1],
3355 "Ranks should be unique and in descending order"
3356 );
3357
3358 log::info!("✅ KG term search rank assignment test completed successfully!");
3359 Ok(())
3360 }
3361}