1use ahash::AHashMap;
2use regex::Regex;
3use terraphim_automata::builder::{Logseq, ThesaurusBuilder};
4use terraphim_automata::load_thesaurus;
5use terraphim_automata::{replace_matches, LinkType};
6use terraphim_config::{ConfigState, Role};
7use terraphim_middleware::thesaurus::build_thesaurus_from_haystack;
8use terraphim_persistence::Persistable;
9use terraphim_rolegraph::{RoleGraph, RoleGraphSync};
10use terraphim_types::{
11 Document, Index, IndexedDocument, NormalizedTermValue, RelevanceFunction, RoleName,
12 SearchQuery, Thesaurus,
13};
14mod score;
15use crate::score::Query;
16
17#[cfg(feature = "openrouter")]
18pub mod openrouter;
19
20pub mod llm;
22
23pub mod llm_proxy;
25
26pub mod http_client;
28
29pub mod logging;
31
32pub mod conversation_service;
34pub mod rate_limiter;
35pub mod summarization_manager;
36pub mod summarization_queue;
37pub mod summarization_worker;
38
39pub mod error;
41
42pub mod context;
44
45#[cfg(test)]
46mod context_tests;
47
/// Normalizes a filename into a document ID.
///
/// Every character that is not an ASCII letter or digit is dropped and the
/// remaining characters are lowercased, e.g. `"My-File_1.md"` becomes
/// `"myfile1md"`.
///
/// This produces the same output as removing all matches of `[^a-zA-Z0-9]+`
/// and lowercasing the result, but does not compile a regular expression on
/// every call and has no panic path.
fn normalize_filename_to_id(filename: &str) -> String {
    filename
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
55
/// Errors returned by the service layer.
///
/// Most variants wrap an error from another crate or module and are created
/// through the `#[from]` conversions derived by `thiserror`.
#[derive(thiserror::Error, Debug)]
pub enum ServiceError {
    /// An error raised by `terraphim_middleware`.
    #[error("Middleware error: {0}")]
    Middleware(#[from] terraphim_middleware::Error),

    /// An OpenDAL error, stored boxed.
    // NOTE(review): presumably boxed to keep `ServiceError` small — confirm.
    #[error("OpenDal error: {0}")]
    OpenDal(Box<opendal::Error>),

    /// An error raised by `terraphim_persistence`.
    #[error("Persistence error: {0}")]
    Persistence(#[from] terraphim_persistence::Error),

    /// A configuration problem described by a free-form message.
    #[error("Config error: {0}")]
    Config(String),

    /// An OpenRouter error; only available with the `openrouter` feature.
    #[cfg(feature = "openrouter")]
    #[error("OpenRouter error: {0}")]
    OpenRouter(#[from] crate::openrouter::OpenRouterError),

    /// A shared error from this crate's `error` module.
    #[error("Common error: {0}")]
    Common(#[from] crate::error::CommonError),
}
77
78impl From<opendal::Error> for ServiceError {
79 fn from(err: opendal::Error) -> Self {
80 ServiceError::OpenDal(Box::new(err))
81 }
82}
83
84impl crate::error::TerraphimError for ServiceError {
85 fn category(&self) -> crate::error::ErrorCategory {
86 use crate::error::ErrorCategory;
87 match self {
88 ServiceError::Middleware(_) => ErrorCategory::Integration,
89 ServiceError::OpenDal(_) => ErrorCategory::Storage,
90 ServiceError::Persistence(_) => ErrorCategory::Storage,
91 ServiceError::Config(_) => ErrorCategory::Configuration,
92 #[cfg(feature = "openrouter")]
93 ServiceError::OpenRouter(_) => ErrorCategory::Integration,
94 ServiceError::Common(err) => err.category(),
95 }
96 }
97
98 fn is_recoverable(&self) -> bool {
99 match self {
100 ServiceError::Middleware(_) => true,
101 ServiceError::OpenDal(_) => false,
102 ServiceError::Persistence(_) => false,
103 ServiceError::Config(_) => false,
104 #[cfg(feature = "openrouter")]
105 ServiceError::OpenRouter(_) => true,
106 ServiceError::Common(err) => err.is_recoverable(),
107 }
108 }
109}
110
/// Convenience alias for results whose error type is [`ServiceError`].
pub type Result<T> = std::result::Result<T, ServiceError>;
112
/// Service object whose methods operate on a [`ConfigState`].
pub struct TerraphimService {
    /// Configuration state read (and, for rolegraphs, updated) by the service methods.
    config_state: ConfigState,
}
116
117impl TerraphimService {
118 pub fn new(config_state: ConfigState) -> Self {
120 Self { config_state }
121 }
122
123 async fn build_thesaurus(&mut self, search_query: &SearchQuery) -> Result<()> {
125 Ok(build_thesaurus_from_haystack(&mut self.config_state, search_query).await?)
126 }
127 pub async fn ensure_thesaurus_loaded(&mut self, role_name: &RoleName) -> Result<Thesaurus> {
129 async fn load_thesaurus_from_automata_path(
130 config_state: &ConfigState,
131 role_name: &RoleName,
132 rolegraphs: &mut AHashMap<RoleName, RoleGraphSync>,
133 ) -> Result<Thesaurus> {
134 let config = config_state.config.lock().await;
135 let Some(role) = config.roles.get(role_name).cloned() else {
136 return Err(ServiceError::Config(format!(
137 "Role '{}' not found in config",
138 role_name
139 )));
140 };
141 if let Some(kg) = &role.kg {
142 if let Some(automata_path) = &kg.automata_path {
143 log::info!("Loading Role `{}` - URL: {:?}", role_name, automata_path);
144
145 match load_thesaurus(automata_path).await {
147 Ok(mut thesaurus) => {
148 log::info!("Successfully loaded thesaurus from automata path");
149
150 match thesaurus.save().await {
152 Ok(_) => {
153 log::info!(
154 "Thesaurus for role `{}` saved to persistence",
155 role_name
156 );
157 match thesaurus.load().await {
159 Ok(persisted_thesaurus) => {
160 thesaurus = persisted_thesaurus;
161 log::debug!("Reloaded thesaurus from persistence");
162 }
163 Err(e) => {
164 log::warn!("Failed to reload thesaurus from persistence, using in-memory version: {:?}", e);
165 }
166 }
167 }
168 Err(e) => {
169 log::warn!("Failed to save thesaurus to persistence: {:?}", e);
170 }
171 }
172
173 let rolegraph =
174 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
175 match rolegraph {
176 Ok(rolegraph) => {
177 let rolegraph_value = RoleGraphSync::from(rolegraph);
178 rolegraphs.insert(role_name.clone(), rolegraph_value);
179 }
180 Err(e) => {
181 log::error!("Failed to update role and thesaurus: {:?}", e)
182 }
183 }
184 Ok(thesaurus)
185 }
186 Err(e) => {
187 log::warn!("Failed to load thesaurus from automata path: {:?}", e);
188 if let Some(kg_local) = &kg.knowledge_graph_local {
190 log::info!(
191 "Fallback: building thesaurus from local KG for role {}",
192 role_name
193 );
194 let logseq_builder = Logseq::default();
195 match logseq_builder
196 .build(
197 role_name.as_lowercase().to_string(),
198 kg_local.path.clone(),
199 )
200 .await
201 {
202 Ok(mut thesaurus) => {
203 match thesaurus.save().await {
205 Ok(_) => {
206 log::info!("Fallback thesaurus for role `{}` saved to persistence", role_name);
207 match thesaurus.load().await {
209 Ok(persisted_thesaurus) => {
210 thesaurus = persisted_thesaurus;
211 log::debug!("Reloaded fallback thesaurus from persistence");
212 }
213 Err(e) => {
214 log::warn!("Failed to reload fallback thesaurus from persistence, using in-memory version: {:?}", e);
215 }
216 }
217 }
218 Err(e) => {
219 log::warn!("Failed to save fallback thesaurus to persistence: {:?}", e);
220 }
221 }
222
223 let rolegraph =
224 RoleGraph::new(role_name.clone(), thesaurus.clone())
225 .await;
226 match rolegraph {
227 Ok(rolegraph) => {
228 let rolegraph_value =
229 RoleGraphSync::from(rolegraph);
230 rolegraphs
231 .insert(role_name.clone(), rolegraph_value);
232 }
233 Err(e) => log::error!(
234 "Failed to update role and thesaurus: {:?}",
235 e
236 ),
237 }
238
239 Ok(thesaurus)
240 }
241 Err(e) => {
242 log::error!(
243 "Failed to build thesaurus from local KG for role {}: {:?}",
244 role_name,
245 e
246 );
247 Err(ServiceError::Config(
248 "Failed to load or build thesaurus".into(),
249 ))
250 }
251 }
252 } else {
253 log::error!(
254 "No fallback available for role {}: no local KG path configured",
255 role_name
256 );
257 Err(ServiceError::Config(
258 "No automata path and no local KG available".into(),
259 ))
260 }
261 }
262 }
263 } else if let Some(kg_local) = &kg.knowledge_graph_local {
264 log::info!(
266 "Role {} has no automata_path, building thesaurus from local KG files at {:?}",
267 role_name,
268 kg_local.path
269 );
270 let logseq_builder = Logseq::default();
271 match logseq_builder
272 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
273 .await
274 {
275 Ok(mut thesaurus) => {
276 log::info!(
277 "Successfully built thesaurus from local KG for role {}",
278 role_name
279 );
280
281 match thesaurus.save().await {
283 Ok(_) => {
284 log::info!(
285 "Local KG thesaurus for role `{}` saved to persistence",
286 role_name
287 );
288 match thesaurus.load().await {
290 Ok(persisted_thesaurus) => {
291 log::info!("Reloaded local KG thesaurus from persistence: {} entries", persisted_thesaurus.len());
292 thesaurus = persisted_thesaurus;
293 }
294 Err(e) => {
295 log::warn!("Failed to reload local KG thesaurus from persistence, using in-memory version: {:?}", e);
296 }
297 }
298 }
299 Err(e) => {
300 log::warn!(
301 "Failed to save local KG thesaurus to persistence: {:?}",
302 e
303 );
304 }
305 }
306
307 let rolegraph =
308 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
309 match rolegraph {
310 Ok(rolegraph) => {
311 let rolegraph_value = RoleGraphSync::from(rolegraph);
312 rolegraphs.insert(role_name.clone(), rolegraph_value);
313 }
314 Err(e) => {
315 log::error!("Failed to update role and thesaurus: {:?}", e)
316 }
317 }
318
319 Ok(thesaurus)
320 }
321 Err(e) => {
322 log::error!(
323 "Failed to build thesaurus from local KG for role {}: {:?}",
324 role_name,
325 e
326 );
327 Err(ServiceError::Config(
328 "Failed to build thesaurus from local KG".into(),
329 ))
330 }
331 }
332 } else {
333 log::warn!("Role {} is configured for TerraphimGraph but has neither automata_path nor knowledge_graph_local defined.", role_name);
334 if let Some(kg_local) = &kg.knowledge_graph_local {
335 log::info!(
337 "Building thesaurus from local KG files for role {} at {:?}",
338 role_name,
339 kg_local.path
340 );
341 let logseq_builder = Logseq::default();
342 match logseq_builder
343 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
344 .await
345 {
346 Ok(mut thesaurus) => {
347 log::info!(
348 "Successfully built thesaurus from local KG for role {}",
349 role_name
350 );
351
352 match thesaurus.save().await {
354 Ok(_) => {
355 log::info!("No-automata thesaurus for role `{}` saved to persistence", role_name);
356 match thesaurus.load().await {
358 Ok(persisted_thesaurus) => {
359 thesaurus = persisted_thesaurus;
360 log::debug!("Reloaded no-automata thesaurus from persistence");
361 }
362 Err(e) => {
363 log::warn!("Failed to reload no-automata thesaurus from persistence, using in-memory version: {:?}", e);
364 }
365 }
366 }
367 Err(e) => {
368 log::warn!("Failed to save no-automata thesaurus to persistence: {:?}", e);
369 }
370 }
371
372 let rolegraph =
373 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
374 match rolegraph {
375 Ok(rolegraph) => {
376 let rolegraph_value = RoleGraphSync::from(rolegraph);
377 rolegraphs.insert(role_name.clone(), rolegraph_value);
378 }
379 Err(e) => {
380 log::error!("Failed to update role and thesaurus: {:?}", e)
381 }
382 }
383
384 Ok(thesaurus)
385 }
386 Err(e) => {
387 log::error!(
388 "Failed to build thesaurus from local KG for role {}: {:?}",
389 role_name,
390 e
391 );
392 Err(ServiceError::Config(
393 "Failed to build thesaurus from local KG".into(),
394 ))
395 }
396 }
397 } else {
398 Err(ServiceError::Config(
399 "No local knowledge graph path available".into(),
400 ))
401 }
402 }
403 } else {
404 Err(ServiceError::Config(
405 "Knowledge graph not configured".into(),
406 ))
407 }
408 }
409
410 log::debug!("Loading thesaurus for role: {}", role_name);
411 log::debug!("Role keys {:?}", self.config_state.roles.keys());
412
413 if let Some(rolegraph_value) = self.config_state.roles.get(role_name) {
414 let thesaurus_result = rolegraph_value.lock().await.thesaurus.clone().load().await;
415 match thesaurus_result {
416 Ok(thesaurus) => {
417 log::debug!("Thesaurus loaded: {:?}", thesaurus);
418 log::info!("Rolegraph loaded: for role name {:?}", role_name);
419 Ok(thesaurus)
420 }
421 Err(e) => {
422 log::error!("Failed to load thesaurus: {:?}", e);
423 let mut rolegraphs = self.config_state.roles.clone();
425 let result = load_thesaurus_from_automata_path(
426 &self.config_state,
427 role_name,
428 &mut rolegraphs,
429 )
430 .await;
431
432 if result.is_ok() {
434 if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
435 self.config_state
436 .roles
437 .insert(role_name.clone(), updated_rolegraph.clone());
438 log::info!(
439 "Updated config_state with new rolegraph for role: {}",
440 role_name
441 );
442 }
443 }
444
445 result
446 }
447 }
448 } else {
449 let mut rolegraphs = self.config_state.roles.clone();
451 let result =
452 load_thesaurus_from_automata_path(&self.config_state, role_name, &mut rolegraphs)
453 .await;
454
455 if result.is_ok() {
457 if let Some(new_rolegraph) = rolegraphs.get(role_name) {
458 self.config_state
459 .roles
460 .insert(role_name.clone(), new_rolegraph.clone());
461 log::info!(
462 "Added new rolegraph to config_state for role: {}",
463 role_name
464 );
465 }
466 }
467
468 result
469 }
470 }
471
472 pub async fn preprocess_document_content(
478 &mut self,
479 mut document: Document,
480 role: &Role,
481 ) -> Result<Document> {
482 if !role.terraphim_it {
484 log::info!(
485 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing",
486 role.name
487 );
488 return Ok(document);
489 }
490
491 let Some(_kg) = &role.kg else {
492 log::info!(
493 "⚠️ No KG configured for role '{}', skipping KG preprocessing",
494 role.name
495 );
496 return Ok(document);
497 };
498
499 log::info!(
500 "🧠 Starting KG preprocessing for document '{}' in role '{}' (terraphim_it enabled)",
501 document.title,
502 role.name
503 );
504 log::debug!(
505 "📄 Document preview: {} characters starting with: {}",
506 document.body.len(),
507 &document.body.chars().take(100).collect::<String>()
508 );
509
510 let thesaurus = match self.ensure_thesaurus_loaded(&role.name).await {
512 Ok(thesaurus) => thesaurus,
513 Err(e) => {
514 log::warn!("Failed to load thesaurus for role {}: {:?}", role.name, e);
515 return Ok(document); }
517 };
518
519 let mut kg_thesaurus = Thesaurus::new(format!("kg_links_{}", role.name));
521
522 let important_kg_terms = [
525 "graph",
526 "haystack",
527 "service",
528 "terraphim",
529 "knowledge",
530 "embedding",
531 "search",
532 "automata",
533 "thesaurus",
534 "rolegraph",
535 ];
536
537 let excluded_common_terms = [
539 "system",
540 "config",
541 "configuration",
542 "type",
543 "method",
544 "function",
545 "class",
546 "component",
547 "module",
548 "library",
549 "framework",
550 "interface",
551 "api",
552 "data",
553 "file",
554 "path",
555 "url",
556 "string",
557 "number",
558 "value",
559 "option",
560 "parameter",
561 "field",
562 "property",
563 "attribute",
564 "element",
565 "item",
566 "object",
567 "array",
568 "list",
569 "map",
570 "set",
571 "collection",
572 "server",
573 "client",
574 "request",
575 "response",
576 "error",
577 "result",
578 "success",
579 "failure",
580 "true",
581 "false",
582 "null",
583 "undefined",
584 "empty",
585 "full",
586 "start",
587 "end",
588 "begin",
589 "finish",
590 "create",
591 "delete",
592 "update",
593 "read",
594 "write",
595 "load",
596 "save",
597 "process",
598 "handle",
599 "manage",
600 "control",
601 "execute",
602 "run",
603 "call",
604 "invoke",
605 "trigger",
606 "event",
607 "action",
608 "command",
609 "query",
610 "search",
611 "filter",
612 "sort",
613 "order",
614 "group",
615 "match",
616 "find",
617 "replace",
618 "insert",
619 "remove",
620 "add",
621 "set",
622 "get",
623 "put",
624 "post",
625 "head",
626 "patch",
627 "delete",
628 ];
629
630 let mut sorted_terms: Vec<_> = (&thesaurus)
631 .into_iter()
632 .filter(|(key, _)| {
633 let term = key.as_str();
634
635 if term.is_empty() || term.len() < 3 {
637 return false;
638 }
639
640 if important_kg_terms.contains(&term) {
642 return true;
643 }
644
645 if excluded_common_terms.contains(&term) {
647 return false;
648 }
649
650 term.len() > 5
656 || term.contains('-')
657 || term.contains('_')
658 || term.chars().next().is_some_and(|c| c.is_uppercase())
659 })
660 .collect();
661
662 sorted_terms.sort_by(|a, b| {
664 let a_important = important_kg_terms.contains(&a.0.as_str());
665 let b_important = important_kg_terms.contains(&b.0.as_str());
666
667 match (a_important, b_important) {
668 (true, false) => std::cmp::Ordering::Less, (false, true) => std::cmp::Ordering::Greater, _ => b.1.id.cmp(&a.1.id), }
672 });
673
674 let max_kg_terms = 8;
676 for (key, value) in sorted_terms.into_iter().take(max_kg_terms) {
677 let mut kg_value = value.clone();
678 kg_value.value = key.clone(); kg_value.url = Some(format!("kg:{}", value.value)); kg_thesaurus.insert(key.clone(), kg_value);
684 }
685
686 let kg_terms_count = kg_thesaurus.len();
687 log::info!(
688 "📋 KG thesaurus filtering: {} → {} terms (prioritizing: {}, filters: len>5, hyphenated, or important KG terms)",
689 thesaurus.len(),
690 kg_terms_count,
691 important_kg_terms.join(", ")
692 );
693
694 if kg_terms_count > 0 {
696 let terms: Vec<String> = (&kg_thesaurus)
697 .into_iter()
698 .map(|(k, v)| format!("'{}' → kg:{}", k, v.value))
699 .collect();
700 log::info!("🔍 KG terms selected for linking: {}", terms.join(", "));
701 } else {
702 log::info!(
703 "⚠️ No KG terms passed filtering criteria - document '{}' will have no KG links",
704 document.title
705 );
706 }
707
708 if !kg_thesaurus.is_empty() {
710 let debug_thesaurus: Vec<String> = (&kg_thesaurus)
712 .into_iter()
713 .map(|(k, v)| format!("'{}' -> '{}' (url: {:?})", k, v.value, v.url))
714 .take(3) .collect();
716 log::info!(
717 "🔧 Passing to replace_matches: {} (total terms: {})",
718 debug_thesaurus.join(", "),
719 kg_thesaurus.len()
720 );
721 let preview = if document.body.chars().count() > 200 {
722 document.body.chars().take(200).collect::<String>() + "..."
723 } else {
724 document.body.clone()
725 };
726 log::info!("📝 Document body preview (first 200 chars): {}", preview);
727
728 match replace_matches(&document.body, kg_thesaurus, LinkType::MarkdownLinks) {
729 Ok(processed_bytes) => {
730 match String::from_utf8(processed_bytes) {
731 Ok(processed_content) => {
732 log::info!(
733 "✅ Successfully preprocessed document '{}' with {} KG terms → created [term](kg:concept) links",
734 document.title,
735 kg_terms_count
736 );
737
738 let content_changed = processed_content != document.body;
740 log::info!(
741 "🔄 Content changed: {} (original: {} chars, processed: {} chars)",
742 content_changed,
743 document.body.len(),
744 processed_content.len()
745 );
746
747 let kg_links: Vec<&str> = processed_content
749 .split("[")
750 .filter_map(|s| s.find("](kg:").map(|closing| &s[..closing]))
751 .collect();
752
753 if !kg_links.is_empty() {
754 log::info!(
755 "🔗 Found KG links in processed content: [{}](kg:...)",
756 kg_links.join("], [")
757 );
758
759 if let Some(first_link_pos) = processed_content.find("](kg:") {
761 let start = first_link_pos.saturating_sub(50);
762 let end = (first_link_pos + 100).min(processed_content.len());
763 log::info!(
764 "📄 Content snippet with KG link: ...{}...",
765 &processed_content[start..end]
766 );
767 }
768 } else {
769 log::warn!("⚠️ No KG links found in processed content despite successful replacement");
770 }
771
772 document.body = processed_content;
773 }
774 Err(e) => {
775 log::warn!("Failed to convert processed content to UTF-8 for document '{}': {:?}",
776 document.title, e);
777 }
778 }
779 }
780 Err(e) => {
781 log::warn!(
782 "Failed to replace KG terms in document '{}': {:?}",
783 document.title,
784 e
785 );
786 }
787 }
788 } else {
789 log::info!(
790 "💭 No specific KG terms found for document '{}' (filters excluded generic terms)",
791 document.title
792 );
793 }
794
795 Ok(document)
796 }
797
798 pub async fn preprocess_document_content_with_search(
800 &mut self,
801 document: Document,
802 role: &Role,
803 search_query: Option<&SearchQuery>,
804 ) -> Result<Document> {
805 let mut processed_doc = self.preprocess_document_content(document, role).await?;
807
808 if let Some(query) = search_query {
810 log::debug!(
811 "Applying search term highlighting to document '{}'",
812 processed_doc.title
813 );
814 processed_doc.body = Self::highlight_search_terms(&processed_doc.body, query);
815 }
816
817 Ok(processed_doc)
818 }
819
820 pub async fn create_document(&mut self, document: Document) -> Result<Document> {
822 document.save().await?;
825
826 self.config_state.add_to_roles(&document).await?;
829
830 use terraphim_config::ServiceType;
834 use terraphim_middleware::indexer::RipgrepIndexer;
835
836 let ripgrep = RipgrepIndexer::default();
837 let config_snapshot = { self.config_state.config.lock().await.clone() };
838
839 for role in config_snapshot.roles.values() {
840 for haystack in &role.haystacks {
841 if haystack.service == ServiceType::Ripgrep && !haystack.read_only {
842 if let Err(e) = ripgrep.update_document(&document).await {
843 log::warn!(
844 "Failed to write document {} to haystack {:?}: {:?}",
845 document.id,
846 haystack.location,
847 e
848 );
849 }
850 }
851 }
852 }
853
854 Ok(document)
855 }
856
857 pub async fn get_document_by_id(&mut self, document_id: &str) -> Result<Option<Document>> {
863 log::debug!("Getting document by ID: '{}'", document_id);
864
865 if document_id.trim().is_empty() {
867 log::warn!("Empty or whitespace-only document_id provided");
868 return Ok(None);
869 }
870
871 let mut placeholder = Document {
873 id: document_id.to_string(),
874 ..Default::default()
875 };
876 match placeholder.load().await {
877 Ok(doc) => {
878 log::debug!("Found document '{}' with direct ID lookup", document_id);
879 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
880 }
881 Err(e) => {
882 log::debug!(
883 "Document '{}' not found with direct lookup: {:?}",
884 document_id,
885 e
886 );
887 }
888 }
889
890 if document_id.contains('.') || document_id.contains('-') || document_id.contains('_') {
892 let normalized_id = normalize_filename_to_id(document_id);
893 log::debug!(
894 "Trying normalized ID '{}' for filename '{}'",
895 normalized_id,
896 document_id
897 );
898
899 let mut normalized_placeholder = Document {
900 id: normalized_id.clone(),
901 ..Default::default()
902 };
903 match normalized_placeholder.load().await {
904 Ok(doc) => {
905 log::debug!(
906 "Found document '{}' with normalized ID '{}'",
907 document_id,
908 normalized_id
909 );
910 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
911 }
912 Err(e) => {
913 log::debug!(
914 "Document '{}' not found with normalized ID '{}': {:?}",
915 document_id,
916 normalized_id,
917 e
918 );
919 }
920 }
921 }
922
923 log::debug!("Falling back to search for document '{}'", document_id);
925 let search_query = SearchQuery {
926 search_term: NormalizedTermValue::new(document_id.to_string()),
927 search_terms: None,
928 operator: None,
929 limit: Some(5), skip: None,
931 role: None,
932 };
933
934 let documents = self.search(&search_query).await?;
935
936 for doc in documents {
938 if doc.title == document_id || doc.id == document_id {
939 log::debug!("Found document '{}' via search fallback", document_id);
940 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
941 }
942 }
943
944 log::debug!("Document '{}' not found anywhere", document_id);
945 Ok(None)
946 }
947
948 async fn apply_kg_preprocessing_if_needed(&mut self, document: Document) -> Result<Document> {
954 log::debug!(
955 "🔍 [KG-DEBUG] apply_kg_preprocessing_if_needed called for document: '{}'",
956 document.title
957 );
958 log::debug!(
959 "🔍 [KG-DEBUG] Document body preview: {}",
960 document.body.chars().take(100).collect::<String>()
961 );
962
963 let role = {
964 let config = self.config_state.config.lock().await;
965 let selected_role = &config.selected_role;
966
967 log::debug!("🔍 [KG-DEBUG] Selected role: '{}'", selected_role);
968
969 match config.roles.get(selected_role) {
970 Some(role) => {
971 log::debug!(
972 "🔍 [KG-DEBUG] Role found: '{}', terraphim_it: {}",
973 role.name,
974 role.terraphim_it
975 );
976 role.clone() }
978 None => {
979 log::warn!(
980 "❌ [KG-DEBUG] Selected role '{}' not found in config, skipping KG preprocessing",
981 selected_role
982 );
983 return Ok(document);
984 }
985 }
986 }; if !role.terraphim_it {
990 log::info!(
991 "🔍 [KG-DEBUG] terraphim_it disabled for role '{}', skipping KG preprocessing",
992 role.name
993 );
994 return Ok(document);
995 }
996
997 let has_existing_kg_links = document.body.contains("](kg:");
999 log::debug!(
1000 "🔍 [KG-DEBUG] Document already has KG links: {}",
1001 has_existing_kg_links
1002 );
1003 if has_existing_kg_links {
1004 log::info!(
1005 "🔍 [KG-DEBUG] Document '{}' already has KG links, skipping preprocessing to prevent double processing",
1006 document.title
1007 );
1008 return Ok(document);
1009 }
1010
1011 log::info!(
1012 "🧠 [KG-DEBUG] Starting KG preprocessing for document '{}' with role '{}' (terraphim_it enabled)",
1013 document.title,
1014 role.name
1015 );
1016
1017 let document_title = document.title.clone(); let processed_doc = match self.preprocess_document_content(document, &role).await {
1020 Ok(doc) => {
1021 let links_added = doc.body.contains("](kg:");
1022 log::info!(
1023 "✅ [KG-DEBUG] KG preprocessing completed for document '{}'. Links added: {}",
1024 doc.title,
1025 links_added
1026 );
1027 if links_added {
1028 log::debug!(
1029 "🔍 [KG-DEBUG] Processed body preview: {}",
1030 doc.body.chars().take(200).collect::<String>()
1031 );
1032 }
1033 doc
1034 }
1035 Err(e) => {
1036 log::error!(
1037 "❌ [KG-DEBUG] KG preprocessing failed for document '{}': {:?}",
1038 document_title,
1039 e
1040 );
1041 return Err(e);
1042 }
1043 };
1044
1045 Ok(processed_doc)
1046 }
1047
1048 #[allow(dead_code)] async fn enhance_descriptions_with_ai(
1054 &self,
1055 mut documents: Vec<Document>,
1056 role: &Role,
1057 ) -> Result<Vec<Document>> {
1058 use crate::llm::{build_llm_from_role, SummarizeOptions};
1059
1060 eprintln!("🤖 Attempting to build LLM client for role: {}", role.name);
1061 let llm = match build_llm_from_role(role) {
1062 Some(client) => {
1063 eprintln!("✅ LLM client successfully created: {}", client.name());
1064 client
1065 }
1066 None => {
1067 eprintln!("❌ No LLM client available for role: {}", role.name);
1068 return Ok(documents);
1069 }
1070 };
1071
1072 log::info!(
1073 "Enhancing {} document descriptions with LLM provider: {}",
1074 documents.len(),
1075 llm.name()
1076 );
1077
1078 let mut enhanced_count = 0;
1079 let mut error_count = 0;
1080
1081 for document in &mut documents {
1082 if self.should_generate_ai_summary(document) {
1083 let summary_length = 250;
1084 match llm
1085 .summarize(
1086 &document.body,
1087 SummarizeOptions {
1088 max_length: summary_length,
1089 },
1090 )
1091 .await
1092 {
1093 Ok(ai_summary) => {
1094 log::debug!(
1095 "Generated AI summary for '{}': {} characters",
1096 document.title,
1097 ai_summary.len()
1098 );
1099 document.description = Some(ai_summary);
1100 enhanced_count += 1;
1101 }
1102 Err(e) => {
1103 log::warn!(
1104 "Failed to generate AI summary for '{}': {}",
1105 document.title,
1106 e
1107 );
1108 error_count += 1;
1109 }
1110 }
1111 }
1112 }
1113
1114 log::info!(
1115 "LLM enhancement complete: {} enhanced, {} errors, {} skipped",
1116 enhanced_count,
1117 error_count,
1118 documents.len() - enhanced_count - error_count
1119 );
1120
1121 Ok(documents)
1122 }
1123
1124 #[allow(dead_code)] fn should_generate_ai_summary(&self, document: &Document) -> bool {
1130 if document.body.trim().len() < 200 {
1132 return false;
1133 }
1134
1135 if let Some(ref description) = document.description {
1137 if description.len() > 100 && !description.ends_with("...") {
1139 return false;
1140 }
1141 }
1142
1143 if document.body.len() > 8000 {
1145 return false;
1146 }
1147
1148 true
1150 }
1151
1152 async fn get_search_role(&self, search_query: &SearchQuery) -> Result<Role> {
1154 let search_role = match &search_query.role {
1155 Some(role) => role.clone(),
1156 None => self.config_state.get_default_role().await,
1157 };
1158
1159 log::debug!("Searching for role: {:?}", search_role);
1160 let Some(role) = self.config_state.get_role(&search_role).await else {
1161 return Err(ServiceError::Config(format!(
1162 "Role `{}` not found in config",
1163 search_role
1164 )));
1165 };
1166 Ok(role)
1167 }
1168
1169 fn term_matches_with_word_boundaries(term: &str, text: &str) -> bool {
1171 if let Ok(regex) = Regex::new(&format!(r"\b{}\b", regex::escape(term))) {
1173 regex.is_match(text)
1174 } else {
1175 text.contains(term)
1177 }
1178 }
1179
1180 pub async fn apply_logical_operators_to_documents(
1182 &mut self,
1183 search_query: &SearchQuery,
1184 documents: Vec<Document>,
1185 ) -> Result<Vec<Document>> {
1186 use terraphim_types::LogicalOperator;
1187
1188 let all_terms = search_query.get_all_terms();
1189 let operator = search_query.get_operator();
1190
1191 let initial_doc_count = documents.len();
1192
1193 log::debug!(
1194 "Applying {:?} operator to {} documents with {} search terms",
1195 operator,
1196 initial_doc_count,
1197 all_terms.len()
1198 );
1199
1200 let filtered_docs: Vec<Document> = documents
1201 .into_iter()
1202 .filter(|doc| {
1203 let searchable_text = format!(
1205 "{} {} {}",
1206 doc.title.to_lowercase(),
1207 doc.body.to_lowercase(),
1208 doc.description
1209 .as_ref()
1210 .unwrap_or(&String::new())
1211 .to_lowercase()
1212 );
1213
1214 match operator {
1215 LogicalOperator::And => {
1216 all_terms.iter().all(|term| {
1218 Self::term_matches_with_word_boundaries(
1219 &term.as_str().to_lowercase(),
1220 &searchable_text,
1221 )
1222 })
1223 }
1224 LogicalOperator::Or => {
1225 all_terms.iter().any(|term| {
1227 Self::term_matches_with_word_boundaries(
1228 &term.as_str().to_lowercase(),
1229 &searchable_text,
1230 )
1231 })
1232 }
1233 }
1234 })
1235 .collect();
1236
1237 log::debug!(
1238 "Logical operator filtering: {} -> {} documents",
1239 initial_doc_count,
1240 filtered_docs.len()
1241 );
1242
1243 let combined_query_string = all_terms
1245 .iter()
1246 .map(|t| t.as_str())
1247 .collect::<Vec<_>>()
1248 .join(" ");
1249 let query = Query::new(&combined_query_string);
1250 let sorted_docs = score::sort_documents(&query, filtered_docs);
1251
1252 Ok(sorted_docs)
1253 }
1254
1255 pub async fn search_documents_selected_role(
1258 &mut self,
1259 search_term: &NormalizedTermValue,
1260 ) -> Result<Vec<Document>> {
1261 let role = self.config_state.get_selected_role().await;
1262 let documents = self
1263 .search(&SearchQuery {
1264 search_term: search_term.clone(),
1265 search_terms: None,
1266 operator: None,
1267 role: Some(role),
1268 skip: None,
1269 limit: None,
1270 })
1271 .await?;
1272 Ok(documents)
1273 }
1274
1275 pub async fn search(&mut self, search_query: &SearchQuery) -> Result<Vec<Document>> {
1277 log::debug!("Role for searching: {:?}", search_query.role);
1279 let role = self.get_search_role(search_query).await?;
1280
1281 log::trace!("Building index for search query: {:?}", search_query);
1282 let index: Index =
1283 terraphim_middleware::search_haystacks(self.config_state.clone(), search_query.clone())
1284 .await?;
1285
1286 match role.relevance_function {
1287 RelevanceFunction::TitleScorer => {
1288 log::debug!("Searching haystack with title scorer");
1289
1290 let documents = index.get_all_documents();
1291
1292 log::debug!("Sorting documents by relevance");
1293
1294 let documents = if search_query.is_multi_term_query() {
1295 self.apply_logical_operators_to_documents(search_query, documents)
1297 .await?
1298 } else {
1299 let query = Query::new(&search_query.search_term.to_string());
1301 score::sort_documents(&query, documents)
1302 };
1303 let total_length = documents.len();
1304 let mut docs_ranked = Vec::new();
1305 for (idx, doc) in documents.iter().enumerate() {
1306 let mut document: terraphim_types::Document = doc.clone();
1307 let rank = (total_length - idx).try_into().unwrap();
1308 document.rank = Some(rank);
1309
1310 if document.id.starts_with("http://") || document.id.starts_with("https://") {
1312 log::debug!(
1314 "Processing Atomic Data document '{}' (URL: {})",
1315 document.title,
1316 document.id
1317 );
1318
1319 let mut placeholder = Document {
1321 id: document.id.clone(),
1322 ..Default::default()
1323 };
1324 match placeholder.load().await {
1325 Ok(persisted_doc) => {
1326 log::debug!(
1328 "Found cached Atomic Data document '{}' in persistence",
1329 document.title
1330 );
1331 if let Some(better_description) = persisted_doc.description {
1332 document.description = Some(better_description);
1333 }
1334 if !persisted_doc.body.is_empty() && !role.terraphim_it {
1338 log::debug!(
1339 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
1340 document.title, role.name, role.terraphim_it
1341 );
1342 document.body = persisted_doc.body;
1343 } else if role.terraphim_it {
1344 log::debug!(
1345 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
1346 document.title, role.name
1347 );
1348 }
1349 }
1350 Err(_) => {
1351 log::debug!("Caching Atomic Data document '{}' to persistence for future queries", document.title);
1353
1354 let doc_to_save = document.clone();
1356 tokio::spawn(async move {
1357 if let Err(e) = doc_to_save.save().await {
1358 log::warn!(
1359 "Failed to cache Atomic Data document '{}': {}",
1360 doc_to_save.title,
1361 e
1362 );
1363 } else {
1364 log::debug!(
1365 "Successfully cached Atomic Data document '{}'",
1366 doc_to_save.title
1367 );
1368 }
1369 });
1370 }
1371 }
1372 } else {
1373 let should_lookup_persistence = document
1375 .get_source_haystack()
1376 .and_then(|source| {
1377 role.haystacks
1378 .iter()
1379 .find(|haystack| haystack.location == *source)
1380 })
1381 .map(|haystack| haystack.fetch_content)
1382 .unwrap_or(true);
1383
1384 if !should_lookup_persistence {
1385 log::trace!(
1386 "Skipping persistence lookup for '{}' (haystack fetch_content=false)",
1387 document.title
1388 );
1389 } else {
1390 let mut placeholder = Document {
1391 id: document.id.clone(),
1392 ..Default::default()
1393 };
1394 if let Ok(persisted_doc) = placeholder.load().await {
1395 if let Some(better_description) = persisted_doc.description {
1396 log::debug!("Replaced ripgrep description for '{}' with persistence description", document.title);
1397 document.description = Some(better_description);
1398 }
1399 } else {
1400 let normalized_id = normalize_filename_to_id(&document.title);
1403
1404 let mut normalized_placeholder = Document {
1405 id: normalized_id.clone(),
1406 ..Default::default()
1407 };
1408 if let Ok(persisted_doc) = normalized_placeholder.load().await {
1409 if let Some(better_description) = persisted_doc.description {
1410 log::debug!("Replaced ripgrep description for '{}' with persistence description (normalized from title: {})", document.title, normalized_id);
1411 document.description = Some(better_description);
1412 }
1413 } else {
1414 let normalized_id_with_md = format!("{}md", normalized_id);
1416 let mut md_placeholder = Document {
1417 id: normalized_id_with_md.clone(),
1418 ..Default::default()
1419 };
1420 if let Ok(persisted_doc) = md_placeholder.load().await {
1421 if let Some(better_description) = persisted_doc.description
1422 {
1423 log::debug!("Replaced ripgrep description for '{}' with persistence description (normalized with md: {})", document.title, normalized_id_with_md);
1424 document.description = Some(better_description);
1425 }
1426 } else {
1427 log::debug!("No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')", document.title, document.id, normalized_id, normalized_id_with_md);
1428 }
1429 }
1430 }
1431 }
1432 }
1433
1434 docs_ranked.push(document);
1435 }
1436
1437 #[cfg(feature = "openrouter")]
1440 if role.has_llm_config() && role.llm_auto_summarize {
1441 log::debug!(
1442 "Applying OpenRouter AI summarization to {} search results for role '{}'",
1443 docs_ranked.len(),
1444 role.name
1445 );
1446 docs_ranked = self
1447 .enhance_descriptions_with_ai(docs_ranked, &role)
1448 .await?;
1449 } else {
1450 eprintln!(
1452 "📋 Entering LLM AI summarization branch for role: {}",
1453 role.name
1454 );
1455 log::debug!(
1456 "Applying LLM AI summarization to {} search results for role '{}'",
1457 docs_ranked.len(),
1458 role.name
1459 );
1460 docs_ranked = self
1461 .enhance_descriptions_with_ai(docs_ranked, &role)
1462 .await?;
1463 }
1464
1465 if role.terraphim_it {
1467 log::info!(
1468 "🧠 Applying KG preprocessing to {} TerraphimGraph search results for role '{}'",
1469 docs_ranked.len(),
1470 role.name
1471 );
1472 let mut processed_docs = Vec::new();
1473 let mut total_kg_terms = 0;
1474 let mut docs_with_kg_links = 0;
1475
1476 for document in docs_ranked {
1477 let original_body_len = document.body.len();
1478 let processed_doc =
1479 self.preprocess_document_content(document, &role).await?;
1480
1481 let new_body_len = processed_doc.body.len();
1483 if new_body_len > original_body_len {
1484 docs_with_kg_links += 1;
1485 let estimated_links = (new_body_len - original_body_len) / 17;
1487 total_kg_terms += estimated_links;
1488 }
1489
1490 processed_docs.push(processed_doc);
1491 }
1492
1493 log::info!(
1494 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1495 processed_docs.len(),
1496 docs_with_kg_links,
1497 total_kg_terms
1498 );
1499 Ok(processed_docs)
1500 } else {
1501 Ok(docs_ranked)
1502 }
1503 }
1504 RelevanceFunction::BM25 => {
1505 log::debug!("Searching haystack with BM25 scorer");
1506
1507 let documents = index.get_all_documents();
1508
1509 log::debug!("Sorting documents by BM25 relevance");
1510
1511 let documents = if search_query.is_multi_term_query() {
1512 let filtered_docs = self
1514 .apply_logical_operators_to_documents(search_query, documents)
1515 .await?;
1516 let combined_query_string = search_query
1518 .get_all_terms()
1519 .iter()
1520 .map(|t| t.as_str())
1521 .collect::<Vec<_>>()
1522 .join(" ");
1523 let query =
1524 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25);
1525 score::sort_documents(&query, filtered_docs)
1526 } else {
1527 let query = Query::new(&search_query.search_term.to_string())
1529 .name_scorer(score::QueryScorer::BM25);
1530 score::sort_documents(&query, documents)
1531 };
1532 let total_length = documents.len();
1533 let mut docs_ranked = Vec::new();
1534 for (idx, doc) in documents.iter().enumerate() {
1535 let mut document: terraphim_types::Document = doc.clone();
1536 let rank = (total_length - idx).try_into().unwrap();
1537 document.rank = Some(rank);
1538 docs_ranked.push(document);
1539 }
1540
1541 #[cfg(feature = "openrouter")]
1543 if role.has_llm_config() && role.llm_auto_summarize {
1544 log::debug!("Applying OpenRouter AI summarization to {} BM25 search results for role '{}'", docs_ranked.len(), role.name);
1545 docs_ranked = self
1546 .enhance_descriptions_with_ai(docs_ranked, &role)
1547 .await?;
1548 } else {
1549 log::debug!(
1551 "Applying LLM AI summarization to {} BM25 search results for role '{}'",
1552 docs_ranked.len(),
1553 role.name
1554 );
1555 docs_ranked = self
1556 .enhance_descriptions_with_ai(docs_ranked, &role)
1557 .await?;
1558 }
1559
1560 if role.terraphim_it {
1562 log::info!(
1563 "🧠 Applying KG preprocessing to {} BM25 search results for role '{}'",
1564 docs_ranked.len(),
1565 role.name
1566 );
1567 let mut processed_docs = Vec::new();
1568 let mut total_kg_terms = 0;
1569 let mut docs_with_kg_links = 0;
1570
1571 for document in docs_ranked {
1572 let original_body_len = document.body.len();
1573 let processed_doc =
1574 self.preprocess_document_content(document, &role).await?;
1575
1576 let new_body_len = processed_doc.body.len();
1578 if new_body_len > original_body_len {
1579 docs_with_kg_links += 1;
1580 let estimated_links = (new_body_len - original_body_len) / 17;
1581 total_kg_terms += estimated_links;
1582 }
1583
1584 processed_docs.push(processed_doc);
1585 }
1586
1587 log::info!(
1588 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1589 processed_docs.len(),
1590 docs_with_kg_links,
1591 total_kg_terms
1592 );
1593 Ok(processed_docs)
1594 } else {
1595 Ok(docs_ranked)
1596 }
1597 }
1598 RelevanceFunction::BM25F => {
1599 log::debug!("Searching haystack with BM25F scorer");
1600
1601 let documents = index.get_all_documents();
1602
1603 log::debug!("Sorting documents by BM25F relevance");
1604
1605 let documents = if search_query.is_multi_term_query() {
1606 let filtered_docs = self
1608 .apply_logical_operators_to_documents(search_query, documents)
1609 .await?;
1610 let combined_query_string = search_query
1612 .get_all_terms()
1613 .iter()
1614 .map(|t| t.as_str())
1615 .collect::<Vec<_>>()
1616 .join(" ");
1617 let query =
1618 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25F);
1619 score::sort_documents(&query, filtered_docs)
1620 } else {
1621 let query = Query::new(&search_query.search_term.to_string())
1623 .name_scorer(score::QueryScorer::BM25F);
1624 score::sort_documents(&query, documents)
1625 };
1626 let total_length = documents.len();
1627 let mut docs_ranked = Vec::new();
1628 for (idx, doc) in documents.iter().enumerate() {
1629 let mut document: terraphim_types::Document = doc.clone();
1630 let rank = (total_length - idx).try_into().unwrap();
1631 document.rank = Some(rank);
1632 docs_ranked.push(document);
1633 }
1634
1635 #[cfg(feature = "openrouter")]
1637 if role.has_llm_config() && role.llm_auto_summarize {
1638 log::debug!("Applying OpenRouter AI summarization to {} BM25F search results for role '{}'", docs_ranked.len(), role.name);
1639 docs_ranked = self
1640 .enhance_descriptions_with_ai(docs_ranked, &role)
1641 .await?;
1642 } else {
1643 log::debug!(
1645 "Applying LLM AI summarization to {} BM25F search results for role '{}'",
1646 docs_ranked.len(),
1647 role.name
1648 );
1649 docs_ranked = self
1650 .enhance_descriptions_with_ai(docs_ranked, &role)
1651 .await?;
1652 }
1653
1654 if role.terraphim_it {
1656 log::info!(
1657 "🧠 Applying KG preprocessing to {} BM25F search results for role '{}'",
1658 docs_ranked.len(),
1659 role.name
1660 );
1661 let mut processed_docs = Vec::new();
1662 let mut total_kg_terms = 0;
1663 let mut docs_with_kg_links = 0;
1664
1665 for document in docs_ranked {
1666 let original_body_len = document.body.len();
1667 let processed_doc =
1668 self.preprocess_document_content(document, &role).await?;
1669
1670 let new_body_len = processed_doc.body.len();
1672 if new_body_len > original_body_len {
1673 docs_with_kg_links += 1;
1674 let estimated_links = (new_body_len - original_body_len) / 17;
1675 total_kg_terms += estimated_links;
1676 }
1677
1678 processed_docs.push(processed_doc);
1679 }
1680
1681 log::info!(
1682 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1683 processed_docs.len(),
1684 docs_with_kg_links,
1685 total_kg_terms
1686 );
1687 Ok(processed_docs)
1688 } else {
1689 Ok(docs_ranked)
1690 }
1691 }
1692 RelevanceFunction::BM25Plus => {
1693 log::debug!("Searching haystack with BM25Plus scorer");
1694
1695 let documents = index.get_all_documents();
1696
1697 log::debug!("Sorting documents by BM25Plus relevance");
1698
1699 let documents = if search_query.is_multi_term_query() {
1700 let filtered_docs = self
1702 .apply_logical_operators_to_documents(search_query, documents)
1703 .await?;
1704 let combined_query_string = search_query
1706 .get_all_terms()
1707 .iter()
1708 .map(|t| t.as_str())
1709 .collect::<Vec<_>>()
1710 .join(" ");
1711 let query = Query::new(&combined_query_string)
1712 .name_scorer(score::QueryScorer::BM25Plus);
1713 score::sort_documents(&query, filtered_docs)
1714 } else {
1715 let query = Query::new(&search_query.search_term.to_string())
1717 .name_scorer(score::QueryScorer::BM25Plus);
1718 score::sort_documents(&query, documents)
1719 };
1720 let total_length = documents.len();
1721 let mut docs_ranked = Vec::new();
1722 for (idx, doc) in documents.iter().enumerate() {
1723 let mut document: terraphim_types::Document = doc.clone();
1724 let rank = (total_length - idx).try_into().unwrap();
1725 document.rank = Some(rank);
1726 docs_ranked.push(document);
1727 }
1728
1729 #[cfg(feature = "openrouter")]
1731 if role.has_llm_config() && role.llm_auto_summarize {
1732 log::debug!("Applying OpenRouter AI summarization to {} BM25Plus search results for role '{}'", docs_ranked.len(), role.name);
1733 docs_ranked = self
1734 .enhance_descriptions_with_ai(docs_ranked, &role)
1735 .await?;
1736 }
1737
1738 if role.terraphim_it {
1740 log::info!(
1741 "🧠 Applying KG preprocessing to {} BM25Plus search results for role '{}'",
1742 docs_ranked.len(),
1743 role.name
1744 );
1745 let mut processed_docs = Vec::new();
1746 let mut total_kg_terms = 0;
1747 let mut docs_with_kg_links = 0;
1748
1749 for document in docs_ranked {
1750 let original_body_len = document.body.len();
1751 let processed_doc =
1752 self.preprocess_document_content(document, &role).await?;
1753
1754 let new_body_len = processed_doc.body.len();
1756 if new_body_len > original_body_len {
1757 docs_with_kg_links += 1;
1758 let estimated_links = (new_body_len - original_body_len) / 17;
1759 total_kg_terms += estimated_links;
1760 }
1761
1762 processed_docs.push(processed_doc);
1763 }
1764
1765 log::info!(
1766 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1767 processed_docs.len(),
1768 docs_with_kg_links,
1769 total_kg_terms
1770 );
1771 Ok(processed_docs)
1772 } else {
1773 Ok(docs_ranked)
1774 }
1775 }
1776 RelevanceFunction::TerraphimGraph => {
1777 eprintln!("🧠 TerraphimGraph search initiated for role: {}", role.name);
1778 self.build_thesaurus(search_query).await?;
1779 let _thesaurus = self.ensure_thesaurus_loaded(&role.name).await?;
1780 let scored_index_docs: Vec<IndexedDocument> = self
1781 .config_state
1782 .search_indexed_documents(search_query, &role)
1783 .await;
1784
1785 log::debug!(
1786 "TerraphimGraph search found {} indexed documents",
1787 scored_index_docs.len()
1788 );
1789
1790 log::debug!("Ranking documents with thesaurus");
1793 let mut documents = index.get_documents(scored_index_docs.clone());
1794
1795 let all_haystack_docs = index.get_all_documents();
1798 log::debug!(
1799 "Found {} total documents from haystacks, checking which need indexing",
1800 all_haystack_docs.len()
1801 );
1802 let mut need_reindexing = false;
1803
1804 if let Some(rolegraph_sync) = self.config_state.roles.get(&role.name) {
1805 let mut rolegraph = rolegraph_sync.lock().await;
1806 let mut newly_indexed = 0;
1807
1808 for doc in &all_haystack_docs {
1809 if !rolegraph.has_document(&doc.id) && !doc.body.is_empty() {
1811 log::debug!("Indexing new document '{}' into rolegraph for TerraphimGraph search", doc.id);
1812 rolegraph.insert_document(&doc.id, doc.clone());
1813
1814 drop(rolegraph);
1817 if let Err(e) = doc.save().await {
1818 log::warn!(
1819 "Failed to save document '{}' to persistence: {}",
1820 doc.id,
1821 e
1822 );
1823 } else {
1824 log::debug!(
1825 "Successfully saved document '{}' to persistence",
1826 doc.id
1827 );
1828 }
1829 rolegraph = rolegraph_sync.lock().await;
1831
1832 newly_indexed += 1;
1833 }
1834 }
1835
1836 if newly_indexed > 0 {
1837 log::info!(
1838 "✅ Indexed {} new documents into rolegraph for role '{}'",
1839 newly_indexed,
1840 role.name
1841 );
1842 log::debug!(
1843 "RoleGraph now has {} nodes, {} edges, {} documents",
1844 rolegraph.get_node_count(),
1845 rolegraph.get_edge_count(),
1846 rolegraph.get_document_count()
1847 );
1848 need_reindexing = true; }
1850 }
1851
1852 let mut documents_with_content = Vec::new();
1855
1856 for mut document in documents {
1857 if document.body.is_empty() {
1859 log::debug!(
1860 "Document '{}' has empty body, attempting to load from persistence",
1861 document.id
1862 );
1863
1864 let mut full_doc = Document::new(document.id.clone());
1866 match full_doc.load().await {
1867 Ok(loaded_doc) => {
1868 if !loaded_doc.body.is_empty() {
1869 log::info!(
1870 "✅ Loaded body content for document '{}' from persistence",
1871 document.id
1872 );
1873 document.body = loaded_doc.body.clone();
1874 if loaded_doc.description.is_some() {
1875 document.description = loaded_doc.description.clone();
1876 }
1877
1878 if let Some(rolegraph_sync) =
1880 self.config_state.roles.get(&role.name)
1881 {
1882 let mut rolegraph = rolegraph_sync.lock().await;
1883 rolegraph.insert_document(&document.id, loaded_doc);
1884 need_reindexing = true;
1885 log::debug!(
1886 "Re-indexed document '{}' into rolegraph with content",
1887 document.id
1888 );
1889 }
1890 } else {
1891 log::warn!("Document '{}' still has empty body after loading from persistence", document.id);
1892 }
1893 }
1894 Err(e) => {
1895 log::warn!(
1896 "Failed to load document '{}' from persistence: {}",
1897 document.id,
1898 e
1899 );
1900
1901 if document.url.starts_with('/')
1903 || document.url.starts_with("docs/")
1904 {
1905 match tokio::fs::read_to_string(&document.url).await {
1906 Ok(content) => {
1907 log::info!(
1908 "✅ Loaded content for '{}' from file: {}",
1909 document.id,
1910 document.url
1911 );
1912 document.body = content.clone();
1913
1914 let full_doc = Document {
1916 id: document.id.clone(),
1917 title: document.title.clone(),
1918 body: content,
1919 url: document.url.clone(),
1920 description: document.description.clone(),
1921 summarization: document.summarization.clone(),
1922 stub: None,
1923 tags: document.tags.clone(),
1924 rank: document.rank,
1925 source_haystack: document.source_haystack.clone(),
1926 };
1927
1928 if let Err(e) = full_doc.save().await {
1930 log::warn!("Failed to save document '{}' to persistence: {}", document.id, e);
1931 }
1932
1933 if let Some(rolegraph_sync) =
1935 self.config_state.roles.get(&role.name)
1936 {
1937 let mut rolegraph = rolegraph_sync.lock().await;
1938 rolegraph.insert_document(&document.id, full_doc);
1939 need_reindexing = true;
1940 log::debug!("Re-indexed document '{}' into rolegraph from file", document.id);
1941 }
1942 }
1943 Err(file_e) => {
1944 log::warn!(
1945 "Failed to read file '{}' for document '{}': {}",
1946 document.url,
1947 document.id,
1948 file_e
1949 );
1950 }
1951 }
1952 }
1953 }
1954 }
1955 }
1956 documents_with_content.push(document);
1957 }
1958
1959 documents = documents_with_content;
1960
1961 if need_reindexing {
1962 log::info!("🔄 Re-running TerraphimGraph search after indexing new documents");
1963
1964 let updated_scored_docs: Vec<IndexedDocument> = self
1966 .config_state
1967 .search_indexed_documents(search_query, &role)
1968 .await;
1969
1970 if !updated_scored_docs.is_empty() {
1971 log::debug!(
1972 "✅ Updated rolegraph search found {} documents",
1973 updated_scored_docs.len()
1974 );
1975 let updated_documents = index.get_documents(updated_scored_docs);
1977 if !updated_documents.is_empty() {
1978 documents = updated_documents;
1979 }
1980 }
1981 }
1982
1983 if !documents.is_empty() {
1985 log::debug!(
1986 "Applying TF-IDF scoring to {} documents for enhanced ranking",
1987 documents.len()
1988 );
1989
1990 use crate::score::bm25_additional::TFIDFScorer;
1991 let mut tfidf_scorer = TFIDFScorer::new();
1992 tfidf_scorer.initialize(&documents);
1993
1994 let query_text = &search_query.search_term.to_string();
1996 for document in &mut documents {
1997 let tfidf_score = tfidf_scorer.score(query_text, document);
1998 if let Some(rank) = document.rank {
2000 document.rank = Some(rank + (tfidf_score * 0.3) as u64);
2001 } else {
2003 document.rank = Some((tfidf_score * 10.0) as u64); }
2005 }
2006
2007 documents.sort_by(|a, b| b.rank.unwrap_or(0).cmp(&a.rank.unwrap_or(0)));
2009
2010 log::debug!("TF-IDF scoring applied successfully");
2011 }
2012
2013 for document in &mut documents {
2015 if document.id.starts_with("http://") || document.id.starts_with("https://") {
2016 log::debug!(
2018 "Processing Atomic Data document '{}' (URL: {})",
2019 document.title,
2020 document.id
2021 );
2022
2023 let mut placeholder = Document {
2025 id: document.id.clone(),
2026 ..Default::default()
2027 };
2028 match placeholder.load().await {
2029 Ok(persisted_doc) => {
2030 log::debug!(
2032 "Found cached Atomic Data document '{}' in persistence",
2033 document.title
2034 );
2035 if let Some(better_description) = persisted_doc.description {
2036 document.description = Some(better_description);
2037 }
2038 if !persisted_doc.body.is_empty() && !role.terraphim_it {
2042 log::debug!(
2043 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
2044 document.title, role.name, role.terraphim_it
2045 );
2046 document.body = persisted_doc.body;
2047 } else if role.terraphim_it {
2048 log::debug!(
2049 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
2050 document.title, role.name
2051 );
2052 }
2053 }
2054 Err(_) => {
2055 log::debug!("Caching Atomic Data document '{}' to persistence for future queries", document.title);
2057
2058 let doc_to_save = document.clone();
2060 tokio::spawn(async move {
2061 if let Err(e) = doc_to_save.save().await {
2062 log::warn!(
2063 "Failed to cache Atomic Data document '{}': {}",
2064 doc_to_save.title,
2065 e
2066 );
2067 } else {
2068 log::debug!(
2069 "Successfully cached Atomic Data document '{}'",
2070 doc_to_save.title
2071 );
2072 }
2073 });
2074 }
2075 }
2076 } else {
2077 let mut placeholder = Document {
2079 id: document.id.clone(),
2080 ..Default::default()
2081 };
2082 if let Ok(persisted_doc) = placeholder.load().await {
2083 if let Some(better_description) = persisted_doc.description {
2084 log::debug!("Replaced ripgrep description for '{}' with persistence description", document.title);
2085 document.description = Some(better_description);
2086 }
2087 } else {
2088 let normalized_id = normalize_filename_to_id(&document.title);
2091
2092 let mut normalized_placeholder = Document {
2093 id: normalized_id.clone(),
2094 ..Default::default()
2095 };
2096 if let Ok(persisted_doc) = normalized_placeholder.load().await {
2097 if let Some(better_description) = persisted_doc.description {
2098 log::debug!("Replaced ripgrep description for '{}' with persistence description (normalized from title: {})", document.title, normalized_id);
2099 document.description = Some(better_description);
2100 }
2101 } else {
2102 let normalized_id_with_md = format!("{}md", normalized_id);
2104 let mut md_placeholder = Document {
2105 id: normalized_id_with_md.clone(),
2106 ..Default::default()
2107 };
2108 if let Ok(persisted_doc) = md_placeholder.load().await {
2109 if let Some(better_description) = persisted_doc.description {
2110 log::debug!("Replaced ripgrep description for '{}' with persistence description (normalized with md: {})", document.title, normalized_id_with_md);
2111 document.description = Some(better_description);
2112 }
2113 } else {
2114 log::debug!("No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')", document.title, document.id, normalized_id, normalized_id_with_md);
2115 }
2116 }
2117 }
2118 }
2119 }
2120
2121 #[cfg(feature = "openrouter")]
2123 if role.has_llm_config() {
2124 log::debug!(
2125 "Applying OpenRouter AI summarization to {} search results for role '{}'",
2126 documents.len(),
2127 role.name
2128 );
2129 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2130 } else {
2131 log::debug!(
2133 "Applying LLM AI summarization to {} search results for role '{}'",
2134 documents.len(),
2135 role.name
2136 );
2137 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2138 }
2139
2140 if role.terraphim_it {
2142 log::debug!(
2143 "Applying KG preprocessing to {} search results for role '{}'",
2144 documents.len(),
2145 role.name
2146 );
2147 let mut processed_docs = Vec::new();
2148 for document in documents {
2149 let processed_doc =
2150 self.preprocess_document_content(document, &role).await?;
2151 processed_docs.push(processed_doc);
2152 }
2153 Ok(processed_docs)
2154 } else {
2155 Ok(documents)
2156 }
2157 }
2158 }
2159 }
2160
/// Returns `true` when `id` is exactly 16 bytes long and consists solely of
/// ASCII hexadecimal digits (`0-9`, `a-f`, `A-F`).
fn is_hash_based_id(id: &str) -> bool {
    const HASH_ID_LEN: usize = 16;

    if id.len() != HASH_ID_LEN {
        return false;
    }
    // Any byte of a multi-byte UTF-8 sequence is >= 0x80 and therefore never
    // an ASCII hex digit, so a byte-wise scan rejects non-ASCII input.
    id.bytes().all(|byte| byte.is_ascii_hexdigit())
}
2165
2166 pub async fn find_documents_for_kg_term(
2177 &mut self,
2178 role_name: &RoleName,
2179 term: &str,
2180 ) -> Result<Vec<Document>> {
2181 log::debug!(
2182 "Finding documents for KG term '{}' in role '{}'",
2183 term,
2184 role_name
2185 );
2186
2187 let thesaurus = self.ensure_thesaurus_loaded(role_name).await?;
2189
2190 let role = self.config_state.get_role(role_name).await.ok_or_else(|| {
2192 ServiceError::Config(format!("Role '{}' not found in config", role_name))
2193 })?;
2194
2195 let mut documents = Vec::new();
2196
2197 if let Some(kg_config) = &role.kg {
2201 log::debug!("Found KG config for role");
2202 if let Some(kg_local) = &kg_config.knowledge_graph_local {
2203 let mut potential_concepts = vec![term.to_string()];
2204
2205 log::debug!("Checking thesaurus for term '{}'", term);
2207
2208 let normalized_search_term =
2210 terraphim_types::NormalizedTermValue::new(term.to_string());
2211
2212 if let Some(root_concept) = thesaurus.get(&normalized_search_term) {
2214 log::debug!("Found root concept for '{}': {:?}", term, root_concept);
2215
2216 let root_concept_name = root_concept.value.as_str();
2218
2219 let concept_name = if let Some(url) = &root_concept.url {
2221 url.split('/')
2222 .next_back()
2223 .and_then(|s| s.strip_suffix(".md"))
2224 .unwrap_or(root_concept_name)
2225 } else {
2226 root_concept_name
2227 };
2228
2229 if !potential_concepts.contains(&concept_name.to_string()) {
2230 potential_concepts.push(concept_name.to_string());
2231 log::debug!(
2232 "Added concept from thesaurus: {} (root: {})",
2233 concept_name,
2234 root_concept_name
2235 );
2236 }
2237 } else {
2238 log::debug!("No direct mapping found for '{}' in thesaurus", term);
2239 }
2240
2241 log::debug!(
2242 "Trying {} potential concepts: {:?}",
2243 potential_concepts.len(),
2244 potential_concepts
2245 );
2246
2247 for concept in potential_concepts {
2249 let potential_kg_file = kg_local.path.join(format!("{}.md", concept));
2250 log::debug!("Looking for KG definition file: {:?}", potential_kg_file);
2251
2252 if potential_kg_file.exists() {
2253 log::info!("Found KG definition file: {:?}", potential_kg_file);
2254
2255 let file_path = potential_kg_file.to_string_lossy().to_string();
2257 if documents.iter().any(|d: &Document| d.url == file_path) {
2258 log::debug!("Skipping duplicate KG document: {}", file_path);
2259 continue;
2260 }
2261
2262 match std::fs::read_to_string(&potential_kg_file) {
2265 Ok(content) => {
2266 let mut kg_doc =
2267 Document::new(potential_kg_file.to_string_lossy().to_string());
2268 kg_doc.url = potential_kg_file.to_string_lossy().to_string();
2269 kg_doc.body = content.clone();
2270
2271 let title = content
2273 .lines()
2274 .find(|line| line.starts_with("# "))
2275 .map(|line| line.trim_start_matches("# ").trim())
2276 .unwrap_or(&concept)
2277 .to_string();
2278 kg_doc.title = title;
2279
2280 log::debug!(
2281 "Successfully loaded KG definition document: {}",
2282 kg_doc.title
2283 );
2284 documents.push(kg_doc);
2285
2286 break;
2288 }
2289 Err(e) => {
2290 log::warn!(
2291 "Failed to read KG definition file '{}': {}",
2292 potential_kg_file.display(),
2293 e
2294 );
2295 }
2296 }
2297 } else {
2298 log::debug!("KG definition file not found: {:?}", potential_kg_file);
2299 }
2300 }
2301 } else {
2302 log::debug!("No KG local config found");
2303 }
2304 } else {
2305 log::debug!("No KG config found for role");
2306 }
2307
2308 let rolegraph_sync = self
2310 .config_state
2311 .roles
2312 .get(role_name)
2313 .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))?;
2314
2315 let rolegraph = rolegraph_sync.lock().await;
2316 let document_ids = rolegraph.find_document_ids_for_term(term);
2317 drop(rolegraph); log::debug!(
2320 "Found {} document IDs from rolegraph for term '{}'",
2321 document_ids.len(),
2322 term
2323 );
2324
2325 for doc_id in &document_ids {
2327 if documents
2329 .iter()
2330 .any(|d| d.id == *doc_id || d.url == *doc_id)
2331 {
2332 log::debug!("Skipping duplicate document from rolegraph: {}", doc_id);
2333 continue;
2334 }
2335
2336 if doc_id.starts_with("http://") || doc_id.starts_with("https://") {
2339 log::debug!("Loading Atomic Data document '{}' from persistence", doc_id);
2341 let mut placeholder = Document {
2342 id: doc_id.clone(),
2343 ..Default::default()
2344 };
2345 match placeholder.load().await {
2346 Ok(loaded_doc) => {
2347 log::debug!(
2348 "Found cached Atomic Data document '{}' in persistence",
2349 doc_id
2350 );
2351 documents.push(loaded_doc);
2352 }
2353 Err(_) => {
2354 log::warn!("Atomic Data document '{}' not found in persistence - this may indicate the document hasn't been cached yet", doc_id);
2355 }
2358 }
2359 } else {
2360 let mut doc = Document::new(doc_id.clone());
2362 match doc.load().await {
2363 Ok(loaded_doc) => {
2364 documents.push(loaded_doc);
2365 log::trace!("Successfully loaded local document: {}", doc_id);
2366 }
2367 Err(e) => {
2368 log::warn!("Failed to load local document '{}': {}", doc_id, e);
2369
2370 if Self::is_hash_based_id(doc_id) {
2372 log::debug!("Document ID '{}' appears to be hash-based (legacy document), skipping for now", doc_id);
2373 log::info!("💡 Hash-based document IDs are deprecated. This document will be re-indexed with normalized IDs on next haystack search.");
2374 }
2377
2378 }
2380 }
2381 }
2382 }
2383
2384 if role.terraphim_it {
2386 log::info!(
2387 "🧠 Applying KG preprocessing to {} KG term documents for role '{}' (terraphim_it enabled)",
2388 documents.len(),
2389 role_name
2390 );
2391 let mut processed_documents = Vec::new();
2392 let mut total_kg_terms = 0;
2393 let mut docs_with_kg_links = 0;
2394
2395 for document in documents {
2396 let original_body_len = document.body.len();
2397 let processed_doc = self.preprocess_document_content(document, &role).await?;
2398
2399 let new_body_len = processed_doc.body.len();
2401 if new_body_len > original_body_len {
2402 docs_with_kg_links += 1;
2403 let estimated_links = (new_body_len - original_body_len) / 17;
2404 total_kg_terms += estimated_links;
2405 }
2406
2407 processed_documents.push(processed_doc);
2408 }
2409
2410 log::info!(
2411 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2412 processed_documents.len(),
2413 docs_with_kg_links,
2414 total_kg_terms
2415 );
2416 documents = processed_documents;
2417 } else {
2418 log::info!(
2419 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing for {} documents",
2420 role_name,
2421 documents.len()
2422 );
2423 }
2424
2425 let total_length = documents.len();
2428 for (idx, doc) in documents.iter_mut().enumerate() {
2429 let rank = (total_length - idx) as u64;
2430 doc.rank = Some(rank);
2431 log::trace!("Assigned rank {} to document '{}'", rank, doc.title);
2432 }
2433
2434 log::debug!(
2435 "Successfully loaded and processed {} documents for term '{}', ranks assigned from {} to 1",
2436 documents.len(),
2437 term,
2438 total_length
2439 );
2440 Ok(documents)
2441 }
2442
/// Generates a summary of `document.body` through the OpenRouter service.
///
/// Only compiled when the `openrouter` feature is enabled.
///
/// # Errors
///
/// Returns `ServiceError::OpenRouter` when the service cannot be constructed
/// or the summary request fails, and `ServiceError::Config` when the document
/// body is empty or whitespace-only.
#[cfg(feature = "openrouter")]
pub async fn generate_document_summary(
    &self,
    document: &Document,
    api_key: &str,
    model: &str,
    max_length: usize,
) -> Result<String> {
    use crate::openrouter::OpenRouterService;

    log::debug!(
        "Generating summary for document '{}' using model '{}'",
        document.id,
        model
    );

    // The client is built before the body check, so a construction failure
    // is reported even for an empty document.
    let client = match OpenRouterService::new(api_key, model) {
        Ok(client) => client,
        Err(err) => return Err(ServiceError::OpenRouter(err)),
    };

    let body = &document.body;
    if body.trim().is_empty() {
        return Err(ServiceError::Config(
            "Document body is empty, cannot generate summary".to_string(),
        ));
    }

    let summary = match client.generate_summary(body, max_length).await {
        Ok(text) => text,
        Err(err) => return Err(ServiceError::OpenRouter(err)),
    };

    log::info!(
        "Generated {}-character summary for document '{}' using model '{}'",
        summary.len(),
        document.id,
        model
    );

    Ok(summary)
}
2503
2504 #[cfg(not(feature = "openrouter"))]
2506 pub async fn generate_document_summary(
2507 &self,
2508 _document: &Document,
2509 _api_key: &str,
2510 _model: &str,
2511 _max_length: usize,
2512 ) -> Result<String> {
2513 Err(ServiceError::Config(
2514 "OpenRouter feature not enabled during compilation".to_string(),
2515 ))
2516 }
2517
2518 pub async fn fetch_config(&self) -> terraphim_config::Config {
2520 let current_config = self.config_state.config.lock().await;
2521 current_config.clone()
2522 }
2523
/// Looks up a role by name in the current configuration (test builds only).
///
/// # Errors
///
/// Returns `ServiceError::Config` when no role named `role_name` exists.
#[cfg(test)]
pub async fn get_role(&self, role_name: &RoleName) -> Result<Role> {
    let guard = self.config_state.config.lock().await;
    match guard.roles.get(role_name) {
        Some(found) => Ok(found.clone()),
        None => Err(ServiceError::Config(format!(
            "Role '{}' not found",
            role_name
        ))),
    }
}
2534
2535 pub async fn update_config(
2540 &self,
2541 config: terraphim_config::Config,
2542 ) -> Result<terraphim_config::Config> {
2543 let mut current_config = self.config_state.config.lock().await;
2544 *current_config = config.clone();
2545 current_config.save().await?;
2546 log::info!("Config updated");
2547 Ok(config)
2548 }
2549
2550 pub async fn update_selected_role(
2553 &self,
2554 role_name: terraphim_types::RoleName,
2555 ) -> Result<terraphim_config::Config> {
2556 let mut current_config = self.config_state.config.lock().await;
2557
2558 if !current_config.roles.contains_key(&role_name) {
2560 return Err(ServiceError::Config(format!(
2561 "Role `{}` not found in config",
2562 role_name
2563 )));
2564 }
2565
2566 current_config.selected_role = role_name.clone();
2567 current_config.save().await?;
2568
2569 if let Some(role) = current_config.roles.get(&role_name) {
2571 if role.terraphim_it {
2572 log::info!("🎯 Selected role '{}' → terraphim_it: ✅ ENABLED (KG preprocessing will be applied)", role_name);
2573 if role.kg.is_some() {
2574 log::info!("📚 KG configuration: Available for role '{}'", role_name);
2575 } else {
2576 log::warn!("⚠️ KG configuration: Missing for role '{}' (terraphim_it enabled but no KG)", role_name);
2577 }
2578 } else {
2579 log::info!(
2580 "🎯 Selected role '{}' → terraphim_it: ❌ DISABLED (KG preprocessing skipped)",
2581 role_name
2582 );
2583 }
2584 } else {
2585 log::info!("🎯 Selected role updated to '{}'", role_name);
2586 }
2587
2588 Ok(current_config.clone())
2589 }
2590
2591 fn highlight_search_terms(content: &str, search_query: &SearchQuery) -> String {
2596 let mut highlighted_content = content.to_string();
2597
2598 let terms = search_query.get_all_terms();
2600
2601 let mut sorted_terms: Vec<&str> = terms.iter().map(|t| t.as_str()).collect();
2603 sorted_terms.sort_by_key(|term| std::cmp::Reverse(term.len()));
2604
2605 for term in sorted_terms {
2606 if term.trim().is_empty() {
2607 continue;
2608 }
2609
2610 let escaped_term = regex::escape(term);
2613
2614 if let Ok(regex) = regex::RegexBuilder::new(&escaped_term)
2615 .case_insensitive(true)
2616 .build()
2617 {
2618 let highlight_open = "<mark class=\"search-highlight\">";
2621 let highlight_close = "</mark>";
2622
2623 highlighted_content = regex
2624 .replace_all(
2625 &highlighted_content,
2626 format!("{}{}{}", highlight_open, "$0", highlight_close),
2627 )
2628 .to_string();
2629 }
2630 }
2631
2632 highlighted_content
2633 }
2634}
2635
2636#[cfg(test)]
2637mod tests {
2638 use super::*;
2639 use terraphim_config::ConfigBuilder;
2640 use terraphim_types::NormalizedTermValue;
2641
2642 #[tokio::test]
2643 async fn test_get_config() {
2644 let mut config = ConfigBuilder::new()
2645 .build_default_desktop()
2646 .build()
2647 .unwrap();
2648 let config_state = ConfigState::new(&mut config).await.unwrap();
2649 let service = TerraphimService::new(config_state);
2650 let fetched_config = service.fetch_config().await;
2651 assert_eq!(fetched_config.id, terraphim_config::ConfigId::Desktop);
2652 }
2653
2654 #[tokio::test]
2655 async fn test_search_documents_selected_role() {
2656 let mut config = ConfigBuilder::new()
2657 .build_default_desktop()
2658 .build()
2659 .unwrap();
2660 let config_state = ConfigState::new(&mut config).await.unwrap();
2661 let mut service = TerraphimService::new(config_state);
2662 let search_term = NormalizedTermValue::new("terraphim".to_string());
2663 let documents = service
2664 .search_documents_selected_role(&search_term)
2665 .await
2666 .unwrap();
2667 assert!(documents.is_empty() || !documents.is_empty()); }
2669
2670 #[tokio::test]
2671 async fn test_ensure_thesaurus_loaded_terraphim_engineer() {
2672 let project_root =
2674 std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
2675 let kg_path = project_root.join("docs/src/kg");
2676
2677 if !kg_path.exists() {
2679 println!("⚠️ KG directory not found at {:?}, skipping test", kg_path);
2680 return;
2681 }
2682
2683 let mut config = ConfigBuilder::new()
2684 .build_default_desktop()
2685 .build()
2686 .unwrap();
2687
2688 if let Some(terr_eng_role) = config.roles.get_mut(&"Terraphim Engineer".into()) {
2690 if let Some(kg) = &mut terr_eng_role.kg {
2691 if let Some(kg_local) = &mut kg.knowledge_graph_local {
2692 kg_local.path = kg_path;
2693 }
2694 }
2695 }
2696
2697 let config_state = ConfigState::new(&mut config).await.unwrap();
2698 let mut service = TerraphimService::new(config_state);
2699
2700 let role_name = RoleName::new("Terraphim Engineer");
2701 let thesaurus_result = service.ensure_thesaurus_loaded(&role_name).await;
2702
2703 match thesaurus_result {
2704 Ok(thesaurus) => {
2705 println!(
2706 "✅ Successfully loaded thesaurus with {} entries",
2707 thesaurus.len()
2708 );
2709 assert!(!thesaurus.is_empty(), "Thesaurus should not be empty");
2711
2712 let has_terraphim = (&thesaurus)
2714 .into_iter()
2715 .any(|(term, _)| term.as_str().to_lowercase().contains("terraphim"));
2716 let has_graph = (&thesaurus)
2717 .into_iter()
2718 .any(|(term, _)| term.as_str().to_lowercase().contains("graph"));
2719
2720 println!(" Contains 'terraphim': {}", has_terraphim);
2721 println!(" Contains 'graph': {}", has_graph);
2722
2723 assert!(
2725 has_terraphim || has_graph,
2726 "Thesaurus should contain expected terms"
2727 );
2728 }
2729 Err(e) => {
2730 println!("❌ Failed to load thesaurus: {:?}", e);
2731 }
2734 }
2735 }
2736
2737 #[tokio::test]
2738 #[ignore = "Requires local KG fixtures at ~/.terraphim/kg"]
2739 async fn test_config_building_with_local_kg() {
2740 let mut config = ConfigBuilder::new()
2742 .build_default_desktop()
2743 .build()
2744 .unwrap();
2745 let config_state_result = ConfigState::new(&mut config).await;
2746
2747 match config_state_result {
2748 Ok(config_state) => {
2749 println!("✅ Successfully built config state");
2750 assert!(
2752 !config_state.roles.is_empty(),
2753 "Config state should have roles"
2754 );
2755
2756 let terraphim_engineer_role = RoleName::new("Terraphim Engineer");
2758 let has_terraphim_engineer =
2759 config_state.roles.contains_key(&terraphim_engineer_role);
2760 println!(" Has Terraphim Engineer role: {}", has_terraphim_engineer);
2761
2762 assert!(
2764 has_terraphim_engineer,
2765 "Terraphim Engineer role should exist"
2766 );
2767 }
2768 Err(e) => {
2769 println!("❌ Failed to build config state: {:?}", e);
2770 }
2773 }
2774 }
2775
2776 #[tokio::test]
2777 async fn test_atomic_data_persistence_skip() {
2778 use ahash::AHashMap;
2779 use terraphim_config::{Config, Haystack, Role, ServiceType};
2780 use terraphim_persistence::DeviceStorage;
2781 use terraphim_types::{NormalizedTermValue, RoleName, SearchQuery};
2782
2783 DeviceStorage::init_memory_only().await.unwrap();
2785
2786 let mut config = Config::default();
2788 let role_name = RoleName::new("test_role");
2789 let role = Role {
2790 shortname: None,
2791 name: "test_role".into(),
2792 haystacks: vec![Haystack {
2793 location: "test".to_string(),
2794 service: ServiceType::Ripgrep,
2795 read_only: false,
2796 atomic_server_secret: None,
2797 extra_parameters: std::collections::HashMap::new(),
2798 fetch_content: false,
2799 }],
2800 kg: None,
2801 terraphim_it: false,
2802 theme: "default".to_string(),
2803 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
2804 llm_enabled: false,
2805 llm_api_key: None,
2806 llm_model: None,
2807 llm_auto_summarize: false,
2808 llm_chat_enabled: false,
2809 llm_chat_system_prompt: None,
2810 llm_chat_model: None,
2811 llm_context_window: None,
2812 extra: AHashMap::new(),
2813 };
2814 config.roles.insert(role_name.clone(), role);
2815
2816 let config_state = ConfigState::new(&mut config).await.unwrap();
2817 let mut service = TerraphimService::new(config_state);
2818
2819 let search_query = SearchQuery {
2821 search_term: NormalizedTermValue::new("test".to_string()),
2822 search_terms: None,
2823 operator: None,
2824 limit: Some(10),
2825 skip: None,
2826 role: Some(role_name),
2827 };
2828
2829 let result = service.search(&search_query).await;
2832
2833 assert!(result.is_ok(), "Search should complete without errors");
2836 }
2837
2838 #[tokio::test]
2839 async fn test_atomic_data_caching() {
2840 use ahash::AHashMap;
2841 use terraphim_config::{Config, Haystack, Role, ServiceType};
2842 use terraphim_persistence::DeviceStorage;
2843 use terraphim_types::{Document, NormalizedTermValue, RoleName, SearchQuery};
2844
2845 DeviceStorage::init_memory_only().await.unwrap();
2847
2848 let mut config = Config::default();
2850 let role_name = RoleName::new("test_role");
2851 let role = Role {
2852 shortname: None,
2853 name: "test_role".into(),
2854 haystacks: vec![Haystack {
2855 location: "test".to_string(),
2856 service: ServiceType::Ripgrep,
2857 read_only: false,
2858 atomic_server_secret: None,
2859 extra_parameters: std::collections::HashMap::new(),
2860 fetch_content: false,
2861 }],
2862 kg: None,
2863 terraphim_it: false,
2864 theme: "default".to_string(),
2865 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
2866 llm_enabled: false,
2867 llm_api_key: None,
2868 llm_model: None,
2869 llm_auto_summarize: false,
2870 llm_chat_enabled: false,
2871 llm_chat_system_prompt: None,
2872 llm_chat_model: None,
2873 llm_context_window: None,
2874 extra: AHashMap::new(),
2875 };
2876 config.roles.insert(role_name.clone(), role);
2877
2878 let config_state = ConfigState::new(&mut config).await.unwrap();
2879 let mut service = TerraphimService::new(config_state);
2880
2881 let atomic_doc = Document {
2883 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
2884 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
2885 title: "Requested Loan Amount ($)".to_string(),
2886 body: "Form field for Requested Loan Amount ($)".to_string(),
2887 description: Some("Form field for Requested Loan Amount ($)".to_string()),
2888 summarization: None,
2889 stub: None,
2890 tags: None,
2891 rank: None,
2892 source_haystack: None,
2893 };
2894
2895 log::info!("Testing Atomic Data document caching...");
2897 match atomic_doc.save().await {
2898 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
2899 Err(e) => {
2900 log::error!("❌ Failed to save Atomic Data document: {}", e);
2901 panic!("Atomic Data document save failed");
2902 }
2903 }
2904
2905 let mut placeholder = Document {
2907 id: atomic_doc.id.clone(),
2908 ..Default::default()
2909 };
2910 match placeholder.load().await {
2911 Ok(loaded_doc) => {
2912 log::info!("✅ Successfully loaded Atomic Data document from persistence");
2913 assert_eq!(loaded_doc.title, atomic_doc.title);
2914 assert_eq!(loaded_doc.body, atomic_doc.body);
2915 assert_eq!(loaded_doc.description, atomic_doc.description);
2916 }
2917 Err(e) => {
2918 log::error!(
2919 "❌ Failed to load Atomic Data document from persistence: {}",
2920 e
2921 );
2922 panic!("Atomic Data document load failed");
2923 }
2924 }
2925
2926 let search_query = SearchQuery {
2928 search_term: NormalizedTermValue::new("test".to_string()),
2929 search_terms: None,
2930 operator: None,
2931 limit: Some(10),
2932 skip: None,
2933 role: Some(role_name),
2934 };
2935
2936 let result = service.search(&search_query).await;
2937 assert!(result.is_ok(), "Search should complete without errors");
2938
2939 log::info!("✅ All Atomic Data caching tests passed!");
2940 }
2941
2942 #[tokio::test]
2943 #[ignore = "Requires local KG fixtures at 'test' directory"]
2944 async fn test_kg_term_search_with_atomic_data() {
2945 use ahash::AHashMap;
2946 use std::path::PathBuf;
2947 use terraphim_config::{
2948 Config, Haystack, KnowledgeGraph, KnowledgeGraphLocal, Role, ServiceType,
2949 };
2950 use terraphim_persistence::DeviceStorage;
2951 use terraphim_types::{Document, KnowledgeGraphInputType, RoleName};
2952
2953 DeviceStorage::init_memory_only().await.unwrap();
2955
2956 let mut config = Config::default();
2958 let role_name = RoleName::new("test_kg_role");
2959 let role = Role {
2960 shortname: None,
2961 name: "test_kg_role".into(),
2962 haystacks: vec![Haystack {
2963 location: "test".to_string(),
2964 service: ServiceType::Ripgrep,
2965 read_only: false,
2966 atomic_server_secret: None,
2967 extra_parameters: std::collections::HashMap::new(),
2968 fetch_content: false,
2969 }],
2970 kg: Some(KnowledgeGraph {
2971 automata_path: None,
2972 knowledge_graph_local: Some(KnowledgeGraphLocal {
2973 input_type: KnowledgeGraphInputType::Markdown,
2974 path: PathBuf::from("test"),
2975 }),
2976 public: true,
2977 publish: true,
2978 }),
2979 terraphim_it: true,
2980 theme: "default".to_string(),
2981 relevance_function: terraphim_types::RelevanceFunction::TerraphimGraph,
2982 llm_enabled: false,
2983 llm_api_key: None,
2984 llm_model: None,
2985 llm_auto_summarize: false,
2986 llm_chat_enabled: false,
2987 llm_chat_system_prompt: None,
2988 llm_chat_model: None,
2989 llm_context_window: None,
2990 extra: AHashMap::new(),
2991 };
2992 config.roles.insert(role_name.clone(), role);
2993
2994 let config_state = ConfigState::new(&mut config).await.unwrap();
2995 let mut service = TerraphimService::new(config_state);
2996
2997 let atomic_doc = Document {
2999 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3000 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3001 title: "Requested Loan Amount ($)".to_string(),
3002 body: "Form field for Requested Loan Amount ($)".to_string(),
3003 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3004 summarization: None,
3005 stub: None,
3006 tags: None,
3007 rank: None,
3008 source_haystack: None,
3009 };
3010
3011 log::info!("Testing KG term search with Atomic Data documents...");
3013 match atomic_doc.save().await {
3014 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3015 Err(e) => {
3016 log::error!("❌ Failed to save Atomic Data document: {}", e);
3017 panic!("Atomic Data document save failed");
3018 }
3019 }
3020
3021 let result = service.find_documents_for_kg_term(&role_name, "test").await;
3025
3026 assert!(
3029 result.is_ok(),
3030 "find_documents_for_kg_term should complete without errors"
3031 );
3032
3033 let documents = result.unwrap();
3034 log::info!(
3035 "✅ KG term search completed successfully, found {} documents",
3036 documents.len()
3037 );
3038
3039 let atomic_doc_id = "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount";
3042 let mut placeholder = Document {
3043 id: atomic_doc_id.to_string(),
3044 ..Default::default()
3045 };
3046
3047 match placeholder.load().await {
3048 Ok(loaded_doc) => {
3049 log::info!("✅ Successfully loaded Atomic Data document from persistence in KG term search context");
3050 assert_eq!(loaded_doc.title, atomic_doc.title);
3051 assert_eq!(loaded_doc.body, atomic_doc.body);
3052 }
3053 Err(e) => {
3054 log::error!(
3055 "❌ Failed to load Atomic Data document in KG term search context: {}",
3056 e
3057 );
3058 panic!("Atomic Data document load failed in KG term search context");
3059 }
3060 }
3061
3062 log::info!("✅ All KG term search with Atomic Data tests passed!");
3063 }
3064
3065 #[tokio::test]
3066 async fn test_kg_term_search_rank_assignment() -> Result<()> {
3067 use ahash::AHashMap;
3068 use terraphim_config::{Config, Haystack, Role, ServiceType};
3069 use terraphim_persistence::DeviceStorage;
3070 use terraphim_types::{Document, RoleName};
3071
3072 DeviceStorage::init_memory_only().await.unwrap();
3074
3075 let mut config = Config::default();
3077 let role_name = RoleName::new("Test KG Role");
3078 let role = Role {
3079 shortname: Some("test-kg".to_string()),
3080 name: role_name.clone(),
3081 haystacks: vec![Haystack {
3082 location: "test".to_string(),
3083 service: ServiceType::Ripgrep,
3084 read_only: false,
3085 atomic_server_secret: None,
3086 extra_parameters: std::collections::HashMap::new(),
3087 fetch_content: false,
3088 }],
3089 kg: Some(terraphim_config::KnowledgeGraph {
3090 automata_path: Some(terraphim_automata::AutomataPath::local_example()),
3091 knowledge_graph_local: None,
3092 public: false,
3093 publish: false,
3094 }),
3095 terraphim_it: false,
3096 theme: "default".to_string(),
3097 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3098 llm_enabled: false,
3099 llm_api_key: None,
3100 llm_model: None,
3101 llm_auto_summarize: false,
3102 llm_chat_enabled: false,
3103 llm_chat_system_prompt: None,
3104 llm_chat_model: None,
3105 llm_context_window: None,
3106 extra: AHashMap::new(),
3107 };
3108 config.roles.insert(role_name.clone(), role);
3109
3110 let config_state = ConfigState::new(&mut config).await.unwrap();
3111 let _service = TerraphimService::new(config_state);
3112
3113 let test_documents = vec![
3115 Document {
3116 id: "test-doc-1".to_string(),
3117 title: "First Test Document".to_string(),
3118 body: "This is the first test document body".to_string(),
3119 url: "test://doc1".to_string(),
3120 description: Some("First document description".to_string()),
3121 summarization: None,
3122 stub: None,
3123 tags: Some(vec!["test".to_string(), "first".to_string()]),
3124 rank: None, source_haystack: None,
3126 },
3127 Document {
3128 id: "test-doc-2".to_string(),
3129 title: "Second Test Document".to_string(),
3130 body: "This is the second test document body".to_string(),
3131 url: "test://doc2".to_string(),
3132 description: Some("Second document description".to_string()),
3133 summarization: None,
3134 stub: None,
3135 tags: Some(vec!["test".to_string(), "second".to_string()]),
3136 rank: None, source_haystack: None,
3138 },
3139 Document {
3140 id: "test-doc-3".to_string(),
3141 title: "Third Test Document".to_string(),
3142 body: "This is the third test document body".to_string(),
3143 url: "test://doc3".to_string(),
3144 description: Some("Third document description".to_string()),
3145 summarization: None,
3146 stub: None,
3147 tags: Some(vec!["test".to_string(), "third".to_string()]),
3148 rank: None, source_haystack: None,
3150 },
3151 ];
3152
3153 for doc in &test_documents {
3155 doc.save().await.expect("Failed to save test document");
3156 }
3157
3158 let mut simulated_documents = test_documents.clone();
3164
3165 let total_length = simulated_documents.len();
3167 for (idx, doc) in simulated_documents.iter_mut().enumerate() {
3168 let rank = (total_length - idx) as u64;
3169 doc.rank = Some(rank);
3170 }
3171
3172 assert_eq!(simulated_documents.len(), 3, "Should have 3 test documents");
3174
3175 for doc in &simulated_documents {
3177 assert!(
3178 doc.rank.is_some(),
3179 "Document '{}' should have a rank assigned",
3180 doc.title
3181 );
3182 assert!(
3183 doc.rank.unwrap() > 0,
3184 "Document '{}' should have a positive rank",
3185 doc.title
3186 );
3187 }
3188
3189 assert_eq!(
3191 simulated_documents[0].rank,
3192 Some(3),
3193 "First document should have highest rank (3)"
3194 );
3195 assert_eq!(
3196 simulated_documents[1].rank,
3197 Some(2),
3198 "Second document should have rank 2"
3199 );
3200 assert_eq!(
3201 simulated_documents[2].rank,
3202 Some(1),
3203 "Third document should have rank 1"
3204 );
3205
3206 let mut ranks: Vec<u64> = simulated_documents
3208 .iter()
3209 .map(|doc| doc.rank.unwrap())
3210 .collect();
3211 ranks.sort_by(|a, b| b.cmp(a)); assert_eq!(
3213 ranks,
3214 vec![3, 2, 1],
3215 "Ranks should be unique and in descending order"
3216 );
3217
3218 log::info!("✅ KG term search rank assignment test completed successfully!");
3219 Ok(())
3220 }
3221}