1use ahash::AHashMap;
7use terraphim_automata::builder::{Logseq, ThesaurusBuilder, compute_kg_source_hash};
8use terraphim_automata::load_thesaurus;
9use terraphim_automata::{LinkType, replace_matches};
10use terraphim_config::{ConfigState, Role};
11use terraphim_middleware::thesaurus::build_thesaurus_from_haystack;
12use terraphim_persistence::Persistable;
13use terraphim_rolegraph::{RoleGraph, RoleGraphSync};
14use terraphim_types::{
15 Document, Index, IndexedDocument, Layer, NormalizedTermValue, RelevanceFunction, RoleName,
16 SearchQuery, Thesaurus,
17};
18mod score;
19use crate::score::Query;
20
21pub mod auto_route;
22pub use auto_route::{
23 AutoRouteContext, AutoRouteReason, AutoRouteResult, JMAP_MISSING_TOKEN_PENALTY,
24 auto_select_role,
25};
26
27#[cfg(feature = "openrouter")]
28pub mod openrouter;
29
30pub mod llm;
32
33pub mod llm_proxy;
39
40pub mod http_client;
44
45pub mod logging;
47
48pub mod conversation_service;
50pub mod rate_limiter;
51pub mod summarization_manager;
52pub mod summarization_queue;
53pub mod summarization_worker;
54
55pub mod error;
57
58pub mod context;
60
61#[cfg(test)]
62mod context_tests;
63
/// Normalizes a filename into a document id.
///
/// Every character that is not an ASCII letter or digit is dropped and the
/// remaining characters are lowercased, e.g. `"My-File_Name.md"` becomes
/// `"myfilenamemd"`.
///
/// This produces the same output as stripping `[^a-zA-Z0-9]+` with a regex and
/// lowercasing, but avoids compiling a regex on every call and has no panic
/// path.
fn normalize_filename_to_id(filename: &str) -> String {
    filename
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
71
/// Errors produced by the service layer.
///
/// Most variants wrap an error coming from another component; the `#[from]`
/// attributes let those errors be converted with `?`.
#[derive(thiserror::Error, Debug)]
pub enum ServiceError {
    /// An error reported by `terraphim_middleware`.
    #[error("Middleware error: {0}")]
    Middleware(#[from] terraphim_middleware::Error),

    /// An OpenDAL error, stored boxed (presumably to keep this enum small).
    #[error("OpenDal error: {0}")]
    OpenDal(Box<opendal::Error>),

    /// An error reported by `terraphim_persistence`.
    #[error("Persistence error: {0}")]
    Persistence(#[from] terraphim_persistence::Error),

    /// A configuration problem described by a free-form message.
    #[error("Config error: {0}")]
    Config(String),

    /// An OpenRouter error; only present with the `openrouter` feature.
    #[cfg(feature = "openrouter")]
    #[error("OpenRouter error: {0}")]
    OpenRouter(#[from] crate::openrouter::OpenRouterError),

    /// An error of the crate's shared `CommonError` type.
    #[error("Common error: {0}")]
    Common(#[from] crate::error::CommonError),
}
94
95impl From<opendal::Error> for ServiceError {
96 fn from(err: opendal::Error) -> Self {
97 ServiceError::OpenDal(Box::new(err))
98 }
99}
100
101impl crate::error::TerraphimError for ServiceError {
102 fn category(&self) -> crate::error::ErrorCategory {
103 use crate::error::ErrorCategory;
104 match self {
105 ServiceError::Middleware(_) => ErrorCategory::Integration,
106 ServiceError::OpenDal(_) => ErrorCategory::Storage,
107 ServiceError::Persistence(_) => ErrorCategory::Storage,
108 ServiceError::Config(_) => ErrorCategory::Configuration,
109 #[cfg(feature = "openrouter")]
110 ServiceError::OpenRouter(_) => ErrorCategory::Integration,
111 ServiceError::Common(err) => err.category(),
112 }
113 }
114
115 fn is_recoverable(&self) -> bool {
116 match self {
117 ServiceError::Middleware(_) => true,
118 ServiceError::OpenDal(_) => false,
119 ServiceError::Persistence(_) => false,
120 ServiceError::Config(_) => false,
121 #[cfg(feature = "openrouter")]
122 ServiceError::OpenRouter(_) => true,
123 ServiceError::Common(err) => err.is_recoverable(),
124 }
125 }
126}
127
/// Convenience alias for results whose error type is [`ServiceError`].
pub type Result<T> = std::result::Result<T, ServiceError>;
129
/// Service object whose methods operate on a shared [`ConfigState`].
pub struct TerraphimService {
    /// Configuration state (config plus per-role rolegraphs) used by every method.
    config_state: ConfigState,
}
134
135impl TerraphimService {
136 pub fn new(config_state: ConfigState) -> Self {
138 Self { config_state }
139 }
140
141 async fn build_thesaurus(&mut self, search_query: &SearchQuery) -> Result<()> {
143 Ok(build_thesaurus_from_haystack(&mut self.config_state, search_query).await?)
144 }
145 pub async fn ensure_thesaurus_loaded(&mut self, role_name: &RoleName) -> Result<Thesaurus> {
147 async fn load_thesaurus_from_automata_path(
148 config_state: &ConfigState,
149 role_name: &RoleName,
150 rolegraphs: &mut AHashMap<RoleName, RoleGraphSync>,
151 ) -> Result<Thesaurus> {
152 let role = {
158 let config = config_state.config.lock().await;
159 let Some(role) = config.roles.get(role_name).cloned() else {
160 return Err(ServiceError::Config(format!(
161 "Role '{}' not found in config",
162 role_name
163 )));
164 };
165 role
166 };
167 if let Some(kg) = &role.kg {
168 if let Some(automata_path) = &kg.automata_path {
169 log::info!("Loading Role `{}` - URL: {:?}", role_name, automata_path);
170
171 match load_thesaurus(automata_path).await {
173 Ok(mut thesaurus) => {
174 log::info!("Successfully loaded thesaurus from automata path");
175
176 match thesaurus.save().await {
178 Ok(_) => {
179 log::info!(
180 "Thesaurus for role `{}` saved to persistence",
181 role_name
182 );
183 match thesaurus.load().await {
185 Ok(persisted_thesaurus) => {
186 thesaurus = persisted_thesaurus;
187 log::debug!("Reloaded thesaurus from persistence");
188 }
189 Err(e) => {
190 log::warn!(
191 "Failed to reload thesaurus from persistence, using in-memory version: {:?}",
192 e
193 );
194 }
195 }
196 }
197 Err(e) => {
198 log::warn!("Failed to save thesaurus to persistence: {:?}", e);
199 }
200 }
201
202 let rolegraph =
203 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
204 match rolegraph {
205 Ok(rolegraph) => {
206 let rolegraph_value = RoleGraphSync::from(rolegraph);
207 rolegraphs.insert(role_name.clone(), rolegraph_value);
208 }
209 Err(e) => {
210 log::error!("Failed to update role and thesaurus: {:?}", e)
211 }
212 }
213 Ok(thesaurus)
214 }
215 Err(e) => {
216 log::warn!("Failed to load thesaurus from automata path: {:?}", e);
217 if let Some(kg_local) = &kg.knowledge_graph_local {
219 log::info!(
220 "Fallback: building thesaurus from local KG for role {}",
221 role_name
222 );
223 let logseq_builder = Logseq::default();
224 match logseq_builder
225 .build(
226 role_name.as_lowercase().to_string(),
227 kg_local.path.clone(),
228 )
229 .await
230 {
231 Ok(mut thesaurus) => {
232 match thesaurus.save().await {
234 Ok(_) => {
235 log::info!(
236 "Fallback thesaurus for role `{}` saved to persistence",
237 role_name
238 );
239 match thesaurus.load().await {
241 Ok(persisted_thesaurus) => {
242 thesaurus = persisted_thesaurus;
243 log::debug!(
244 "Reloaded fallback thesaurus from persistence"
245 );
246 }
247 Err(e) => {
248 log::warn!(
249 "Failed to reload fallback thesaurus from persistence, using in-memory version: {:?}",
250 e
251 );
252 }
253 }
254 }
255 Err(e) => {
256 log::warn!(
257 "Failed to save fallback thesaurus to persistence: {:?}",
258 e
259 );
260 }
261 }
262
263 let rolegraph =
264 RoleGraph::new(role_name.clone(), thesaurus.clone())
265 .await;
266 match rolegraph {
267 Ok(rolegraph) => {
268 let rolegraph_value =
269 RoleGraphSync::from(rolegraph);
270 rolegraphs
271 .insert(role_name.clone(), rolegraph_value);
272 }
273 Err(e) => log::error!(
274 "Failed to update role and thesaurus: {:?}",
275 e
276 ),
277 }
278
279 Ok(thesaurus)
280 }
281 Err(e) => {
282 let is_file_not_found =
285 e.to_string().contains("file not found")
286 || e.to_string().contains("not found:");
287
288 if is_file_not_found {
289 log::debug!(
290 "Failed to build thesaurus from local KG (optional file not found) for role {}: {:?}",
291 role_name,
292 e
293 );
294 } else {
295 log::error!(
296 "Failed to build thesaurus from local KG for role {}: {:?}",
297 role_name,
298 e
299 );
300 }
301 Err(ServiceError::Config(
302 "Failed to load or build thesaurus".into(),
303 ))
304 }
305 }
306 } else {
307 log::warn!(
308 "No fallback available for role {}: no local KG path configured, returning empty thesaurus",
309 role_name
310 );
311 Ok(Thesaurus::new(role_name.as_lowercase().to_string()))
312 }
313 }
314 }
315 } else if let Some(kg_local) = &kg.knowledge_graph_local {
316 log::info!(
318 "Role {} has no automata_path, building thesaurus from local KG files at {:?}",
319 role_name,
320 kg_local.path
321 );
322 let logseq_builder = Logseq::default();
323 match logseq_builder
324 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
325 .await
326 {
327 Ok(mut thesaurus) => {
328 log::info!(
329 "Successfully built thesaurus from local KG for role {}",
330 role_name
331 );
332
333 match thesaurus.save().await {
335 Ok(_) => {
336 log::info!(
337 "Local KG thesaurus for role `{}` saved to persistence",
338 role_name
339 );
340 match thesaurus.load().await {
342 Ok(persisted_thesaurus) => {
343 log::info!(
344 "Reloaded local KG thesaurus from persistence: {} entries",
345 persisted_thesaurus.len()
346 );
347 thesaurus = persisted_thesaurus;
348 }
349 Err(e) => {
350 log::warn!(
351 "Failed to reload local KG thesaurus from persistence, using in-memory version: {:?}",
352 e
353 );
354 }
355 }
356 }
357 Err(e) => {
358 log::warn!(
359 "Failed to save local KG thesaurus to persistence: {:?}",
360 e
361 );
362 }
363 }
364
365 let rolegraph =
366 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
367 match rolegraph {
368 Ok(rolegraph) => {
369 let rolegraph_value = RoleGraphSync::from(rolegraph);
370 rolegraphs.insert(role_name.clone(), rolegraph_value);
371 }
372 Err(e) => {
373 log::error!("Failed to update role and thesaurus: {:?}", e)
374 }
375 }
376
377 Ok(thesaurus)
378 }
379 Err(e) => {
380 let is_file_not_found = e.to_string().contains("file not found");
383
384 if is_file_not_found {
385 log::debug!(
386 "Failed to build thesaurus from local KG (optional file not found) for role {}: {:?}",
387 role_name,
388 e
389 );
390 } else {
391 log::error!(
392 "Failed to build thesaurus from local KG for role {}: {:?}",
393 role_name,
394 e
395 );
396 }
397 Err(ServiceError::Config(format!(
398 "Failed to build thesaurus from local KG for role {}: {}",
399 role_name, e
400 )))
401 }
402 }
403 } else {
404 log::warn!(
405 "Role {} is configured for TerraphimGraph but has neither automata_path nor knowledge_graph_local defined.",
406 role_name
407 );
408 if let Some(kg_local) = &kg.knowledge_graph_local {
409 log::info!(
411 "Building thesaurus from local KG files for role {} at {:?}",
412 role_name,
413 kg_local.path
414 );
415 let logseq_builder = Logseq::default();
416 match logseq_builder
417 .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
418 .await
419 {
420 Ok(mut thesaurus) => {
421 log::info!(
422 "Successfully built thesaurus from local KG for role {}",
423 role_name
424 );
425
426 match thesaurus.save().await {
428 Ok(_) => {
429 log::info!(
430 "No-automata thesaurus for role `{}` saved to persistence",
431 role_name
432 );
433 match thesaurus.load().await {
435 Ok(persisted_thesaurus) => {
436 thesaurus = persisted_thesaurus;
437 log::debug!(
438 "Reloaded no-automata thesaurus from persistence"
439 );
440 }
441 Err(e) => {
442 log::warn!(
443 "Failed to reload no-automata thesaurus from persistence, using in-memory version: {:?}",
444 e
445 );
446 }
447 }
448 }
449 Err(e) => {
450 log::warn!(
451 "Failed to save no-automata thesaurus to persistence: {:?}",
452 e
453 );
454 }
455 }
456
457 let rolegraph =
458 RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
459 match rolegraph {
460 Ok(rolegraph) => {
461 let rolegraph_value = RoleGraphSync::from(rolegraph);
462 rolegraphs.insert(role_name.clone(), rolegraph_value);
463 }
464 Err(e) => {
465 let is_file_not_found =
468 e.to_string().contains("file not found");
469
470 if is_file_not_found {
471 log::debug!(
472 "Failed to update role and thesaurus (optional file not found): {:?}",
473 e
474 );
475 } else {
476 log::error!(
477 "Failed to update role and thesaurus: {:?}",
478 e
479 );
480 }
481 }
482 }
483
484 Ok(thesaurus)
485 }
486 Err(e) => {
487 log::error!(
488 "Failed to build thesaurus from local KG for role {}: {:?}",
489 role_name,
490 e
491 );
492 Err(ServiceError::Config(
493 "Failed to build thesaurus from local KG".into(),
494 ))
495 }
496 }
497 } else {
498 log::debug!(
499 "Role '{}' has no local KG path, returning empty thesaurus",
500 role_name
501 );
502 Ok(Thesaurus::new(role_name.as_lowercase().to_string()))
503 }
504 }
505 } else {
506 log::debug!("Role '{}' has no knowledge graph configured", role_name);
507 Err(ServiceError::Config(format!(
508 "Knowledge graph not configured for role '{}'",
509 role_name
510 )))
511 }
512 }
513
514 log::debug!("Loading thesaurus for role: {}", role_name);
515 log::debug!("Role keys {:?}", self.config_state.roles.keys());
516
517 if let Some(rolegraph_value) = self.config_state.roles.get(role_name) {
518 let thesaurus_result = rolegraph_value.lock().await.thesaurus.clone().load().await;
519 match thesaurus_result {
520 Ok(thesaurus) => {
521 log::debug!("Thesaurus loaded: {:?}", thesaurus);
522 log::info!("Rolegraph loaded: for role name {:?}", role_name);
523
524 let is_stale = if let Some(ref cached_hash) = thesaurus.source_hash {
526 let role = {
527 let config = self.config_state.config.lock().await;
528 config.roles.get(role_name).cloned()
529 };
530 if let Some(role) = role {
531 if let Some(ref kg) = role.kg {
532 if let Some(ref kg_local) = kg.knowledge_graph_local {
533 match compute_kg_source_hash(&kg_local.path) {
534 Ok(Some(current_hash)) => {
535 let stale = current_hash != *cached_hash;
536 if stale {
537 log::info!(
538 "Thesaurus cache stale for role '{}': hash mismatch (cached {} != current {})",
539 role_name,
540 cached_hash,
541 current_hash
542 );
543 }
544 stale
545 }
546 Ok(None) => {
547 log::debug!(
548 "No markdown files found in KG path {:?}",
549 kg_local.path
550 );
551 false
552 }
553 Err(e) => {
554 log::warn!(
555 "Failed to compute source hash for role '{}': {}",
556 role_name,
557 e
558 );
559 false
560 }
561 }
562 } else {
563 false
564 }
565 } else {
566 false
567 }
568 } else {
569 false
570 }
571 } else {
572 log::debug!(
573 "No source_hash in cached thesaurus for role '{}'",
574 role_name
575 );
576 false
577 };
578
579 if is_stale {
580 let mut rolegraphs = self.config_state.roles.clone();
581 let result = load_thesaurus_from_automata_path(
582 &self.config_state,
583 role_name,
584 &mut rolegraphs,
585 )
586 .await;
587
588 if result.is_ok() {
589 if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
590 self.config_state
591 .roles
592 .insert(role_name.clone(), updated_rolegraph.clone());
593 log::info!(
594 "Updated config_state with rebuilt rolegraph for role: {}",
595 role_name
596 );
597 }
598 }
599 result
600 } else {
601 Ok(thesaurus)
602 }
603 }
604 Err(e) => {
605 let is_file_not_found = e.to_string().contains("file not found")
608 || e.to_string().contains("not found:");
609
610 if is_file_not_found {
611 log::debug!("Thesaurus file not found (optional): {:?}", e);
612 } else {
613 log::error!("Failed to load thesaurus: {:?}", e);
614 }
615 let mut rolegraphs = self.config_state.roles.clone();
617 let result = load_thesaurus_from_automata_path(
618 &self.config_state,
619 role_name,
620 &mut rolegraphs,
621 )
622 .await;
623
624 if result.is_ok() {
626 if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
627 self.config_state
628 .roles
629 .insert(role_name.clone(), updated_rolegraph.clone());
630 log::info!(
631 "Updated config_state with new rolegraph for role: {}",
632 role_name
633 );
634 }
635 }
636
637 result
638 }
639 }
640 } else {
641 let mut rolegraphs = self.config_state.roles.clone();
643 let result =
644 load_thesaurus_from_automata_path(&self.config_state, role_name, &mut rolegraphs)
645 .await;
646
647 if result.is_ok() {
649 if let Some(new_rolegraph) = rolegraphs.get(role_name) {
650 self.config_state
651 .roles
652 .insert(role_name.clone(), new_rolegraph.clone());
653 log::info!(
654 "Added new rolegraph to config_state for role: {}",
655 role_name
656 );
657 }
658 }
659
660 result
661 }
662 }
663
664 pub async fn preprocess_document_content(
670 &mut self,
671 mut document: Document,
672 role: &Role,
673 ) -> Result<Document> {
674 if !role.terraphim_it {
676 log::info!(
677 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing",
678 role.name
679 );
680 return Ok(document);
681 }
682
683 let Some(_kg) = &role.kg else {
684 log::info!(
685 "⚠️ No KG configured for role '{}', skipping KG preprocessing",
686 role.name
687 );
688 return Ok(document);
689 };
690
691 log::info!(
692 "🧠 Starting KG preprocessing for document '{}' in role '{}' (terraphim_it enabled)",
693 document.title,
694 role.name
695 );
696 log::debug!(
697 "📄 Document preview: {} characters starting with: {}",
698 document.body.len(),
699 &document.body.chars().take(100).collect::<String>()
700 );
701
702 let thesaurus = match self.ensure_thesaurus_loaded(&role.name).await {
704 Ok(thesaurus) => thesaurus,
705 Err(e) => {
706 log::warn!("Failed to load thesaurus for role {}: {:?}", role.name, e);
707 return Ok(document); }
709 };
710
711 let mut kg_thesaurus = Thesaurus::new(format!("kg_links_{}", role.name));
713
714 let important_kg_terms = [
717 "graph",
718 "haystack",
719 "service",
720 "terraphim",
721 "knowledge",
722 "embedding",
723 "search",
724 "automata",
725 "thesaurus",
726 "rolegraph",
727 ];
728
729 let excluded_common_terms = [
731 "system",
732 "config",
733 "configuration",
734 "type",
735 "method",
736 "function",
737 "class",
738 "component",
739 "module",
740 "library",
741 "framework",
742 "interface",
743 "api",
744 "data",
745 "file",
746 "path",
747 "url",
748 "string",
749 "number",
750 "value",
751 "option",
752 "parameter",
753 "field",
754 "property",
755 "attribute",
756 "element",
757 "item",
758 "object",
759 "array",
760 "list",
761 "map",
762 "set",
763 "collection",
764 "server",
765 "client",
766 "request",
767 "response",
768 "error",
769 "result",
770 "success",
771 "failure",
772 "true",
773 "false",
774 "null",
775 "undefined",
776 "empty",
777 "full",
778 "start",
779 "end",
780 "begin",
781 "finish",
782 "create",
783 "delete",
784 "update",
785 "read",
786 "write",
787 "load",
788 "save",
789 "process",
790 "handle",
791 "manage",
792 "control",
793 "execute",
794 "run",
795 "call",
796 "invoke",
797 "trigger",
798 "event",
799 "action",
800 "command",
801 "query",
802 "search",
803 "filter",
804 "sort",
805 "order",
806 "group",
807 "match",
808 "find",
809 "replace",
810 "insert",
811 "remove",
812 "add",
813 "set",
814 "get",
815 "put",
816 "post",
817 "head",
818 "patch",
819 "delete",
820 ];
821
822 let mut sorted_terms: Vec<_> = (&thesaurus)
823 .into_iter()
824 .filter(|(key, _)| {
825 let term = key.as_str();
826
827 if term.is_empty() || term.len() < 3 {
829 return false;
830 }
831
832 if important_kg_terms.contains(&term) {
834 return true;
835 }
836
837 if excluded_common_terms.contains(&term) {
839 return false;
840 }
841
842 term.len() > 5
848 || term.contains('-')
849 || term.contains('_')
850 || term.chars().next().is_some_and(|c| c.is_uppercase())
851 })
852 .collect();
853
854 #[allow(clippy::unnecessary_sort_by)]
856 sorted_terms.sort_by(|a, b| {
857 let a_important = important_kg_terms.contains(&a.0.as_str());
858 let b_important = important_kg_terms.contains(&b.0.as_str());
859
860 match (a_important, b_important) {
861 (true, false) => std::cmp::Ordering::Less, (false, true) => std::cmp::Ordering::Greater, _ => b.1.id.cmp(&a.1.id), }
865 });
866
867 let max_kg_terms = 8;
869 for (key, value) in sorted_terms.into_iter().take(max_kg_terms) {
870 let mut kg_value = value.clone();
871 kg_value.value = key.clone(); kg_value.url = Some(format!("kg:{}", value.value)); kg_thesaurus.insert(key.clone(), kg_value);
877 }
878
879 let kg_terms_count = kg_thesaurus.len();
880 log::info!(
881 "📋 KG thesaurus filtering: {} → {} terms (prioritizing: {}, filters: len>5, hyphenated, or important KG terms)",
882 thesaurus.len(),
883 kg_terms_count,
884 important_kg_terms.join(", ")
885 );
886
887 if kg_terms_count > 0 {
889 let terms: Vec<String> = (&kg_thesaurus)
890 .into_iter()
891 .map(|(k, v)| format!("'{}' → kg:{}", k, v.value))
892 .collect();
893 log::info!("🔍 KG terms selected for linking: {}", terms.join(", "));
894 } else {
895 log::info!(
896 "⚠️ No KG terms passed filtering criteria - document '{}' will have no KG links",
897 document.title
898 );
899 }
900
901 if !kg_thesaurus.is_empty() {
903 let debug_thesaurus: Vec<String> = (&kg_thesaurus)
905 .into_iter()
906 .map(|(k, v)| format!("'{}' -> '{}' (url: {:?})", k, v.value, v.url))
907 .take(3) .collect();
909 log::info!(
910 "🔧 Passing to replace_matches: {} (total terms: {})",
911 debug_thesaurus.join(", "),
912 kg_thesaurus.len()
913 );
914 let preview = if document.body.chars().count() > 200 {
915 document.body.chars().take(200).collect::<String>() + "..."
916 } else {
917 document.body.clone()
918 };
919 log::info!("📝 Document body preview (first 200 chars): {}", preview);
920
921 match replace_matches(&document.body, kg_thesaurus, LinkType::MarkdownLinks) {
922 Ok(processed_bytes) => {
923 match String::from_utf8(processed_bytes) {
924 Ok(processed_content) => {
925 log::info!(
926 "✅ Successfully preprocessed document '{}' with {} KG terms → created [term](kg:concept) links",
927 document.title,
928 kg_terms_count
929 );
930
931 let content_changed = processed_content != document.body;
933 log::info!(
934 "🔄 Content changed: {} (original: {} chars, processed: {} chars)",
935 content_changed,
936 document.body.len(),
937 processed_content.len()
938 );
939
940 let kg_links: Vec<&str> = processed_content
942 .split("[")
943 .filter_map(|s| s.find("](kg:").map(|closing| &s[..closing]))
944 .collect();
945
946 if !kg_links.is_empty() {
947 log::info!(
948 "🔗 Found KG links in processed content: [{}](kg:...)",
949 kg_links.join("], [")
950 );
951
952 let snippet = snippet_around(&processed_content, "](kg:", 50, 100);
953 if !snippet.is_empty() {
954 log::info!(
955 "📄 Content snippet with KG link: ...{}...",
956 snippet
957 );
958 }
959 } else {
960 log::warn!(
961 "⚠️ No KG links found in processed content despite successful replacement"
962 );
963 }
964
965 document.body = processed_content;
966 }
967 Err(e) => {
968 log::warn!(
969 "Failed to convert processed content to UTF-8 for document '{}': {:?}",
970 document.title,
971 e
972 );
973 }
974 }
975 }
976 Err(e) => {
977 log::warn!(
978 "Failed to replace KG terms in document '{}': {:?}",
979 document.title,
980 e
981 );
982 }
983 }
984 } else {
985 log::info!(
986 "💭 No specific KG terms found for document '{}' (filters excluded generic terms)",
987 document.title
988 );
989 }
990
991 Ok(document)
992 }
993
994 pub async fn preprocess_document_content_with_search(
996 &mut self,
997 document: Document,
998 role: &Role,
999 search_query: Option<&SearchQuery>,
1000 ) -> Result<Document> {
1001 let mut processed_doc = self.preprocess_document_content(document, role).await?;
1003
1004 if let Some(query) = search_query {
1006 log::debug!(
1007 "Applying search term highlighting to document '{}'",
1008 processed_doc.title
1009 );
1010 processed_doc.body = Self::highlight_search_terms(&processed_doc.body, query);
1011 }
1012
1013 Ok(processed_doc)
1014 }
1015
1016 pub async fn create_document(&mut self, document: Document) -> Result<Document> {
1018 document.save().await?;
1021
1022 self.config_state.add_to_roles(&document).await?;
1025
1026 use terraphim_config::ServiceType;
1030 use terraphim_middleware::indexer::RipgrepIndexer;
1031
1032 let ripgrep = RipgrepIndexer::default();
1033 let config_snapshot = { self.config_state.config.lock().await.clone() };
1034
1035 for role in config_snapshot.roles.values() {
1036 for haystack in &role.haystacks {
1037 if haystack.service == ServiceType::Ripgrep && !haystack.read_only {
1038 if let Err(e) = ripgrep.update_document(&document).await {
1039 log::warn!(
1040 "Failed to write document {} to haystack {:?}: {:?}",
1041 document.id,
1042 haystack.location,
1043 e
1044 );
1045 }
1046 }
1047 }
1048 }
1049
1050 Ok(document)
1051 }
1052
1053 pub async fn get_document_by_id(&mut self, document_id: &str) -> Result<Option<Document>> {
1059 log::debug!("Getting document by ID: '{}'", document_id);
1060
1061 if document_id.trim().is_empty() {
1063 log::warn!("Empty or whitespace-only document_id provided");
1064 return Ok(None);
1065 }
1066
1067 let mut placeholder = Document {
1069 id: document_id.to_string(),
1070 ..Default::default()
1071 };
1072 match placeholder.load().await {
1073 Ok(doc) => {
1074 log::debug!("Found document '{}' with direct ID lookup", document_id);
1075 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1076 }
1077 Err(e) => {
1078 log::debug!(
1079 "Document '{}' not found with direct lookup: {:?}",
1080 document_id,
1081 e
1082 );
1083 }
1084 }
1085
1086 if document_id.contains('.') || document_id.contains('-') || document_id.contains('_') {
1088 let normalized_id = normalize_filename_to_id(document_id);
1089 log::debug!(
1090 "Trying normalized ID '{}' for filename '{}'",
1091 normalized_id,
1092 document_id
1093 );
1094
1095 let mut normalized_placeholder = Document {
1096 id: normalized_id.clone(),
1097 ..Default::default()
1098 };
1099 match normalized_placeholder.load().await {
1100 Ok(doc) => {
1101 log::debug!(
1102 "Found document '{}' with normalized ID '{}'",
1103 document_id,
1104 normalized_id
1105 );
1106 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1107 }
1108 Err(e) => {
1109 log::debug!(
1110 "Document '{}' not found with normalized ID '{}': {:?}",
1111 document_id,
1112 normalized_id,
1113 e
1114 );
1115 }
1116 }
1117 }
1118
1119 log::debug!("Falling back to search for document '{}'", document_id);
1121 let search_query = SearchQuery {
1122 search_term: NormalizedTermValue::new(document_id.to_string()),
1123 search_terms: None,
1124 operator: None,
1125 limit: Some(5), skip: None,
1127 role: None,
1128 layer: Layer::default(),
1129 include_pinned: false,
1130 min_quality: None,
1131 };
1132
1133 let documents = self.search(&search_query).await?;
1134
1135 for doc in documents {
1137 if doc.title == document_id || doc.id == document_id {
1138 log::debug!("Found document '{}' via search fallback", document_id);
1139 return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1140 }
1141 }
1142
1143 log::debug!("Document '{}' not found anywhere", document_id);
1144 Ok(None)
1145 }
1146
1147 async fn apply_kg_preprocessing_if_needed(&mut self, document: Document) -> Result<Document> {
1153 log::debug!(
1154 "🔍 [KG-DEBUG] apply_kg_preprocessing_if_needed called for document: '{}'",
1155 document.title
1156 );
1157 log::debug!(
1158 "🔍 [KG-DEBUG] Document body preview: {}",
1159 document.body.chars().take(100).collect::<String>()
1160 );
1161
1162 let role = {
1163 let config = self.config_state.config.lock().await;
1164 let selected_role = &config.selected_role;
1165
1166 log::debug!("🔍 [KG-DEBUG] Selected role: '{}'", selected_role);
1167
1168 match config.roles.get(selected_role) {
1169 Some(role) => {
1170 log::debug!(
1171 "🔍 [KG-DEBUG] Role found: '{}', terraphim_it: {}",
1172 role.name,
1173 role.terraphim_it
1174 );
1175 role.clone() }
1177 None => {
1178 log::warn!(
1179 "❌ [KG-DEBUG] Selected role '{}' not found in config, skipping KG preprocessing",
1180 selected_role
1181 );
1182 return Ok(document);
1183 }
1184 }
1185 }; if !role.terraphim_it {
1189 log::info!(
1190 "🔍 [KG-DEBUG] terraphim_it disabled for role '{}', skipping KG preprocessing",
1191 role.name
1192 );
1193 return Ok(document);
1194 }
1195
1196 let has_existing_kg_links = document.body.contains("](kg:");
1198 log::debug!(
1199 "🔍 [KG-DEBUG] Document already has KG links: {}",
1200 has_existing_kg_links
1201 );
1202 if has_existing_kg_links {
1203 log::info!(
1204 "🔍 [KG-DEBUG] Document '{}' already has KG links, skipping preprocessing to prevent double processing",
1205 document.title
1206 );
1207 return Ok(document);
1208 }
1209
1210 log::info!(
1211 "🧠 [KG-DEBUG] Starting KG preprocessing for document '{}' with role '{}' (terraphim_it enabled)",
1212 document.title,
1213 role.name
1214 );
1215
1216 let document_title = document.title.clone(); let processed_doc = match self.preprocess_document_content(document, &role).await {
1219 Ok(doc) => {
1220 let links_added = doc.body.contains("](kg:");
1221 log::info!(
1222 "✅ [KG-DEBUG] KG preprocessing completed for document '{}'. Links added: {}",
1223 doc.title,
1224 links_added
1225 );
1226 if links_added {
1227 log::debug!(
1228 "🔍 [KG-DEBUG] Processed body preview: {}",
1229 doc.body.chars().take(200).collect::<String>()
1230 );
1231 }
1232 doc
1233 }
1234 Err(e) => {
1235 log::error!(
1236 "❌ [KG-DEBUG] KG preprocessing failed for document '{}': {:?}",
1237 document_title,
1238 e
1239 );
1240 return Err(e);
1241 }
1242 };
1243
1244 Ok(processed_doc)
1245 }
1246
1247 #[allow(dead_code)] async fn enhance_descriptions_with_ai(
1253 &self,
1254 mut documents: Vec<Document>,
1255 role: &Role,
1256 ) -> Result<Vec<Document>> {
1257 use crate::llm::{SummarizeOptions, build_llm_from_role};
1258
1259 eprintln!("🤖 Attempting to build LLM client for role: {}", role.name);
1260 let llm = match build_llm_from_role(role) {
1261 Some(client) => {
1262 eprintln!("✅ LLM client successfully created: {}", client.name());
1263 client
1264 }
1265 None => {
1266 eprintln!("❌ No LLM client available for role: {}", role.name);
1267 return Ok(documents);
1268 }
1269 };
1270
1271 log::info!(
1272 "Enhancing {} document descriptions with LLM provider: {}",
1273 documents.len(),
1274 llm.name()
1275 );
1276
1277 let mut enhanced_count = 0;
1278 let mut error_count = 0;
1279
1280 for document in &mut documents {
1281 if self.should_generate_ai_summary(document) {
1282 let summary_length = 250;
1283 match llm
1284 .summarize(
1285 &document.body,
1286 SummarizeOptions {
1287 max_length: summary_length,
1288 },
1289 )
1290 .await
1291 {
1292 Ok(ai_summary) => {
1293 log::debug!(
1294 "Generated AI summary for '{}': {} characters",
1295 document.title,
1296 ai_summary.len()
1297 );
1298 document.description = Some(ai_summary);
1299 enhanced_count += 1;
1300 }
1301 Err(e) => {
1302 log::warn!(
1303 "Failed to generate AI summary for '{}': {}",
1304 document.title,
1305 e
1306 );
1307 error_count += 1;
1308 }
1309 }
1310 }
1311 }
1312
1313 log::info!(
1314 "LLM enhancement complete: {} enhanced, {} errors, {} skipped",
1315 enhanced_count,
1316 error_count,
1317 documents.len() - enhanced_count - error_count
1318 );
1319
1320 Ok(documents)
1321 }
1322
1323 #[allow(dead_code)] fn should_generate_ai_summary(&self, document: &Document) -> bool {
1329 if document.body.trim().len() < 200 {
1331 return false;
1332 }
1333
1334 if let Some(ref description) = document.description {
1336 if description.len() > 100 && !description.ends_with("...") {
1338 return false;
1339 }
1340 }
1341
1342 if document.body.len() > 8000 {
1344 return false;
1345 }
1346
1347 true
1349 }
1350
1351 async fn get_search_role(&self, search_query: &SearchQuery) -> Result<Role> {
1353 let search_role = match &search_query.role {
1354 Some(role) => role.clone(),
1355 None => self.config_state.get_default_role().await,
1356 };
1357
1358 log::debug!("Searching for role: {:?}", search_role);
1359 let Some(role) = self.config_state.get_role(&search_role).await else {
1360 return Err(ServiceError::Config(format!(
1361 "Role `{}` not found in config",
1362 search_role
1363 )));
1364 };
1365 Ok(role)
1366 }
1367
1368 fn is_word_boundary_char(c: char) -> bool {
1371 !c.is_alphanumeric() && c != '_'
1372 }
1373
1374 fn is_at_word_boundary(text: &str, start: usize, end: usize) -> bool {
1378 let before_ok = if start == 0 {
1379 true
1380 } else {
1381 text[..start]
1382 .chars()
1383 .last()
1384 .map(Self::is_word_boundary_char)
1385 .unwrap_or(true)
1386 };
1387
1388 let after_ok = if end >= text.len() {
1389 true
1390 } else {
1391 text[end..]
1392 .chars()
1393 .next()
1394 .map(Self::is_word_boundary_char)
1395 .unwrap_or(true)
1396 };
1397
1398 before_ok && after_ok
1399 }
1400
1401 fn term_matches_with_word_boundaries(term: &str, text: &str) -> bool {
1405 let mut start = 0;
1407 while let Some(pos) = text[start..].find(term) {
1408 let abs_start = start + pos;
1409 let abs_end = abs_start + term.len();
1410
1411 if Self::is_at_word_boundary(text, abs_start, abs_end) {
1412 return true;
1413 }
1414 start = abs_end;
1415 }
1416 false
1417 }
1418
1419 pub async fn apply_logical_operators_to_documents(
1421 &mut self,
1422 search_query: &SearchQuery,
1423 documents: Vec<Document>,
1424 ) -> Result<Vec<Document>> {
1425 use terraphim_types::LogicalOperator;
1426
1427 let all_terms = search_query.get_all_terms();
1428 let operator = search_query.get_operator();
1429
1430 let initial_doc_count = documents.len();
1431
1432 log::debug!(
1433 "Applying {:?} operator to {} documents with {} search terms",
1434 operator,
1435 initial_doc_count,
1436 all_terms.len()
1437 );
1438
1439 let terms_lower: Vec<String> = all_terms
1441 .iter()
1442 .map(|t| t.as_str().to_lowercase())
1443 .collect();
1444
1445 let filtered_docs: Vec<Document> = documents
1446 .into_iter()
1447 .filter(|doc| {
1448 let searchable_text = format!(
1450 "{} {} {}",
1451 doc.title.to_lowercase(),
1452 doc.body.to_lowercase(),
1453 doc.description
1454 .as_ref()
1455 .unwrap_or(&String::new())
1456 .to_lowercase()
1457 );
1458
1459 match operator {
1460 LogicalOperator::And => {
1461 terms_lower.iter().all(|term| {
1463 Self::term_matches_with_word_boundaries(term, &searchable_text)
1464 })
1465 }
1466 LogicalOperator::Or => {
1467 terms_lower.iter().any(|term| {
1469 Self::term_matches_with_word_boundaries(term, &searchable_text)
1470 })
1471 }
1472 }
1473 })
1474 .collect();
1475
1476 log::debug!(
1477 "Logical operator filtering: {} -> {} documents",
1478 initial_doc_count,
1479 filtered_docs.len()
1480 );
1481
1482 let combined_query_string = terms_lower.join(" ");
1484 let query = Query::new(&combined_query_string);
1485 let sorted_docs = score::sort_documents(&query, filtered_docs);
1486
1487 Ok(sorted_docs)
1488 }
1489
1490 pub async fn search_documents_selected_role(
1493 &mut self,
1494 search_term: &NormalizedTermValue,
1495 ) -> Result<Vec<Document>> {
1496 let role = self.config_state.get_selected_role().await;
1497 let documents = self
1498 .search(&SearchQuery {
1499 search_term: search_term.clone(),
1500 search_terms: None,
1501 operator: None,
1502 role: Some(role),
1503 skip: None,
1504 limit: None,
1505 layer: Layer::default(),
1506 include_pinned: false,
1507 min_quality: None,
1508 })
1509 .await?;
1510 Ok(documents)
1511 }
1512
1513 fn apply_min_quality_filter(docs: Vec<Document>, min_quality: Option<f64>) -> Vec<Document> {
1518 let Some(threshold) = min_quality else {
1519 return docs;
1520 };
1521 let threshold = threshold.clamp(0.0, 1.0);
1522 docs.into_iter()
1523 .filter(|doc| {
1524 doc.quality_score
1525 .as_ref()
1526 .map(|qs| qs.composite() >= threshold)
1527 .unwrap_or(false)
1528 })
1529 .collect()
1530 }
1531
1532 pub async fn search(&mut self, search_query: &SearchQuery) -> Result<Vec<Document>> {
1534 log::debug!("Role for searching: {:?}", search_query.role);
1536 let role = self.get_search_role(search_query).await?;
1537
1538 log::trace!("Building index for search query: {:?}", search_query);
1539 let index: Index =
1540 terraphim_middleware::search_haystacks(self.config_state.clone(), search_query.clone())
1541 .await?;
1542
1543 let min_quality = search_query.min_quality;
1544
1545 let docs_result: Result<Vec<Document>> = match role.relevance_function {
1546 RelevanceFunction::TitleScorer => {
1547 log::debug!("Searching haystack with title scorer");
1548
1549 let documents = index.get_all_documents();
1550
1551 log::debug!("Sorting documents by relevance");
1552
1553 let documents = if search_query.is_multi_term_query() {
1554 self.apply_logical_operators_to_documents(search_query, documents)
1556 .await?
1557 } else {
1558 let query = Query::new(&search_query.search_term.to_string());
1560 score::sort_documents(&query, documents)
1561 };
1562 let total_length = documents.len();
1563 let mut docs_ranked = Vec::new();
1564 for (idx, doc) in documents.iter().enumerate() {
1565 let mut document: terraphim_types::Document = doc.clone();
1566 let rank = (total_length - idx).try_into().unwrap();
1567 document.rank = Some(rank);
1568
1569 if document.id.starts_with("http://") || document.id.starts_with("https://") {
1571 log::debug!(
1573 "Processing Atomic Data document '{}' (URL: {})",
1574 document.title,
1575 document.id
1576 );
1577
1578 let mut placeholder = Document {
1580 id: document.id.clone(),
1581 ..Default::default()
1582 };
1583 match placeholder.load().await {
1584 Ok(persisted_doc) => {
1585 log::debug!(
1587 "Found cached Atomic Data document '{}' in persistence",
1588 document.title
1589 );
1590 if let Some(better_description) = persisted_doc.description {
1591 document.description = Some(better_description);
1592 }
1593 if !persisted_doc.body.is_empty() && !role.terraphim_it {
1597 log::debug!(
1598 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
1599 document.title,
1600 role.name,
1601 role.terraphim_it
1602 );
1603 document.body = persisted_doc.body;
1604 } else if role.terraphim_it {
1605 log::debug!(
1606 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
1607 document.title,
1608 role.name
1609 );
1610 }
1611 }
1612 Err(_) => {
1613 log::debug!(
1615 "Caching Atomic Data document '{}' to persistence for future queries",
1616 document.title
1617 );
1618
1619 let doc_to_save = document.clone();
1621 tokio::spawn(async move {
1622 if let Err(e) = doc_to_save.save().await {
1623 log::warn!(
1624 "Failed to cache Atomic Data document '{}': {}",
1625 doc_to_save.title,
1626 e
1627 );
1628 } else {
1629 log::debug!(
1630 "Successfully cached Atomic Data document '{}'",
1631 doc_to_save.title
1632 );
1633 }
1634 });
1635 }
1636 }
1637 } else {
1638 let should_lookup_persistence = document
1640 .get_source_haystack()
1641 .and_then(|source| {
1642 role.haystacks
1643 .iter()
1644 .find(|haystack| haystack.location == *source)
1645 })
1646 .map(|haystack| haystack.fetch_content)
1647 .unwrap_or(true);
1648
1649 if !should_lookup_persistence {
1650 log::trace!(
1651 "Skipping persistence lookup for '{}' (haystack fetch_content=false)",
1652 document.title
1653 );
1654 } else {
1655 let mut placeholder = Document {
1656 id: document.id.clone(),
1657 ..Default::default()
1658 };
1659 if let Ok(persisted_doc) = placeholder.load().await {
1660 if let Some(better_description) = persisted_doc.description {
1661 log::debug!(
1662 "Replaced ripgrep description for '{}' with persistence description",
1663 document.title
1664 );
1665 document.description = Some(better_description);
1666 }
1667 } else {
1668 let normalized_id = normalize_filename_to_id(&document.title);
1671
1672 let mut normalized_placeholder = Document {
1673 id: normalized_id.clone(),
1674 ..Default::default()
1675 };
1676 if let Ok(persisted_doc) = normalized_placeholder.load().await {
1677 if let Some(better_description) = persisted_doc.description {
1678 log::debug!(
1679 "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
1680 document.title,
1681 normalized_id
1682 );
1683 document.description = Some(better_description);
1684 }
1685 } else {
1686 let normalized_id_with_md = format!("{}md", normalized_id);
1688 let mut md_placeholder = Document {
1689 id: normalized_id_with_md.clone(),
1690 ..Default::default()
1691 };
1692 if let Ok(persisted_doc) = md_placeholder.load().await {
1693 if let Some(better_description) = persisted_doc.description
1694 {
1695 log::debug!(
1696 "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
1697 document.title,
1698 normalized_id_with_md
1699 );
1700 document.description = Some(better_description);
1701 }
1702 } else {
1703 log::debug!(
1704 "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
1705 document.title,
1706 document.id,
1707 normalized_id,
1708 normalized_id_with_md
1709 );
1710 }
1711 }
1712 }
1713 }
1714 }
1715
1716 docs_ranked.push(document);
1717 }
1718
1719 #[cfg(feature = "openrouter")]
1722 if role.has_llm_config() && role.llm_auto_summarize {
1723 log::debug!(
1724 "Applying OpenRouter AI summarization to {} search results for role '{}'",
1725 docs_ranked.len(),
1726 role.name
1727 );
1728 docs_ranked = self
1729 .enhance_descriptions_with_ai(docs_ranked, &role)
1730 .await?;
1731 } else {
1732 eprintln!(
1734 "📋 Entering LLM AI summarization branch for role: {}",
1735 role.name
1736 );
1737 log::debug!(
1738 "Applying LLM AI summarization to {} search results for role '{}'",
1739 docs_ranked.len(),
1740 role.name
1741 );
1742 docs_ranked = self
1743 .enhance_descriptions_with_ai(docs_ranked, &role)
1744 .await?;
1745 }
1746
1747 if role.terraphim_it {
1749 log::info!(
1750 "🧠 Applying KG preprocessing to {} TerraphimGraph search results for role '{}'",
1751 docs_ranked.len(),
1752 role.name
1753 );
1754 let mut processed_docs = Vec::new();
1755 let mut total_kg_terms = 0;
1756 let mut docs_with_kg_links = 0;
1757
1758 for document in docs_ranked {
1759 let original_body_len = document.body.len();
1760 let processed_doc =
1761 self.preprocess_document_content(document, &role).await?;
1762
1763 let new_body_len = processed_doc.body.len();
1765 if new_body_len > original_body_len {
1766 docs_with_kg_links += 1;
1767 let estimated_links = (new_body_len - original_body_len) / 17;
1769 total_kg_terms += estimated_links;
1770 }
1771
1772 processed_docs.push(processed_doc);
1773 }
1774
1775 log::info!(
1776 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1777 processed_docs.len(),
1778 docs_with_kg_links,
1779 total_kg_terms
1780 );
1781 Ok(processed_docs)
1782 } else {
1783 Ok(docs_ranked)
1784 }
1785 }
1786 RelevanceFunction::BM25 => {
1787 log::debug!("Searching haystack with BM25 scorer");
1788
1789 let documents = index.get_all_documents();
1790
1791 log::debug!("Sorting documents by BM25 relevance");
1792
1793 let documents = if search_query.is_multi_term_query() {
1794 let filtered_docs = self
1796 .apply_logical_operators_to_documents(search_query, documents)
1797 .await?;
1798 let combined_query_string = search_query
1800 .get_all_terms()
1801 .iter()
1802 .map(|t| t.as_str())
1803 .collect::<Vec<_>>()
1804 .join(" ");
1805 let query =
1806 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25);
1807 score::sort_documents(&query, filtered_docs)
1808 } else {
1809 let query = Query::new(&search_query.search_term.to_string())
1811 .name_scorer(score::QueryScorer::BM25);
1812 score::sort_documents(&query, documents)
1813 };
1814 let total_length = documents.len();
1815 let mut docs_ranked = Vec::new();
1816 for (idx, doc) in documents.iter().enumerate() {
1817 let mut document: terraphim_types::Document = doc.clone();
1818 let rank = (total_length - idx).try_into().unwrap();
1819 document.rank = Some(rank);
1820 docs_ranked.push(document);
1821 }
1822
1823 #[cfg(feature = "openrouter")]
1825 if role.has_llm_config() && role.llm_auto_summarize {
1826 log::debug!(
1827 "Applying OpenRouter AI summarization to {} BM25 search results for role '{}'",
1828 docs_ranked.len(),
1829 role.name
1830 );
1831 docs_ranked = self
1832 .enhance_descriptions_with_ai(docs_ranked, &role)
1833 .await?;
1834 } else {
1835 log::debug!(
1837 "Applying LLM AI summarization to {} BM25 search results for role '{}'",
1838 docs_ranked.len(),
1839 role.name
1840 );
1841 docs_ranked = self
1842 .enhance_descriptions_with_ai(docs_ranked, &role)
1843 .await?;
1844 }
1845
1846 if role.terraphim_it {
1848 log::info!(
1849 "🧠 Applying KG preprocessing to {} BM25 search results for role '{}'",
1850 docs_ranked.len(),
1851 role.name
1852 );
1853 let mut processed_docs = Vec::new();
1854 let mut total_kg_terms = 0;
1855 let mut docs_with_kg_links = 0;
1856
1857 for document in docs_ranked {
1858 let original_body_len = document.body.len();
1859 let processed_doc =
1860 self.preprocess_document_content(document, &role).await?;
1861
1862 let new_body_len = processed_doc.body.len();
1864 if new_body_len > original_body_len {
1865 docs_with_kg_links += 1;
1866 let estimated_links = (new_body_len - original_body_len) / 17;
1867 total_kg_terms += estimated_links;
1868 }
1869
1870 processed_docs.push(processed_doc);
1871 }
1872
1873 log::info!(
1874 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1875 processed_docs.len(),
1876 docs_with_kg_links,
1877 total_kg_terms
1878 );
1879 Ok(processed_docs)
1880 } else {
1881 Ok(docs_ranked)
1882 }
1883 }
1884 RelevanceFunction::BM25F => {
1885 log::debug!("Searching haystack with BM25F scorer");
1886
1887 let documents = index.get_all_documents();
1888
1889 log::debug!("Sorting documents by BM25F relevance");
1890
1891 let documents = if search_query.is_multi_term_query() {
1892 let filtered_docs = self
1894 .apply_logical_operators_to_documents(search_query, documents)
1895 .await?;
1896 let combined_query_string = search_query
1898 .get_all_terms()
1899 .iter()
1900 .map(|t| t.as_str())
1901 .collect::<Vec<_>>()
1902 .join(" ");
1903 let query =
1904 Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25F);
1905 score::sort_documents(&query, filtered_docs)
1906 } else {
1907 let query = Query::new(&search_query.search_term.to_string())
1909 .name_scorer(score::QueryScorer::BM25F);
1910 score::sort_documents(&query, documents)
1911 };
1912 let total_length = documents.len();
1913 let mut docs_ranked = Vec::new();
1914 for (idx, doc) in documents.iter().enumerate() {
1915 let mut document: terraphim_types::Document = doc.clone();
1916 let rank = (total_length - idx).try_into().unwrap();
1917 document.rank = Some(rank);
1918 docs_ranked.push(document);
1919 }
1920
1921 #[cfg(feature = "openrouter")]
1923 if role.has_llm_config() && role.llm_auto_summarize {
1924 log::debug!(
1925 "Applying OpenRouter AI summarization to {} BM25F search results for role '{}'",
1926 docs_ranked.len(),
1927 role.name
1928 );
1929 docs_ranked = self
1930 .enhance_descriptions_with_ai(docs_ranked, &role)
1931 .await?;
1932 } else {
1933 log::debug!(
1935 "Applying LLM AI summarization to {} BM25F search results for role '{}'",
1936 docs_ranked.len(),
1937 role.name
1938 );
1939 docs_ranked = self
1940 .enhance_descriptions_with_ai(docs_ranked, &role)
1941 .await?;
1942 }
1943
1944 if role.terraphim_it {
1946 log::info!(
1947 "🧠 Applying KG preprocessing to {} BM25F search results for role '{}'",
1948 docs_ranked.len(),
1949 role.name
1950 );
1951 let mut processed_docs = Vec::new();
1952 let mut total_kg_terms = 0;
1953 let mut docs_with_kg_links = 0;
1954
1955 for document in docs_ranked {
1956 let original_body_len = document.body.len();
1957 let processed_doc =
1958 self.preprocess_document_content(document, &role).await?;
1959
1960 let new_body_len = processed_doc.body.len();
1962 if new_body_len > original_body_len {
1963 docs_with_kg_links += 1;
1964 let estimated_links = (new_body_len - original_body_len) / 17;
1965 total_kg_terms += estimated_links;
1966 }
1967
1968 processed_docs.push(processed_doc);
1969 }
1970
1971 log::info!(
1972 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1973 processed_docs.len(),
1974 docs_with_kg_links,
1975 total_kg_terms
1976 );
1977 Ok(processed_docs)
1978 } else {
1979 Ok(docs_ranked)
1980 }
1981 }
1982 RelevanceFunction::BM25Plus => {
1983 log::debug!("Searching haystack with BM25Plus scorer");
1984
1985 let documents = index.get_all_documents();
1986
1987 log::debug!("Sorting documents by BM25Plus relevance");
1988
1989 let documents = if search_query.is_multi_term_query() {
1990 let filtered_docs = self
1992 .apply_logical_operators_to_documents(search_query, documents)
1993 .await?;
1994 let combined_query_string = search_query
1996 .get_all_terms()
1997 .iter()
1998 .map(|t| t.as_str())
1999 .collect::<Vec<_>>()
2000 .join(" ");
2001 let query = Query::new(&combined_query_string)
2002 .name_scorer(score::QueryScorer::BM25Plus);
2003 score::sort_documents(&query, filtered_docs)
2004 } else {
2005 let query = Query::new(&search_query.search_term.to_string())
2007 .name_scorer(score::QueryScorer::BM25Plus);
2008 score::sort_documents(&query, documents)
2009 };
2010 let total_length = documents.len();
2011 let mut docs_ranked = Vec::new();
2012 for (idx, doc) in documents.iter().enumerate() {
2013 let mut document: terraphim_types::Document = doc.clone();
2014 let rank = (total_length - idx).try_into().unwrap();
2015 document.rank = Some(rank);
2016 docs_ranked.push(document);
2017 }
2018
2019 #[cfg(feature = "openrouter")]
2021 if role.has_llm_config() && role.llm_auto_summarize {
2022 log::debug!(
2023 "Applying OpenRouter AI summarization to {} BM25Plus search results for role '{}'",
2024 docs_ranked.len(),
2025 role.name
2026 );
2027 docs_ranked = self
2028 .enhance_descriptions_with_ai(docs_ranked, &role)
2029 .await?;
2030 }
2031
2032 if role.terraphim_it {
2034 log::info!(
2035 "🧠 Applying KG preprocessing to {} BM25Plus search results for role '{}'",
2036 docs_ranked.len(),
2037 role.name
2038 );
2039 let mut processed_docs = Vec::new();
2040 let mut total_kg_terms = 0;
2041 let mut docs_with_kg_links = 0;
2042
2043 for document in docs_ranked {
2044 let original_body_len = document.body.len();
2045 let processed_doc =
2046 self.preprocess_document_content(document, &role).await?;
2047
2048 let new_body_len = processed_doc.body.len();
2050 if new_body_len > original_body_len {
2051 docs_with_kg_links += 1;
2052 let estimated_links = (new_body_len - original_body_len) / 17;
2053 total_kg_terms += estimated_links;
2054 }
2055
2056 processed_docs.push(processed_doc);
2057 }
2058
2059 log::info!(
2060 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2061 processed_docs.len(),
2062 docs_with_kg_links,
2063 total_kg_terms
2064 );
2065 Ok(processed_docs)
2066 } else {
2067 Ok(docs_ranked)
2068 }
2069 }
2070 RelevanceFunction::TerraphimGraph => {
2071 log::debug!("TerraphimGraph search initiated for role: {}", role.name);
2072 self.build_thesaurus(search_query).await?;
2073 let _thesaurus = self.ensure_thesaurus_loaded(&role.name).await?;
2074 let scored_index_docs: Vec<IndexedDocument> = self
2075 .config_state
2076 .search_indexed_documents(search_query, &role)
2077 .await;
2078
2079 log::debug!(
2080 "TerraphimGraph search found {} indexed documents",
2081 scored_index_docs.len()
2082 );
2083
2084 log::debug!("Ranking documents with thesaurus");
2087 let mut documents = index.get_documents(scored_index_docs.clone());
2088
2089 let all_haystack_docs = index.get_all_documents();
2092 log::debug!(
2093 "Found {} total documents from haystacks, checking which need indexing",
2094 all_haystack_docs.len()
2095 );
2096 let mut need_reindexing = false;
2097
2098 if let Some(rolegraph_sync) = self.config_state.roles.get(&role.name) {
2099 let mut rolegraph = rolegraph_sync.lock().await;
2100 let mut newly_indexed = 0;
2101
2102 for doc in &all_haystack_docs {
2103 if !rolegraph.has_document(&doc.id) && !doc.body.is_empty() {
2105 log::debug!(
2106 "Indexing new document '{}' into rolegraph for TerraphimGraph search",
2107 doc.id
2108 );
2109 rolegraph.insert_document(&doc.id, doc.clone());
2110
2111 drop(rolegraph);
2114 if let Err(e) = doc.save().await {
2115 log::warn!(
2116 "Failed to save document '{}' to persistence: {}",
2117 doc.id,
2118 e
2119 );
2120 } else {
2121 log::debug!(
2122 "Successfully saved document '{}' to persistence",
2123 doc.id
2124 );
2125 }
2126 rolegraph = rolegraph_sync.lock().await;
2128
2129 newly_indexed += 1;
2130 }
2131 }
2132
2133 if newly_indexed > 0 {
2134 log::info!(
2135 "✅ Indexed {} new documents into rolegraph for role '{}'",
2136 newly_indexed,
2137 role.name
2138 );
2139 log::debug!(
2140 "RoleGraph now has {} nodes, {} edges, {} documents",
2141 rolegraph.get_node_count(),
2142 rolegraph.get_edge_count(),
2143 rolegraph.get_document_count()
2144 );
2145 need_reindexing = true; }
2147 }
2148
2149 let mut documents_with_content = Vec::new();
2152
2153 for mut document in documents {
2154 if document.body.is_empty() {
2156 log::debug!(
2157 "Document '{}' has empty body, attempting to load from persistence",
2158 document.id
2159 );
2160
2161 let mut full_doc = Document::new(document.id.clone());
2163 match full_doc.load().await {
2164 Ok(loaded_doc) => {
2165 if !loaded_doc.body.is_empty() {
2166 log::info!(
2167 "✅ Loaded body content for document '{}' from persistence",
2168 document.id
2169 );
2170 document.body = loaded_doc.body.clone();
2171 if loaded_doc.description.is_some() {
2172 document.description = loaded_doc.description.clone();
2173 }
2174
2175 if let Some(rolegraph_sync) =
2177 self.config_state.roles.get(&role.name)
2178 {
2179 let mut rolegraph = rolegraph_sync.lock().await;
2180 rolegraph.insert_document(&document.id, loaded_doc);
2181 need_reindexing = true;
2182 log::debug!(
2183 "Re-indexed document '{}' into rolegraph with content",
2184 document.id
2185 );
2186 }
2187 } else {
2188 log::warn!(
2189 "Document '{}' still has empty body after loading from persistence",
2190 document.id
2191 );
2192 }
2193 }
2194 Err(e) => {
2195 log::warn!(
2196 "Failed to load document '{}' from persistence: {}",
2197 document.id,
2198 e
2199 );
2200
2201 if document.url.starts_with('/')
2203 || document.url.starts_with("docs/")
2204 {
2205 match tokio::fs::read_to_string(&document.url).await {
2206 Ok(content) => {
2207 log::info!(
2208 "✅ Loaded content for '{}' from file: {}",
2209 document.id,
2210 document.url
2211 );
2212 document.body = content.clone();
2213
2214 let full_doc = Document {
2216 id: document.id.clone(),
2217 title: document.title.clone(),
2218 body: content,
2219 url: document.url.clone(),
2220 description: document.description.clone(),
2221 summarization: document.summarization.clone(),
2222 stub: None,
2223 tags: document.tags.clone(),
2224 rank: document.rank,
2225 source_haystack: document.source_haystack.clone(),
2226 doc_type: terraphim_types::DocumentType::KgEntry,
2227 synonyms: None,
2228 route: None,
2229 priority: None,
2230 quality_score: None,
2231 };
2232
2233 if let Err(e) = full_doc.save().await {
2235 log::warn!(
2236 "Failed to save document '{}' to persistence: {}",
2237 document.id,
2238 e
2239 );
2240 }
2241
2242 if let Some(rolegraph_sync) =
2244 self.config_state.roles.get(&role.name)
2245 {
2246 let mut rolegraph = rolegraph_sync.lock().await;
2247 rolegraph.insert_document(&document.id, full_doc);
2248 need_reindexing = true;
2249 log::debug!(
2250 "Re-indexed document '{}' into rolegraph from file",
2251 document.id
2252 );
2253 }
2254 }
2255 Err(file_e) => {
2256 log::warn!(
2257 "Failed to read file '{}' for document '{}': {}",
2258 document.url,
2259 document.id,
2260 file_e
2261 );
2262 }
2263 }
2264 }
2265 }
2266 }
2267 }
2268 documents_with_content.push(document);
2269 }
2270
2271 documents = documents_with_content;
2272
2273 if need_reindexing {
2274 log::info!("🔄 Re-running TerraphimGraph search after indexing new documents");
2275
2276 let updated_scored_docs: Vec<IndexedDocument> = self
2278 .config_state
2279 .search_indexed_documents(search_query, &role)
2280 .await;
2281
2282 if !updated_scored_docs.is_empty() {
2283 log::debug!(
2284 "✅ Updated rolegraph search found {} documents",
2285 updated_scored_docs.len()
2286 );
2287 let updated_documents = index.get_documents(updated_scored_docs);
2289 if !updated_documents.is_empty() {
2290 documents = updated_documents;
2291 }
2292 }
2293 }
2294
2295 if documents.is_empty() && !all_haystack_docs.is_empty() {
2296 log::info!(
2297 "TerraphimGraph returned no results for role '{}'; falling back to lexical haystack ranking",
2298 role.name
2299 );
2300 documents = if search_query.is_multi_term_query() {
2301 let filtered_docs = self
2302 .apply_logical_operators_to_documents(
2303 search_query,
2304 all_haystack_docs.clone(),
2305 )
2306 .await?;
2307 let combined_query_string = search_query
2308 .get_all_terms()
2309 .iter()
2310 .map(|t| t.as_str())
2311 .collect::<Vec<_>>()
2312 .join(" ");
2313 let query = Query::new(&combined_query_string);
2314 score::sort_documents(&query, filtered_docs)
2315 } else {
2316 let query = Query::new(&search_query.search_term.to_string());
2317 score::sort_documents(&query, all_haystack_docs.clone())
2318 };
2319 }
2320
2321 if !documents.is_empty() {
2323 log::debug!(
2324 "Applying TF-IDF scoring to {} documents for enhanced ranking",
2325 documents.len()
2326 );
2327
2328 use crate::score::bm25_additional::TFIDFScorer;
2329 let mut tfidf_scorer = TFIDFScorer::new();
2330 tfidf_scorer.initialize(&documents);
2331
2332 let query_text = &search_query.search_term.to_string();
2334 for document in &mut documents {
2335 let tfidf_score = tfidf_scorer.score(query_text, document);
2336 if let Some(rank) = document.rank {
2338 document.rank = Some(rank + (tfidf_score * 0.3) as u64);
2339 } else {
2341 document.rank = Some((tfidf_score * 10.0) as u64); }
2343 }
2344
2345 documents.sort_by_key(|d| std::cmp::Reverse(d.rank.unwrap_or(0)));
2347
2348 log::debug!("TF-IDF scoring applied successfully");
2349 }
2350
2351 for document in &mut documents {
2353 if document.id.starts_with("http://") || document.id.starts_with("https://") {
2354 log::debug!(
2356 "Processing Atomic Data document '{}' (URL: {})",
2357 document.title,
2358 document.id
2359 );
2360
2361 let mut placeholder = Document {
2363 id: document.id.clone(),
2364 ..Default::default()
2365 };
2366 match placeholder.load().await {
2367 Ok(persisted_doc) => {
2368 log::debug!(
2370 "Found cached Atomic Data document '{}' in persistence",
2371 document.title
2372 );
2373 if let Some(better_description) = persisted_doc.description {
2374 document.description = Some(better_description);
2375 }
2376 if !persisted_doc.body.is_empty() && !role.terraphim_it {
2380 log::debug!(
2381 "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
2382 document.title,
2383 role.name,
2384 role.terraphim_it
2385 );
2386 document.body = persisted_doc.body;
2387 } else if role.terraphim_it {
2388 log::debug!(
2389 "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
2390 document.title,
2391 role.name
2392 );
2393 }
2394 }
2395 Err(_) => {
2396 log::debug!(
2398 "Caching Atomic Data document '{}' to persistence for future queries",
2399 document.title
2400 );
2401
2402 let doc_to_save = document.clone();
2404 tokio::spawn(async move {
2405 if let Err(e) = doc_to_save.save().await {
2406 log::warn!(
2407 "Failed to cache Atomic Data document '{}': {}",
2408 doc_to_save.title,
2409 e
2410 );
2411 } else {
2412 log::debug!(
2413 "Successfully cached Atomic Data document '{}'",
2414 doc_to_save.title
2415 );
2416 }
2417 });
2418 }
2419 }
2420 } else {
2421 let mut placeholder = Document {
2423 id: document.id.clone(),
2424 ..Default::default()
2425 };
2426 if let Ok(persisted_doc) = placeholder.load().await {
2427 if let Some(better_description) = persisted_doc.description {
2428 log::debug!(
2429 "Replaced ripgrep description for '{}' with persistence description",
2430 document.title
2431 );
2432 document.description = Some(better_description);
2433 }
2434 } else {
2435 let normalized_id = normalize_filename_to_id(&document.title);
2438
2439 let mut normalized_placeholder = Document {
2440 id: normalized_id.clone(),
2441 ..Default::default()
2442 };
2443 if let Ok(persisted_doc) = normalized_placeholder.load().await {
2444 if let Some(better_description) = persisted_doc.description {
2445 log::debug!(
2446 "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
2447 document.title,
2448 normalized_id
2449 );
2450 document.description = Some(better_description);
2451 }
2452 } else {
2453 let normalized_id_with_md = format!("{}md", normalized_id);
2455 let mut md_placeholder = Document {
2456 id: normalized_id_with_md.clone(),
2457 ..Default::default()
2458 };
2459 if let Ok(persisted_doc) = md_placeholder.load().await {
2460 if let Some(better_description) = persisted_doc.description {
2461 log::debug!(
2462 "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
2463 document.title,
2464 normalized_id_with_md
2465 );
2466 document.description = Some(better_description);
2467 }
2468 } else {
2469 log::debug!(
2470 "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
2471 document.title,
2472 document.id,
2473 normalized_id,
2474 normalized_id_with_md
2475 );
2476 }
2477 }
2478 }
2479 }
2480 }
2481
2482 #[cfg(feature = "openrouter")]
2484 if role.has_llm_config() {
2485 log::debug!(
2486 "Applying OpenRouter AI summarization to {} search results for role '{}'",
2487 documents.len(),
2488 role.name
2489 );
2490 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2491 } else {
2492 log::debug!(
2494 "Applying LLM AI summarization to {} search results for role '{}'",
2495 documents.len(),
2496 role.name
2497 );
2498 documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2499 }
2500
2501 if role.terraphim_it {
2503 log::debug!(
2504 "Applying KG preprocessing to {} search results for role '{}'",
2505 documents.len(),
2506 role.name
2507 );
2508 let mut processed_docs = Vec::new();
2509 for document in documents {
2510 let processed_doc =
2511 self.preprocess_document_content(document, &role).await?;
2512 processed_docs.push(processed_doc);
2513 }
2514 Ok(processed_docs)
2515 } else {
2516 Ok(documents)
2517 }
2518 }
2519 };
2520 let docs = docs_result?;
2521 Ok(Self::apply_min_quality_filter(docs, min_quality))
2522 }
2523
/// Returns `true` when `id` is exactly 16 ASCII hexadecimal digits.
fn is_hash_based_id(id: &str) -> bool {
    const HASH_ID_LEN: usize = 16;
    // Any non-ASCII character contributes bytes >= 0x80, none of which are hex digits.
    id.len() == HASH_ID_LEN && id.bytes().all(|byte| byte.is_ascii_hexdigit())
}
2528
2529 pub async fn find_documents_for_kg_term(
2540 &mut self,
2541 role_name: &RoleName,
2542 term: &str,
2543 ) -> Result<Vec<Document>> {
2544 log::debug!(
2545 "Finding documents for KG term '{}' in role '{}'",
2546 term,
2547 role_name
2548 );
2549
2550 let thesaurus = self.ensure_thesaurus_loaded(role_name).await?;
2552
2553 let role = self.config_state.get_role(role_name).await.ok_or_else(|| {
2555 ServiceError::Config(format!("Role '{}' not found in config", role_name))
2556 })?;
2557
2558 let mut documents = Vec::new();
2559
2560 if let Some(kg_config) = &role.kg {
2564 log::debug!("Found KG config for role");
2565 if let Some(kg_local) = &kg_config.knowledge_graph_local {
2566 let mut potential_concepts = vec![term.to_string()];
2567
2568 log::debug!("Checking thesaurus for term '{}'", term);
2570
2571 let normalized_search_term =
2573 terraphim_types::NormalizedTermValue::new(term.to_string());
2574
2575 if let Some(root_concept) = thesaurus.get(&normalized_search_term) {
2577 log::debug!("Found root concept for '{}': {:?}", term, root_concept);
2578
2579 let root_concept_name = root_concept.value.as_str();
2581
2582 let concept_name = if let Some(url) = &root_concept.url {
2584 url.split('/')
2585 .next_back()
2586 .and_then(|s| s.strip_suffix(".md"))
2587 .unwrap_or(root_concept_name)
2588 } else {
2589 root_concept_name
2590 };
2591
2592 if !potential_concepts.contains(&concept_name.to_string()) {
2593 potential_concepts.push(concept_name.to_string());
2594 log::debug!(
2595 "Added concept from thesaurus: {} (root: {})",
2596 concept_name,
2597 root_concept_name
2598 );
2599 }
2600 } else {
2601 log::debug!("No direct mapping found for '{}' in thesaurus", term);
2602 }
2603
2604 log::debug!(
2605 "Trying {} potential concepts: {:?}",
2606 potential_concepts.len(),
2607 potential_concepts
2608 );
2609
2610 for concept in potential_concepts {
2612 let potential_kg_file = kg_local.path.join(format!("{}.md", concept));
2613 log::debug!("Looking for KG definition file: {:?}", potential_kg_file);
2614
2615 if potential_kg_file.exists() {
2616 log::info!("Found KG definition file: {:?}", potential_kg_file);
2617
2618 let file_path = potential_kg_file.to_string_lossy().to_string();
2620 if documents.iter().any(|d: &Document| d.url == file_path) {
2621 log::debug!("Skipping duplicate KG document: {}", file_path);
2622 continue;
2623 }
2624
2625 match std::fs::read_to_string(&potential_kg_file) {
2628 Ok(content) => {
2629 let mut kg_doc =
2630 Document::new(potential_kg_file.to_string_lossy().to_string());
2631 kg_doc.url = potential_kg_file.to_string_lossy().to_string();
2632 kg_doc.body = content.clone();
2633
2634 let title = content
2636 .lines()
2637 .find(|line| line.starts_with("# "))
2638 .map(|line| line.trim_start_matches("# ").trim())
2639 .unwrap_or(&concept)
2640 .to_string();
2641 kg_doc.title = title;
2642
2643 log::debug!(
2644 "Successfully loaded KG definition document: {}",
2645 kg_doc.title
2646 );
2647 documents.push(kg_doc);
2648
2649 break;
2651 }
2652 Err(e) => {
2653 log::warn!(
2654 "Failed to read KG definition file '{}': {}",
2655 potential_kg_file.display(),
2656 e
2657 );
2658 }
2659 }
2660 } else {
2661 log::debug!("KG definition file not found: {:?}", potential_kg_file);
2662 }
2663 }
2664 } else {
2665 log::debug!("No KG local config found");
2666 }
2667 } else {
2668 log::debug!("No KG config found for role");
2669 }
2670
2671 let rolegraph_sync = self
2673 .config_state
2674 .roles
2675 .get(role_name)
2676 .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))?;
2677
2678 let rolegraph = rolegraph_sync.lock().await;
2679 let document_ids = rolegraph.find_document_ids_for_term(term);
2680 drop(rolegraph); log::debug!(
2683 "Found {} document IDs from rolegraph for term '{}'",
2684 document_ids.len(),
2685 term
2686 );
2687
2688 for doc_id in &document_ids {
2690 if documents
2692 .iter()
2693 .any(|d| d.id == *doc_id || d.url == *doc_id)
2694 {
2695 log::debug!("Skipping duplicate document from rolegraph: {}", doc_id);
2696 continue;
2697 }
2698
2699 if doc_id.starts_with("http://") || doc_id.starts_with("https://") {
2702 log::debug!("Loading Atomic Data document '{}' from persistence", doc_id);
2704 let mut placeholder = Document {
2705 id: doc_id.clone(),
2706 ..Default::default()
2707 };
2708 match placeholder.load().await {
2709 Ok(loaded_doc) => {
2710 log::debug!(
2711 "Found cached Atomic Data document '{}' in persistence",
2712 doc_id
2713 );
2714 documents.push(loaded_doc);
2715 }
2716 Err(_) => {
2717 log::warn!(
2718 "Atomic Data document '{}' not found in persistence - this may indicate the document hasn't been cached yet",
2719 doc_id
2720 );
2721 }
2724 }
2725 } else {
2726 let mut doc = Document::new(doc_id.clone());
2728 match doc.load().await {
2729 Ok(loaded_doc) => {
2730 documents.push(loaded_doc);
2731 log::trace!("Successfully loaded local document: {}", doc_id);
2732 }
2733 Err(e) => {
2734 log::warn!("Failed to load local document '{}': {}", doc_id, e);
2735
2736 if Self::is_hash_based_id(doc_id) {
2738 log::debug!(
2739 "Document ID '{}' appears to be hash-based (legacy document), skipping for now",
2740 doc_id
2741 );
2742 log::info!(
2743 "💡 Hash-based document IDs are deprecated. This document will be re-indexed with normalized IDs on next haystack search."
2744 );
2745 }
2748
2749 }
2751 }
2752 }
2753 }
2754
2755 if role.terraphim_it {
2757 log::info!(
2758 "🧠 Applying KG preprocessing to {} KG term documents for role '{}' (terraphim_it enabled)",
2759 documents.len(),
2760 role_name
2761 );
2762 let mut processed_documents = Vec::new();
2763 let mut total_kg_terms = 0;
2764 let mut docs_with_kg_links = 0;
2765
2766 for document in documents {
2767 let original_body_len = document.body.len();
2768 let processed_doc = self.preprocess_document_content(document, &role).await?;
2769
2770 let new_body_len = processed_doc.body.len();
2772 if new_body_len > original_body_len {
2773 docs_with_kg_links += 1;
2774 let estimated_links = (new_body_len - original_body_len) / 17;
2775 total_kg_terms += estimated_links;
2776 }
2777
2778 processed_documents.push(processed_doc);
2779 }
2780
2781 log::info!(
2782 "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2783 processed_documents.len(),
2784 docs_with_kg_links,
2785 total_kg_terms
2786 );
2787 documents = processed_documents;
2788 } else {
2789 log::info!(
2790 "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing for {} documents",
2791 role_name,
2792 documents.len()
2793 );
2794 }
2795
2796 let total_length = documents.len();
2799 for (idx, doc) in documents.iter_mut().enumerate() {
2800 let rank = (total_length - idx) as u64;
2801 doc.rank = Some(rank);
2802 log::trace!("Assigned rank {} to document '{}'", rank, doc.title);
2803 }
2804
2805 log::debug!(
2806 "Successfully loaded and processed {} documents for term '{}', ranks assigned from {} to 1",
2807 documents.len(),
2808 term,
2809 total_length
2810 );
2811 Ok(documents)
2812 }
2813
/// Generates a summary of `document`'s body through the OpenRouter service.
///
/// Only compiled when the `openrouter` feature is enabled.
///
/// # Arguments
/// * `document` - document whose `body` is summarised
/// * `api_key` - key handed to `OpenRouterService::new`
/// * `model` - model identifier handed to `OpenRouterService::new`
/// * `max_length` - length limit forwarded to `generate_summary`
///
/// # Errors
/// * `ServiceError::OpenRouter` if the client cannot be constructed or the
///   summary request fails.
/// * `ServiceError::Config` if the document body is empty or whitespace-only.
#[cfg(feature = "openrouter")]
pub async fn generate_document_summary(
    &self,
    document: &Document,
    api_key: &str,
    model: &str,
    max_length: usize,
) -> Result<String> {
    use crate::openrouter::OpenRouterService;

    log::debug!(
        "Generating summary for document '{}' using model '{}'",
        document.id,
        model
    );

    // The client is built before the body check, so construction errors take
    // precedence over the empty-body error.
    let openrouter_service =
        OpenRouterService::new(api_key, model).map_err(ServiceError::OpenRouter)?;

    let content = &document.body;
    if content.trim().is_empty() {
        return Err(ServiceError::Config(
            "Document body is empty, cannot generate summary".to_string(),
        ));
    }

    let summary = openrouter_service
        .generate_summary(content, max_length)
        .await
        .map_err(ServiceError::OpenRouter)?;

    // Count chars rather than bytes: `String::len` is a byte length and would
    // over-report the "character" count for non-ASCII summaries.
    log::info!(
        "Generated {}-character summary for document '{}' using model '{}'",
        summary.chars().count(),
        document.id,
        model
    );

    Ok(summary)
}
2874
2875 #[cfg(not(feature = "openrouter"))]
2877 pub async fn generate_document_summary(
2878 &self,
2879 _document: &Document,
2880 _api_key: &str,
2881 _model: &str,
2882 _max_length: usize,
2883 ) -> Result<String> {
2884 Err(ServiceError::Config(
2885 "OpenRouter feature not enabled during compilation".to_string(),
2886 ))
2887 }
2888
2889 pub async fn fetch_config(&self) -> terraphim_config::Config {
2891 let current_config = self.config_state.config.lock().await;
2892 current_config.clone()
2893 }
2894
/// Test-only lookup of a role by name in the current configuration.
///
/// # Errors
/// Returns `ServiceError::Config` when `roles` has no entry for `role_name`.
#[cfg(test)]
pub async fn get_role(&self, role_name: &RoleName) -> Result<Role> {
    let guard = self.config_state.config.lock().await;
    match guard.roles.get(role_name) {
        Some(role) => Ok(role.clone()),
        None => Err(ServiceError::Config(format!(
            "Role '{}' not found",
            role_name
        ))),
    }
}
2905
2906 pub async fn update_config(
2911 &self,
2912 config: terraphim_config::Config,
2913 ) -> Result<terraphim_config::Config> {
2914 {
2917 let mut current_config = self.config_state.config.lock().await;
2918 *current_config = config.clone();
2919 }
2920 config.save().await?;
2921 log::info!("Config updated");
2922 Ok(config)
2923 }
2924
2925 pub async fn update_selected_role(
2928 &self,
2929 role_name: terraphim_types::RoleName,
2930 ) -> Result<terraphim_config::Config> {
2931 let snapshot = {
2936 let mut current_config = self.config_state.config.lock().await;
2937
2938 if !current_config.roles.contains_key(&role_name) {
2939 return Err(ServiceError::Config(format!(
2940 "Role `{}` not found in config",
2941 role_name
2942 )));
2943 }
2944
2945 current_config.selected_role = role_name.clone();
2946 current_config.clone()
2947 };
2948 let snapshot_for_save = snapshot.clone();
2954 let role_for_log = role_name.clone();
2955 tokio::spawn(async move {
2956 if let Err(e) = snapshot_for_save.save().await {
2957 log::warn!(
2958 "background persist of selected_role={} failed: {}",
2959 role_for_log,
2960 e
2961 );
2962 }
2963 });
2964 if let Some(role) = snapshot.roles.get(&role_name) {
2966 if role.terraphim_it {
2967 log::info!(
2968 "🎯 Selected role '{}' → terraphim_it: ENABLED (KG preprocessing will be applied)",
2969 role_name
2970 );
2971 } else {
2972 log::info!("🎯 Selected role '{}' → terraphim_it: DISABLED", role_name);
2973 }
2974 }
2975
2976 Ok(snapshot)
2977 }
2978
2979 fn highlight_search_terms(content: &str, search_query: &SearchQuery) -> String {
2984 let mut highlighted_content = content.to_string();
2985
2986 let terms = search_query.get_all_terms();
2988
2989 let mut sorted_terms: Vec<&str> = terms.iter().map(|t| t.as_str()).collect();
2991 sorted_terms.sort_by_key(|term| std::cmp::Reverse(term.len()));
2992
2993 for term in sorted_terms {
2994 if term.trim().is_empty() {
2995 continue;
2996 }
2997
2998 let escaped_term = regex::escape(term);
3001
3002 if let Ok(regex) = regex::RegexBuilder::new(&escaped_term)
3003 .case_insensitive(true)
3004 .build()
3005 {
3006 let highlight_open = "<mark class=\"search-highlight\">";
3009 let highlight_close = "</mark>";
3010
3011 highlighted_content = regex
3012 .replace_all(
3013 &highlighted_content,
3014 format!("{}{}{}", highlight_open, "$0", highlight_close),
3015 )
3016 .to_string();
3017 }
3018 }
3019
3020 highlighted_content
3021 }
3022}
3023
/// Returns the text surrounding the first occurrence of `marker` in `s`.
///
/// The window covers up to `before` characters ahead of the marker, the marker
/// itself, and up to `after` characters following it, clamped to the bounds of
/// `s`. All offsets are counted in `char`s, so multi-byte text is never split.
///
/// Returns an empty string when `marker` does not occur in `s`.
pub(crate) fn snippet_around(s: &str, marker: &str, before: usize, after: usize) -> String {
    let Some(marker_byte) = s.find(marker) else {
        return String::new();
    };
    let marker_start = s[..marker_byte].chars().count();
    // Measure the marker in chars as well: `marker.len()` is a byte length and
    // would extend the window too far for multi-byte markers.
    let marker_chars = marker.chars().count();

    let start = marker_start.saturating_sub(before);
    // Saturating adds keep very large `after` values from overflowing; `take`
    // stops at the end of the string on its own.
    let window = (marker_start - start)
        .saturating_add(marker_chars)
        .saturating_add(after);

    s.chars().skip(start).take(window).collect()
}
3043
3044#[cfg(test)]
3045mod tests {
3046 use super::*;
3047 use std::path::PathBuf;
3048 use terraphim_config::ConfigBuilder;
3049 use terraphim_types::NormalizedTermValue;
3050
3051 #[tokio::test]
3052 async fn test_get_config() {
3053 let mut config = ConfigBuilder::new()
3054 .build_default_desktop()
3055 .build()
3056 .unwrap();
3057 let config_state = ConfigState::new(&mut config).await.unwrap();
3058 let service = TerraphimService::new(config_state);
3059 let fetched_config = service.fetch_config().await;
3060 assert_eq!(fetched_config.id, terraphim_config::ConfigId::Desktop);
3061 }
3062
3063 #[tokio::test]
3064 async fn test_search_documents_selected_role() {
3065 let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
3067 let kg_path = project_root.join("docs/src/kg");
3068 if !kg_path.exists() {
3069 println!("Skipping test: KG directory not found at {:?}", kg_path);
3070 return;
3071 }
3072
3073 let mut config = ConfigBuilder::new()
3074 .build_default_desktop()
3075 .build()
3076 .unwrap();
3077 let config_state = match ConfigState::new(&mut config).await {
3078 Ok(state) => state,
3079 Err(e) => {
3080 println!("Skipping test: Failed to create config state: {:?}", e);
3081 return;
3082 }
3083 };
3084 let mut service = TerraphimService::new(config_state);
3085 let search_term = NormalizedTermValue::new("terraphim".to_string());
3086 let documents = match service.search_documents_selected_role(&search_term).await {
3087 Ok(docs) => docs,
3088 Err(e) => {
3089 println!(
3090 "Skipping test: Search failed (expected in some environments): {:?}",
3091 e
3092 );
3093 return;
3094 }
3095 };
3096 assert!(documents.is_empty() || !documents.is_empty()); }
3098
3099 #[tokio::test]
3100 async fn test_ensure_thesaurus_loaded_terraphim_engineer() {
3101 let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
3103 let kg_path = project_root.join("docs/src/kg");
3104
3105 if !kg_path.exists() {
3107 println!("⚠️ KG directory not found at {:?}, skipping test", kg_path);
3108 return;
3109 }
3110
3111 let mut config = ConfigBuilder::new()
3112 .build_default_desktop()
3113 .build()
3114 .unwrap();
3115
3116 if let Some(terr_eng_role) = config.roles.get_mut(&"Terraphim Engineer".into()) {
3118 if let Some(kg) = &mut terr_eng_role.kg {
3119 if let Some(kg_local) = &mut kg.knowledge_graph_local {
3120 kg_local.path = kg_path;
3121 }
3122 }
3123 }
3124
3125 let config_state = ConfigState::new(&mut config).await.unwrap();
3126 let mut service = TerraphimService::new(config_state);
3127
3128 let role_name = RoleName::new("Terraphim Engineer");
3129 let thesaurus_result = service.ensure_thesaurus_loaded(&role_name).await;
3130
3131 match thesaurus_result {
3132 Ok(thesaurus) => {
3133 println!(
3134 "✅ Successfully loaded thesaurus with {} entries",
3135 thesaurus.len()
3136 );
3137 assert!(!thesaurus.is_empty(), "Thesaurus should not be empty");
3139
3140 let has_terraphim = (&thesaurus)
3142 .into_iter()
3143 .any(|(term, _)| term.as_str().to_lowercase().contains("terraphim"));
3144 let has_graph = (&thesaurus)
3145 .into_iter()
3146 .any(|(term, _)| term.as_str().to_lowercase().contains("graph"));
3147
3148 println!(" Contains 'terraphim': {}", has_terraphim);
3149 println!(" Contains 'graph': {}", has_graph);
3150
3151 assert!(
3153 has_terraphim || has_graph,
3154 "Thesaurus should contain expected terms"
3155 );
3156 }
3157 Err(e) => {
3158 println!("❌ Failed to load thesaurus: {:?}", e);
3159 }
3162 }
3163 }
3164
3165 #[tokio::test]
3166 #[ignore = "Requires local KG fixtures at ~/.terraphim/kg"]
3167 async fn test_config_building_with_local_kg() {
3168 let mut config = ConfigBuilder::new()
3170 .build_default_desktop()
3171 .build()
3172 .unwrap();
3173 let config_state_result = ConfigState::new(&mut config).await;
3174
3175 match config_state_result {
3176 Ok(config_state) => {
3177 println!("✅ Successfully built config state");
3178 assert!(
3180 !config_state.roles.is_empty(),
3181 "Config state should have roles"
3182 );
3183
3184 let terraphim_engineer_role = RoleName::new("Terraphim Engineer");
3186 let has_terraphim_engineer =
3187 config_state.roles.contains_key(&terraphim_engineer_role);
3188 println!(" Has Terraphim Engineer role: {}", has_terraphim_engineer);
3189
3190 assert!(
3192 has_terraphim_engineer,
3193 "Terraphim Engineer role should exist"
3194 );
3195 }
3196 Err(e) => {
3197 println!("❌ Failed to build config state: {:?}", e);
3198 }
3201 }
3202 }
3203
3204 #[tokio::test]
3205 async fn test_atomic_data_persistence_skip() {
3206 use ahash::AHashMap;
3207 use terraphim_config::{Config, Haystack, Role, ServiceType};
3208 use terraphim_persistence::DeviceStorage;
3209 use terraphim_types::{NormalizedTermValue, RoleName, SearchQuery};
3210
3211 DeviceStorage::init_memory_only().await.unwrap();
3213
3214 let mut config = Config::default();
3216 let role_name = RoleName::new("test_role");
3217 let role = Role {
3218 shortname: None,
3219 name: "test_role".into(),
3220 haystacks: vec![Haystack {
3221 location: "test".to_string(),
3222 service: ServiceType::Ripgrep,
3223 read_only: false,
3224 atomic_server_secret: None,
3225 extra_parameters: std::collections::HashMap::new(),
3226 fetch_content: false,
3227 }],
3228 kg: None,
3229 terraphim_it: false,
3230 theme: "default".to_string(),
3231 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3232 llm_enabled: false,
3233 llm_api_key: None,
3234 llm_model: None,
3235 llm_auto_summarize: false,
3236 llm_chat_enabled: false,
3237 llm_chat_system_prompt: None,
3238 llm_chat_model: None,
3239 llm_context_window: None,
3240 extra: AHashMap::new(),
3241 llm_router_enabled: false,
3242 llm_router_config: None,
3243 };
3244 config.roles.insert(role_name.clone(), role);
3245
3246 let config_state = ConfigState::new(&mut config).await.unwrap();
3247 let mut service = TerraphimService::new(config_state);
3248
3249 let search_query = SearchQuery {
3251 search_term: NormalizedTermValue::new("test".to_string()),
3252 search_terms: None,
3253 operator: None,
3254 limit: Some(10),
3255 skip: None,
3256 role: Some(role_name),
3257 layer: Layer::default(),
3258 include_pinned: false,
3259 min_quality: None,
3260 };
3261
3262 let result = service.search(&search_query).await;
3265
3266 assert!(result.is_ok(), "Search should complete without errors");
3269 }
3270
3271 #[tokio::test]
3272 async fn test_atomic_data_caching() {
3273 use ahash::AHashMap;
3274 use terraphim_config::{Config, Haystack, Role, ServiceType};
3275 use terraphim_persistence::DeviceStorage;
3276 use terraphim_types::{Document, NormalizedTermValue, RoleName, SearchQuery};
3277
3278 DeviceStorage::init_memory_only().await.unwrap();
3280
3281 let mut config = Config::default();
3283 let role_name = RoleName::new("test_role");
3284 let role = Role {
3285 shortname: None,
3286 name: "test_role".into(),
3287 haystacks: vec![Haystack {
3288 location: "test".to_string(),
3289 service: ServiceType::Ripgrep,
3290 read_only: false,
3291 atomic_server_secret: None,
3292 extra_parameters: std::collections::HashMap::new(),
3293 fetch_content: false,
3294 }],
3295 kg: None,
3296 terraphim_it: false,
3297 theme: "default".to_string(),
3298 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3299 llm_enabled: false,
3300 llm_api_key: None,
3301 llm_model: None,
3302 llm_auto_summarize: false,
3303 llm_chat_enabled: false,
3304 llm_chat_system_prompt: None,
3305 llm_chat_model: None,
3306 llm_context_window: None,
3307 extra: AHashMap::new(),
3308 llm_router_enabled: false,
3309 llm_router_config: None,
3310 };
3311 config.roles.insert(role_name.clone(), role);
3312
3313 let config_state = ConfigState::new(&mut config).await.unwrap();
3314 let mut service = TerraphimService::new(config_state);
3315
3316 let atomic_doc = Document {
3318 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3319 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3320 title: "Requested Loan Amount ($)".to_string(),
3321 body: "Form field for Requested Loan Amount ($)".to_string(),
3322 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3323 summarization: None,
3324 stub: None,
3325 tags: None,
3326 rank: None,
3327 source_haystack: None,
3328 doc_type: terraphim_types::DocumentType::KgEntry,
3329 synonyms: None,
3330 route: None,
3331 priority: None,
3332 quality_score: None,
3333 };
3334
3335 log::info!("Testing Atomic Data document caching...");
3337 match atomic_doc.save().await {
3338 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3339 Err(e) => {
3340 log::error!("❌ Failed to save Atomic Data document: {}", e);
3341 panic!("Atomic Data document save failed");
3342 }
3343 }
3344
3345 let mut placeholder = Document {
3347 id: atomic_doc.id.clone(),
3348 ..Default::default()
3349 };
3350 match placeholder.load().await {
3351 Ok(loaded_doc) => {
3352 log::info!("✅ Successfully loaded Atomic Data document from persistence");
3353 assert_eq!(loaded_doc.title, atomic_doc.title);
3354 assert_eq!(loaded_doc.body, atomic_doc.body);
3355 assert_eq!(loaded_doc.description, atomic_doc.description);
3356 }
3357 Err(e) => {
3358 log::error!(
3359 "❌ Failed to load Atomic Data document from persistence: {}",
3360 e
3361 );
3362 panic!("Atomic Data document load failed");
3363 }
3364 }
3365
3366 let search_query = SearchQuery {
3368 search_term: NormalizedTermValue::new("test".to_string()),
3369 search_terms: None,
3370 operator: None,
3371 limit: Some(10),
3372 skip: None,
3373 role: Some(role_name),
3374 layer: Layer::default(),
3375 include_pinned: false,
3376 min_quality: None,
3377 };
3378
3379 let result = service.search(&search_query).await;
3380 assert!(result.is_ok(), "Search should complete without errors");
3381
3382 log::info!("✅ All Atomic Data caching tests passed!");
3383 }
3384
3385 #[tokio::test]
3386 #[ignore = "Requires local KG fixtures at 'test' directory"]
3387 async fn test_kg_term_search_with_atomic_data() {
3388 use ahash::AHashMap;
3389 use std::path::PathBuf;
3390 use terraphim_config::{
3391 Config, Haystack, KnowledgeGraph, KnowledgeGraphLocal, Role, ServiceType,
3392 };
3393 use terraphim_persistence::DeviceStorage;
3394 use terraphim_types::{Document, KnowledgeGraphInputType, RoleName};
3395
3396 DeviceStorage::init_memory_only().await.unwrap();
3398
3399 let mut config = Config::default();
3401 let role_name = RoleName::new("test_kg_role");
3402 let role = Role {
3403 shortname: None,
3404 name: "test_kg_role".into(),
3405 haystacks: vec![Haystack {
3406 location: "test".to_string(),
3407 service: ServiceType::Ripgrep,
3408 read_only: false,
3409 atomic_server_secret: None,
3410 extra_parameters: std::collections::HashMap::new(),
3411 fetch_content: false,
3412 }],
3413 kg: Some(KnowledgeGraph {
3414 automata_path: None,
3415 knowledge_graph_local: Some(KnowledgeGraphLocal {
3416 input_type: KnowledgeGraphInputType::Markdown,
3417 path: PathBuf::from("test"),
3418 }),
3419 public: true,
3420 publish: true,
3421 }),
3422 terraphim_it: true,
3423 theme: "default".to_string(),
3424 relevance_function: terraphim_types::RelevanceFunction::TerraphimGraph,
3425 llm_enabled: false,
3426 llm_api_key: None,
3427 llm_model: None,
3428 llm_auto_summarize: false,
3429 llm_chat_enabled: false,
3430 llm_chat_system_prompt: None,
3431 llm_chat_model: None,
3432 llm_context_window: None,
3433 extra: AHashMap::new(),
3434 llm_router_enabled: false,
3435 llm_router_config: None,
3436 };
3437 config.roles.insert(role_name.clone(), role);
3438
3439 let config_state = ConfigState::new(&mut config).await.unwrap();
3440 let mut service = TerraphimService::new(config_state);
3441
3442 let atomic_doc = Document {
3444 id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3445 url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3446 title: "Requested Loan Amount ($)".to_string(),
3447 body: "Form field for Requested Loan Amount ($)".to_string(),
3448 description: Some("Form field for Requested Loan Amount ($)".to_string()),
3449 summarization: None,
3450 stub: None,
3451 tags: None,
3452 rank: None,
3453 source_haystack: None,
3454 doc_type: terraphim_types::DocumentType::KgEntry,
3455 synonyms: None,
3456 route: None,
3457 priority: None,
3458 quality_score: None,
3459 };
3460
3461 log::info!("Testing KG term search with Atomic Data documents...");
3463 match atomic_doc.save().await {
3464 Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3465 Err(e) => {
3466 log::error!("❌ Failed to save Atomic Data document: {}", e);
3467 panic!("Atomic Data document save failed");
3468 }
3469 }
3470
3471 let result = service.find_documents_for_kg_term(&role_name, "test").await;
3475
3476 assert!(
3479 result.is_ok(),
3480 "find_documents_for_kg_term should complete without errors"
3481 );
3482
3483 let documents = result.unwrap();
3484 log::info!(
3485 "✅ KG term search completed successfully, found {} documents",
3486 documents.len()
3487 );
3488
3489 let atomic_doc_id = "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount";
3492 let mut placeholder = Document {
3493 id: atomic_doc_id.to_string(),
3494 ..Default::default()
3495 };
3496
3497 match placeholder.load().await {
3498 Ok(loaded_doc) => {
3499 log::info!(
3500 "✅ Successfully loaded Atomic Data document from persistence in KG term search context"
3501 );
3502 assert_eq!(loaded_doc.title, atomic_doc.title);
3503 assert_eq!(loaded_doc.body, atomic_doc.body);
3504 }
3505 Err(e) => {
3506 log::error!(
3507 "❌ Failed to load Atomic Data document in KG term search context: {}",
3508 e
3509 );
3510 panic!("Atomic Data document load failed in KG term search context");
3511 }
3512 }
3513
3514 log::info!("✅ All KG term search with Atomic Data tests passed!");
3515 }
3516
3517 #[tokio::test]
3518 async fn test_kg_term_search_rank_assignment() -> Result<()> {
3519 use ahash::AHashMap;
3520 use terraphim_config::{Config, Haystack, Role, ServiceType};
3521 use terraphim_persistence::DeviceStorage;
3522 use terraphim_types::{Document, RoleName};
3523
3524 DeviceStorage::init_memory_only().await.unwrap();
3526
3527 let mut config = Config::default();
3529 let role_name = RoleName::new("Test KG Role");
3530 let role = Role {
3531 shortname: Some("test-kg".to_string()),
3532 name: role_name.clone(),
3533 haystacks: vec![Haystack {
3534 location: "test".to_string(),
3535 service: ServiceType::Ripgrep,
3536 read_only: false,
3537 atomic_server_secret: None,
3538 extra_parameters: std::collections::HashMap::new(),
3539 fetch_content: false,
3540 }],
3541 kg: Some(terraphim_config::KnowledgeGraph {
3542 automata_path: Some(terraphim_automata::AutomataPath::local_example()),
3543 knowledge_graph_local: None,
3544 public: false,
3545 publish: false,
3546 }),
3547 terraphim_it: false,
3548 theme: "default".to_string(),
3549 relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3550 llm_enabled: false,
3551 llm_api_key: None,
3552 llm_model: None,
3553 llm_auto_summarize: false,
3554 llm_chat_enabled: false,
3555 llm_chat_system_prompt: None,
3556 llm_chat_model: None,
3557 llm_context_window: None,
3558 extra: AHashMap::new(),
3559 llm_router_enabled: false,
3560 llm_router_config: None,
3561 };
3562 config.roles.insert(role_name.clone(), role);
3563
3564 let config_state = ConfigState::new(&mut config).await.unwrap();
3565 let _service = TerraphimService::new(config_state);
3566
3567 let test_documents = vec![
3569 Document {
3570 id: "test-doc-1".to_string(),
3571 title: "First Test Document".to_string(),
3572 body: "This is the first test document body".to_string(),
3573 url: "test://doc1".to_string(),
3574 description: Some("First document description".to_string()),
3575 summarization: None,
3576 stub: None,
3577 tags: Some(vec!["test".to_string(), "first".to_string()]),
3578 rank: None, source_haystack: None,
3580 doc_type: terraphim_types::DocumentType::KgEntry,
3581 synonyms: None,
3582 route: None,
3583 priority: None,
3584 quality_score: None,
3585 },
3586 Document {
3587 id: "test-doc-2".to_string(),
3588 title: "Second Test Document".to_string(),
3589 body: "This is the second test document body".to_string(),
3590 url: "test://doc2".to_string(),
3591 description: Some("Second document description".to_string()),
3592 summarization: None,
3593 stub: None,
3594 tags: Some(vec!["test".to_string(), "second".to_string()]),
3595 rank: None, source_haystack: None,
3597 doc_type: terraphim_types::DocumentType::KgEntry,
3598 synonyms: None,
3599 route: None,
3600 priority: None,
3601 quality_score: None,
3602 },
3603 Document {
3604 id: "test-doc-3".to_string(),
3605 title: "Third Test Document".to_string(),
3606 body: "This is the third test document body".to_string(),
3607 url: "test://doc3".to_string(),
3608 description: Some("Third document description".to_string()),
3609 summarization: None,
3610 stub: None,
3611 tags: Some(vec!["test".to_string(), "third".to_string()]),
3612 rank: None, source_haystack: None,
3614 doc_type: terraphim_types::DocumentType::KgEntry,
3615 synonyms: None,
3616 route: None,
3617 priority: None,
3618 quality_score: None,
3619 },
3620 ];
3621
3622 for doc in &test_documents {
3624 doc.save().await.expect("Failed to save test document");
3625 }
3626
3627 let mut simulated_documents = test_documents.clone();
3633
3634 let total_length = simulated_documents.len();
3636 for (idx, doc) in simulated_documents.iter_mut().enumerate() {
3637 let rank = (total_length - idx) as u64;
3638 doc.rank = Some(rank);
3639 }
3640
3641 assert_eq!(simulated_documents.len(), 3, "Should have 3 test documents");
3643
3644 for doc in &simulated_documents {
3646 assert!(
3647 doc.rank.is_some(),
3648 "Document '{}' should have a rank assigned",
3649 doc.title
3650 );
3651 assert!(
3652 doc.rank.unwrap() > 0,
3653 "Document '{}' should have a positive rank",
3654 doc.title
3655 );
3656 }
3657
3658 assert_eq!(
3660 simulated_documents[0].rank,
3661 Some(3),
3662 "First document should have highest rank (3)"
3663 );
3664 assert_eq!(
3665 simulated_documents[1].rank,
3666 Some(2),
3667 "Second document should have rank 2"
3668 );
3669 assert_eq!(
3670 simulated_documents[2].rank,
3671 Some(1),
3672 "Third document should have rank 1"
3673 );
3674
3675 let mut ranks: Vec<u64> = simulated_documents
3677 .iter()
3678 .map(|doc| doc.rank.unwrap())
3679 .collect();
3680 ranks.sort_by_key(|r| std::cmp::Reverse(*r));
3681 assert_eq!(
3682 ranks,
3683 vec![3, 2, 1],
3684 "Ranks should be unique and in descending order"
3685 );
3686
3687 log::info!("✅ KG term search rank assignment test completed successfully!");
3688 Ok(())
3689 }
3690
3691 fn doc_with_quality(id: &str, knowledge: f64, logic: f64, structure: f64) -> Document {
3693 Document {
3694 id: id.to_string(),
3695 url: format!("https://example.com/{id}"),
3696 title: id.to_string(),
3697 body: String::new(),
3698 quality_score: Some(terraphim_types::QualityScore {
3699 knowledge: Some(knowledge),
3700 logic: Some(logic),
3701 structure: Some(structure),
3702 last_evaluated: None,
3703 }),
3704 ..Default::default()
3705 }
3706 }
3707
3708 fn doc_without_quality(id: &str) -> Document {
3709 Document {
3710 id: id.to_string(),
3711 url: format!("https://example.com/{id}"),
3712 title: id.to_string(),
3713 body: String::new(),
3714 quality_score: None,
3715 ..Default::default()
3716 }
3717 }
3718
/// With no threshold, scored and unscored documents are all kept.
#[test]
fn test_min_quality_none_returns_all_documents() {
    let input = Vec::from([
        doc_with_quality("a", 0.9, 0.9, 0.9),
        doc_with_quality("b", 0.1, 0.1, 0.1),
        doc_without_quality("c"),
    ]);
    let kept = TerraphimService::apply_min_quality_filter(input, None);
    assert_eq!(kept.len(), 3);
}
3730
/// Of a high-scoring and a low-scoring document, only the high one survives a
/// 0.5 threshold.
#[test]
fn test_min_quality_keeps_documents_at_or_above_threshold() {
    let input = vec![
        doc_with_quality("high", 0.8, 0.6, 0.7),
        doc_with_quality("low", 0.3, 0.2, 0.1),
    ];

    let kept = TerraphimService::apply_min_quality_filter(input, Some(0.5));
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].id, "high");
}
3743
/// A document scoring 0.4 on every component is dropped by a 0.5 threshold.
#[test]
fn test_min_quality_excludes_documents_below_threshold() {
    let input = vec![doc_with_quality("below", 0.4, 0.4, 0.4)];
    let kept = TerraphimService::apply_min_quality_filter(input, Some(0.5));
    assert!(kept.is_empty());
}
3751
/// A document without a quality score is dropped even by a 0.0 threshold.
#[test]
fn test_min_quality_excludes_documents_without_quality_score() {
    let input = vec![doc_without_quality("no-score")];
    let kept = TerraphimService::apply_min_quality_filter(input, Some(0.0));
    assert!(kept.is_empty());
}
3759
/// A document scoring exactly the threshold on every component is kept.
#[test]
fn test_min_quality_exact_threshold_is_included() {
    let input = vec![doc_with_quality("exact", 0.5, 0.5, 0.5)];
    let kept = TerraphimService::apply_min_quality_filter(input, Some(0.5));
    assert_eq!(kept.len(), 1);
}
3767
/// With a 0.0 threshold, an all-zero scored document is kept while an
/// unscored one is dropped.
#[test]
fn test_min_quality_threshold_zero_excludes_no_score_docs() {
    let input = vec![
        doc_with_quality("scored", 0.0, 0.0, 0.0),
        doc_without_quality("unscored"),
    ];
    let kept = TerraphimService::apply_min_quality_filter(input, Some(0.0));
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].id, "scored");
}
3778
/// Filtering an empty list yields an empty list.
#[test]
fn test_min_quality_empty_input_returns_empty() {
    let kept = TerraphimService::apply_min_quality_filter(Vec::new(), Some(0.5));
    assert!(kept.is_empty());
}
3784
/// Documents that all pass the threshold come back in their input order.
#[test]
fn test_min_quality_preserves_document_order() {
    let input = vec![
        doc_with_quality("a", 0.9, 0.9, 0.9),
        doc_with_quality("b", 0.8, 0.8, 0.8),
        doc_with_quality("c", 0.7, 0.7, 0.7),
    ];
    let kept = TerraphimService::apply_min_quality_filter(input, Some(0.5));
    assert_eq!(kept.len(), 3);
    for (doc, expected_id) in kept.iter().zip(["a", "b", "c"]) {
        assert_eq!(doc.id, expected_id);
    }
}
3797
/// With a negative threshold, the scored document is kept and the unscored
/// one is still dropped.
#[test]
fn test_min_quality_negative_threshold_clamped_to_zero() {
    let input = vec![
        doc_with_quality("scored", 0.1, 0.1, 0.1),
        doc_without_quality("unscored"),
    ];
    let kept = TerraphimService::apply_min_quality_filter(input, Some(-0.1));
    assert_eq!(kept.len(), 1, "only scored document should pass");
    assert_eq!(kept[0].id, "scored");
}
3809
/// Plain ASCII input: the snippet around `](kg:` with 10/10 as the trailing
/// arguments is `" World foo](kg:bar Baz"`.
#[test]
fn test_snippet_around_ascii_simple() {
    let haystack = "Hello World foo](kg:bar Baz";
    let snippet = snippet_around(haystack, "](kg:", 10, 10);
    assert_eq!(snippet, " World foo](kg:bar Baz");
}
3816
/// ASCII input with extra leading text: the expected snippet omits the
/// leading `"xyz Hello"` and starts at `" World"`.
#[test]
fn test_snippet_around_ascii_truncation_left() {
    let haystack = "xyz Hello World foo](kg:bar";
    let snippet = snippet_around(haystack, "](kg:", 10, 10);
    assert_eq!(snippet, " World foo](kg:bar");
}
3823
/// ASCII input with extra trailing text after the marker.
#[test]
fn test_snippet_around_ascii_truncation_right() {
    // NOTE(review): only 7 characters ("bar xyz") follow the marker while the
    // last argument is 10, so this input may not actually force truncation on
    // the right — confirm against `snippet_around`'s windowing rules.
    let haystack = "Hello World foo](kg:bar xyz";
    let snippet = snippet_around(haystack, "](kg:", 10, 10);
    assert_eq!(snippet, " World foo](kg:bar xyz");
}
3830
/// Input with multi-byte CJK text on both sides of the marker: the call must
/// return a non-empty snippet containing both `Hello` and the marker.
#[test]
fn test_snippet_around_multibyte_cjk() {
    let marker = "](kg:";
    let snippet = snippet_around("日本語 Hello](kg:bar 日本語", marker, 5, 5);
    assert!(!snippet.is_empty());
    assert!(snippet.contains("Hello"));
    assert!(snippet.contains(marker));
}
3839
/// Input with an emoji before the marker: the snippet must be non-empty and
/// contain both the emoji and the marker.
#[test]
fn test_snippet_around_multibyte_emoji() {
    let marker = "](kg:";
    let snippet = snippet_around("Hello 😂 World](kg:bar", marker, 10, 10);
    assert!(!snippet.is_empty());
    assert!(snippet.contains("😂"));
    assert!(snippet.contains(marker));
}
3848
/// When the marker does not occur in the input, the snippet is empty.
#[test]
fn test_snippet_around_marker_not_found() {
    let haystack = "Hello World";
    let snippet = snippet_around(haystack, "](kg:", 10, 10);
    assert_eq!(snippet, "");
}
3855
/// An empty input string produces an empty snippet.
#[test]
fn test_snippet_around_empty_string() {
    let snippet = snippet_around("", "](kg:", 10, 10);
    assert_eq!(snippet, "");
}
3862
/// Marker at the very beginning of the input: there is nothing before it, and
/// the whole input is expected back.
#[test]
fn test_snippet_around_marker_at_start() {
    let haystack = "](kg:bar Hello";
    let snippet = snippet_around(haystack, "](kg:", 10, 10);
    assert_eq!(snippet, "](kg:bar Hello");
}
3869
/// Exercises `snippet_around` when the marker sits near, and exactly at, the
/// end of the input.
#[test]
fn test_snippet_around_marker_at_end() {
    // Marker followed by a short tail ("bar"): the whole input is expected back.
    let s = "Hello ](kg:bar";
    let result = snippet_around(s, "](kg:", 10, 10);
    assert_eq!(result, "Hello ](kg:bar");

    // Marker as the final text of the input, with nothing after it. This is
    // the boundary the test name refers to; the case above never reaches it
    // because "bar" still follows the marker.
    let s = "Hello ](kg:";
    let result = snippet_around(s, "](kg:", 10, 10);
    assert_eq!(result, "Hello ](kg:");
}
3876}