Skip to main content

terraphim_service/
lib.rs

1//! Main service layer for Terraphim AI.
2//!
3//! Provides document search, indexing, and AI-assisted summarisation across
4//! multiple haystack backends. Integrates the knowledge graph, thesaurus,
5//! and relevance-scoring pipeline into a single async service facade.
6use ahash::AHashMap;
7use terraphim_automata::builder::{Logseq, ThesaurusBuilder, compute_kg_source_hash};
8use terraphim_automata::load_thesaurus;
9use terraphim_automata::{LinkType, replace_matches};
10use terraphim_config::{ConfigState, Role};
11use terraphim_middleware::thesaurus::build_thesaurus_from_haystack;
12use terraphim_persistence::Persistable;
13use terraphim_rolegraph::{RoleGraph, RoleGraphSync};
14use terraphim_types::{
15    Document, Index, IndexedDocument, Layer, NormalizedTermValue, RelevanceFunction, RoleName,
16    SearchQuery, Thesaurus,
17};
18mod score;
19use crate::score::Query;
20
21pub mod auto_route;
22pub use auto_route::{
23    AutoRouteContext, AutoRouteReason, AutoRouteResult, JMAP_MISSING_TOKEN_PENALTY,
24    auto_select_role,
25};
26
27#[cfg(feature = "openrouter")]
28pub mod openrouter;
29
30// Generic LLM layer for multiple providers (OpenRouter, Ollama, etc.)
31pub mod llm;
32
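// LLM proxy service for unified provider management.
//
// NOTE(review): the `proxy_client` declaration below is disabled. A literal `\n`
// escape had fused it into the preceding comment, so it was never compiled.
// Confirm whether the module should be restored.
// pub mod proxy_client;

// LLM Router configuration integration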
37
38pub mod llm_proxy;
39
// LLM Router configuration integration (no module is declared under this heading)
41
42// Centralized HTTP client creation and configuration
43pub mod http_client;
44
45// Standardized logging initialization utilities
46pub mod logging;
47
48// Summarization queue system for production-ready async processing
49pub mod conversation_service;
50pub mod rate_limiter;
51pub mod summarization_manager;
52pub mod summarization_queue;
53pub mod summarization_worker;
54
55// Centralized error handling patterns and utilities
56pub mod error;
57
58// Context management for LLM conversations
59pub mod context;
60
61#[cfg(test)]
62mod context_tests;
63
/// Normalize a filename to be used as a document ID.
///
/// Every character that is not an ASCII letter or digit is dropped, and the
/// remaining characters are lowercased. This ensures consistent ID generation
/// between server startup and edit API.
///
/// The filter is done with plain `char` methods rather than a regular
/// expression, so nothing is compiled per call and there is no panic path.
fn normalize_filename_to_id(filename: &str) -> String {
    filename
        .chars()
        // Same character set as the pattern `[^a-zA-Z0-9]+` removed before.
        .filter(char::is_ascii_alphanumeric)
        // Only ASCII survives the filter, so ASCII lowercasing is sufficient.
        .map(|c| c.to_ascii_lowercase())
        .collect()
}
71
72/// Top-level error type for the Terraphim service layer.
73#[derive(thiserror::Error, Debug)]
74pub enum ServiceError {
75    #[error("Middleware error: {0}")]
76    Middleware(#[from] terraphim_middleware::Error),
77
78    #[error("OpenDal error: {0}")]
79    OpenDal(Box<opendal::Error>),
80
81    #[error("Persistence error: {0}")]
82    Persistence(#[from] terraphim_persistence::Error),
83
84    #[error("Config error: {0}")]
85    Config(String),
86
87    #[cfg(feature = "openrouter")]
88    #[error("OpenRouter error: {0}")]
89    OpenRouter(#[from] crate::openrouter::OpenRouterError),
90
91    #[error("Common error: {0}")]
92    Common(#[from] crate::error::CommonError),
93}
94
95impl From<opendal::Error> for ServiceError {
96    fn from(err: opendal::Error) -> Self {
97        ServiceError::OpenDal(Box::new(err))
98    }
99}
100
101impl crate::error::TerraphimError for ServiceError {
102    fn category(&self) -> crate::error::ErrorCategory {
103        use crate::error::ErrorCategory;
104        match self {
105            ServiceError::Middleware(_) => ErrorCategory::Integration,
106            ServiceError::OpenDal(_) => ErrorCategory::Storage,
107            ServiceError::Persistence(_) => ErrorCategory::Storage,
108            ServiceError::Config(_) => ErrorCategory::Configuration,
109            #[cfg(feature = "openrouter")]
110            ServiceError::OpenRouter(_) => ErrorCategory::Integration,
111            ServiceError::Common(err) => err.category(),
112        }
113    }
114
115    fn is_recoverable(&self) -> bool {
116        match self {
117            ServiceError::Middleware(_) => true,
118            ServiceError::OpenDal(_) => false,
119            ServiceError::Persistence(_) => false,
120            ServiceError::Config(_) => false,
121            #[cfg(feature = "openrouter")]
122            ServiceError::OpenRouter(_) => true,
123            ServiceError::Common(err) => err.is_recoverable(),
124        }
125    }
126}
127
128pub type Result<T> = std::result::Result<T, ServiceError>;
129
130/// Main entry point for search, indexing, and AI operations in Terraphim.
131pub struct TerraphimService {
132    config_state: ConfigState,
133}
134
135impl TerraphimService {
136    /// Create a new TerraphimService
137    pub fn new(config_state: ConfigState) -> Self {
138        Self { config_state }
139    }
140
141    /// Build a thesaurus from the haystack and update the knowledge graph automata URL
142    async fn build_thesaurus(&mut self, search_query: &SearchQuery) -> Result<()> {
143        Ok(build_thesaurus_from_haystack(&mut self.config_state, search_query).await?)
144    }
145    /// load thesaurus from config object and if absent make sure it's loaded from automata_url
146    pub async fn ensure_thesaurus_loaded(&mut self, role_name: &RoleName) -> Result<Thesaurus> {
147        async fn load_thesaurus_from_automata_path(
148            config_state: &ConfigState,
149            role_name: &RoleName,
150            rolegraphs: &mut AHashMap<RoleName, RoleGraphSync>,
151        ) -> Result<Thesaurus> {
152            // CRITICAL: clone the role out, then drop the config lock before
153            // doing I/O. Holding the lock across the network/disk operations
154            // below blocks every other endpoint that touches /config (e.g.
155            // GET /config from `roles select`, `roles list`, etc.) for the
156            // duration of the thesaurus load + persistence + RoleGraph build.
157            let role = {
158                let config = config_state.config.lock().await;
159                let Some(role) = config.roles.get(role_name).cloned() else {
160                    return Err(ServiceError::Config(format!(
161                        "Role '{}' not found in config",
162                        role_name
163                    )));
164                };
165                role
166            };
167            if let Some(kg) = &role.kg {
168                if let Some(automata_path) = &kg.automata_path {
169                    log::info!("Loading Role `{}` - URL: {:?}", role_name, automata_path);
170
171                    // Try to load from automata path first
172                    match load_thesaurus(automata_path).await {
173                        Ok(mut thesaurus) => {
174                            log::info!("Successfully loaded thesaurus from automata path");
175
176                            // Save thesaurus to persistence to ensure it's available for future loads
177                            match thesaurus.save().await {
178                                Ok(_) => {
179                                    log::info!(
180                                        "Thesaurus for role `{}` saved to persistence",
181                                        role_name
182                                    );
183                                    // Reload from persistence to get canonical version
184                                    match thesaurus.load().await {
185                                        Ok(persisted_thesaurus) => {
186                                            thesaurus = persisted_thesaurus;
187                                            log::debug!("Reloaded thesaurus from persistence");
188                                        }
189                                        Err(e) => {
190                                            log::warn!(
191                                                "Failed to reload thesaurus from persistence, using in-memory version: {:?}",
192                                                e
193                                            );
194                                        }
195                                    }
196                                }
197                                Err(e) => {
198                                    log::warn!("Failed to save thesaurus to persistence: {:?}", e);
199                                }
200                            }
201
202                            let rolegraph =
203                                RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
204                            match rolegraph {
205                                Ok(rolegraph) => {
206                                    let rolegraph_value = RoleGraphSync::from(rolegraph);
207                                    rolegraphs.insert(role_name.clone(), rolegraph_value);
208                                }
209                                Err(e) => {
210                                    log::error!("Failed to update role and thesaurus: {:?}", e)
211                                }
212                            }
213                            Ok(thesaurus)
214                        }
215                        Err(e) => {
216                            log::warn!("Failed to load thesaurus from automata path: {:?}", e);
217                            // Fallback to building from local KG if available
218                            if let Some(kg_local) = &kg.knowledge_graph_local {
219                                log::info!(
220                                    "Fallback: building thesaurus from local KG for role {}",
221                                    role_name
222                                );
223                                let logseq_builder = Logseq::default();
224                                match logseq_builder
225                                    .build(
226                                        role_name.as_lowercase().to_string(),
227                                        kg_local.path.clone(),
228                                    )
229                                    .await
230                                {
231                                    Ok(mut thesaurus) => {
232                                        // Save thesaurus to persistence to ensure it's available for future loads
233                                        match thesaurus.save().await {
234                                            Ok(_) => {
235                                                log::info!(
236                                                    "Fallback thesaurus for role `{}` saved to persistence",
237                                                    role_name
238                                                );
239                                                // Reload from persistence to get canonical version
240                                                match thesaurus.load().await {
241                                                    Ok(persisted_thesaurus) => {
242                                                        thesaurus = persisted_thesaurus;
243                                                        log::debug!(
244                                                            "Reloaded fallback thesaurus from persistence"
245                                                        );
246                                                    }
247                                                    Err(e) => {
248                                                        log::warn!(
249                                                            "Failed to reload fallback thesaurus from persistence, using in-memory version: {:?}",
250                                                            e
251                                                        );
252                                                    }
253                                                }
254                                            }
255                                            Err(e) => {
256                                                log::warn!(
257                                                    "Failed to save fallback thesaurus to persistence: {:?}",
258                                                    e
259                                                );
260                                            }
261                                        }
262
263                                        let rolegraph =
264                                            RoleGraph::new(role_name.clone(), thesaurus.clone())
265                                                .await;
266                                        match rolegraph {
267                                            Ok(rolegraph) => {
268                                                let rolegraph_value =
269                                                    RoleGraphSync::from(rolegraph);
270                                                rolegraphs
271                                                    .insert(role_name.clone(), rolegraph_value);
272                                            }
273                                            Err(e) => log::error!(
274                                                "Failed to update role and thesaurus: {:?}",
275                                                e
276                                            ),
277                                        }
278
279                                        Ok(thesaurus)
280                                    }
281                                    Err(e) => {
282                                        // Check if error is "file not found" (expected for optional files)
283                                        // and downgrade log level from ERROR to DEBUG
284                                        let is_file_not_found =
285                                            e.to_string().contains("file not found")
286                                                || e.to_string().contains("not found:");
287
288                                        if is_file_not_found {
289                                            log::debug!(
290                                                "Failed to build thesaurus from local KG (optional file not found) for role {}: {:?}",
291                                                role_name,
292                                                e
293                                            );
294                                        } else {
295                                            log::error!(
296                                                "Failed to build thesaurus from local KG for role {}: {:?}",
297                                                role_name,
298                                                e
299                                            );
300                                        }
301                                        Err(ServiceError::Config(
302                                            "Failed to load or build thesaurus".into(),
303                                        ))
304                                    }
305                                }
306                            } else {
307                                log::warn!(
308                                    "No fallback available for role {}: no local KG path configured, returning empty thesaurus",
309                                    role_name
310                                );
311                                Ok(Thesaurus::new(role_name.as_lowercase().to_string()))
312                            }
313                        }
314                    }
315                } else if let Some(kg_local) = &kg.knowledge_graph_local {
316                    // Build thesaurus from local KG
317                    log::info!(
318                        "Role {} has no automata_path, building thesaurus from local KG files at {:?}",
319                        role_name,
320                        kg_local.path
321                    );
322                    let logseq_builder = Logseq::default();
323                    match logseq_builder
324                        .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
325                        .await
326                    {
327                        Ok(mut thesaurus) => {
328                            log::info!(
329                                "Successfully built thesaurus from local KG for role {}",
330                                role_name
331                            );
332
333                            // Save thesaurus to persistence to ensure it's available for future loads
334                            match thesaurus.save().await {
335                                Ok(_) => {
336                                    log::info!(
337                                        "Local KG thesaurus for role `{}` saved to persistence",
338                                        role_name
339                                    );
340                                    // Reload from persistence to get canonical version
341                                    match thesaurus.load().await {
342                                        Ok(persisted_thesaurus) => {
343                                            log::info!(
344                                                "Reloaded local KG thesaurus from persistence: {} entries",
345                                                persisted_thesaurus.len()
346                                            );
347                                            thesaurus = persisted_thesaurus;
348                                        }
349                                        Err(e) => {
350                                            log::warn!(
351                                                "Failed to reload local KG thesaurus from persistence, using in-memory version: {:?}",
352                                                e
353                                            );
354                                        }
355                                    }
356                                }
357                                Err(e) => {
358                                    log::warn!(
359                                        "Failed to save local KG thesaurus to persistence: {:?}",
360                                        e
361                                    );
362                                }
363                            }
364
365                            let rolegraph =
366                                RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
367                            match rolegraph {
368                                Ok(rolegraph) => {
369                                    let rolegraph_value = RoleGraphSync::from(rolegraph);
370                                    rolegraphs.insert(role_name.clone(), rolegraph_value);
371                                }
372                                Err(e) => {
373                                    log::error!("Failed to update role and thesaurus: {:?}", e)
374                                }
375                            }
376
377                            Ok(thesaurus)
378                        }
379                        Err(e) => {
380                            // Check if error is "file not found" (expected for optional files)
381                            // and downgrade log level from ERROR to DEBUG
382                            let is_file_not_found = e.to_string().contains("file not found");
383
384                            if is_file_not_found {
385                                log::debug!(
386                                    "Failed to build thesaurus from local KG (optional file not found) for role {}: {:?}",
387                                    role_name,
388                                    e
389                                );
390                            } else {
391                                log::error!(
392                                    "Failed to build thesaurus from local KG for role {}: {:?}",
393                                    role_name,
394                                    e
395                                );
396                            }
397                            Err(ServiceError::Config(format!(
398                                "Failed to build thesaurus from local KG for role {}: {}",
399                                role_name, e
400                            )))
401                        }
402                    }
403                } else {
404                    log::warn!(
405                        "Role {} is configured for TerraphimGraph but has neither automata_path nor knowledge_graph_local defined.",
406                        role_name
407                    );
408                    if let Some(kg_local) = &kg.knowledge_graph_local {
409                        // Build thesaurus from local KG files during startup
410                        log::info!(
411                            "Building thesaurus from local KG files for role {} at {:?}",
412                            role_name,
413                            kg_local.path
414                        );
415                        let logseq_builder = Logseq::default();
416                        match logseq_builder
417                            .build(role_name.as_lowercase().to_string(), kg_local.path.clone())
418                            .await
419                        {
420                            Ok(mut thesaurus) => {
421                                log::info!(
422                                    "Successfully built thesaurus from local KG for role {}",
423                                    role_name
424                                );
425
426                                // Save thesaurus to persistence to ensure it's available for future loads
427                                match thesaurus.save().await {
428                                    Ok(_) => {
429                                        log::info!(
430                                            "No-automata thesaurus for role `{}` saved to persistence",
431                                            role_name
432                                        );
433                                        // Reload from persistence to get canonical version
434                                        match thesaurus.load().await {
435                                            Ok(persisted_thesaurus) => {
436                                                thesaurus = persisted_thesaurus;
437                                                log::debug!(
438                                                    "Reloaded no-automata thesaurus from persistence"
439                                                );
440                                            }
441                                            Err(e) => {
442                                                log::warn!(
443                                                    "Failed to reload no-automata thesaurus from persistence, using in-memory version: {:?}",
444                                                    e
445                                                );
446                                            }
447                                        }
448                                    }
449                                    Err(e) => {
450                                        log::warn!(
451                                            "Failed to save no-automata thesaurus to persistence: {:?}",
452                                            e
453                                        );
454                                    }
455                                }
456
457                                let rolegraph =
458                                    RoleGraph::new(role_name.clone(), thesaurus.clone()).await;
459                                match rolegraph {
460                                    Ok(rolegraph) => {
461                                        let rolegraph_value = RoleGraphSync::from(rolegraph);
462                                        rolegraphs.insert(role_name.clone(), rolegraph_value);
463                                    }
464                                    Err(e) => {
465                                        // Check if error is "file not found" (expected for optional files)
466                                        // and downgrade log level from ERROR to DEBUG
467                                        let is_file_not_found =
468                                            e.to_string().contains("file not found");
469
470                                        if is_file_not_found {
471                                            log::debug!(
472                                                "Failed to update role and thesaurus (optional file not found): {:?}",
473                                                e
474                                            );
475                                        } else {
476                                            log::error!(
477                                                "Failed to update role and thesaurus: {:?}",
478                                                e
479                                            );
480                                        }
481                                    }
482                                }
483
484                                Ok(thesaurus)
485                            }
486                            Err(e) => {
487                                log::error!(
488                                    "Failed to build thesaurus from local KG for role {}: {:?}",
489                                    role_name,
490                                    e
491                                );
492                                Err(ServiceError::Config(
493                                    "Failed to build thesaurus from local KG".into(),
494                                ))
495                            }
496                        }
497                    } else {
498                        log::debug!(
499                            "Role '{}' has no local KG path, returning empty thesaurus",
500                            role_name
501                        );
502                        Ok(Thesaurus::new(role_name.as_lowercase().to_string()))
503                    }
504                }
505            } else {
506                log::debug!("Role '{}' has no knowledge graph configured", role_name);
507                Err(ServiceError::Config(format!(
508                    "Knowledge graph not configured for role '{}'",
509                    role_name
510                )))
511            }
512        }
513
514        log::debug!("Loading thesaurus for role: {}", role_name);
515        log::debug!("Role keys {:?}", self.config_state.roles.keys());
516
517        if let Some(rolegraph_value) = self.config_state.roles.get(role_name) {
518            let thesaurus_result = rolegraph_value.lock().await.thesaurus.clone().load().await;
519            match thesaurus_result {
520                Ok(thesaurus) => {
521                    log::debug!("Thesaurus loaded: {:?}", thesaurus);
522                    log::info!("Rolegraph loaded: for role name {:?}", role_name);
523
524                    // Check if the cached thesaurus is stale by comparing source hashes
525                    let is_stale = if let Some(ref cached_hash) = thesaurus.source_hash {
526                        let role = {
527                            let config = self.config_state.config.lock().await;
528                            config.roles.get(role_name).cloned()
529                        };
530                        if let Some(role) = role {
531                            if let Some(ref kg) = role.kg {
532                                if let Some(ref kg_local) = kg.knowledge_graph_local {
533                                    match compute_kg_source_hash(&kg_local.path) {
534                                        Ok(Some(current_hash)) => {
535                                            let stale = current_hash != *cached_hash;
536                                            if stale {
537                                                log::info!(
538                                                    "Thesaurus cache stale for role '{}': hash mismatch (cached {} != current {})",
539                                                    role_name,
540                                                    cached_hash,
541                                                    current_hash
542                                                );
543                                            }
544                                            stale
545                                        }
546                                        Ok(None) => {
547                                            log::debug!(
548                                                "No markdown files found in KG path {:?}",
549                                                kg_local.path
550                                            );
551                                            false
552                                        }
553                                        Err(e) => {
554                                            log::warn!(
555                                                "Failed to compute source hash for role '{}': {}",
556                                                role_name,
557                                                e
558                                            );
559                                            false
560                                        }
561                                    }
562                                } else {
563                                    false
564                                }
565                            } else {
566                                false
567                            }
568                        } else {
569                            false
570                        }
571                    } else {
572                        log::debug!(
573                            "No source_hash in cached thesaurus for role '{}'",
574                            role_name
575                        );
576                        false
577                    };
578
579                    if is_stale {
580                        let mut rolegraphs = self.config_state.roles.clone();
581                        let result = load_thesaurus_from_automata_path(
582                            &self.config_state,
583                            role_name,
584                            &mut rolegraphs,
585                        )
586                        .await;
587
588                        if result.is_ok() {
589                            if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
590                                self.config_state
591                                    .roles
592                                    .insert(role_name.clone(), updated_rolegraph.clone());
593                                log::info!(
594                                    "Updated config_state with rebuilt rolegraph for role: {}",
595                                    role_name
596                                );
597                            }
598                        }
599                        result
600                    } else {
601                        Ok(thesaurus)
602                    }
603                }
604                Err(e) => {
605                    // Check if error is "file not found" (expected for optional files)
606                    // and downgrade log level from ERROR to DEBUG
607                    let is_file_not_found = e.to_string().contains("file not found")
608                        || e.to_string().contains("not found:");
609
610                    if is_file_not_found {
611                        log::debug!("Thesaurus file not found (optional): {:?}", e);
612                    } else {
613                        log::error!("Failed to load thesaurus: {:?}", e);
614                    }
615                    // Try to build thesaurus from KG and update the config_state directly
616                    let mut rolegraphs = self.config_state.roles.clone();
617                    let result = load_thesaurus_from_automata_path(
618                        &self.config_state,
619                        role_name,
620                        &mut rolegraphs,
621                    )
622                    .await;
623
624                    // Update the actual config_state with the new rolegraph
625                    if result.is_ok() {
626                        if let Some(updated_rolegraph) = rolegraphs.get(role_name) {
627                            self.config_state
628                                .roles
629                                .insert(role_name.clone(), updated_rolegraph.clone());
630                            log::info!(
631                                "Updated config_state with new rolegraph for role: {}",
632                                role_name
633                            );
634                        }
635                    }
636
637                    result
638                }
639            }
640        } else {
641            // Role not found, try to build from KG
642            let mut rolegraphs = self.config_state.roles.clone();
643            let result =
644                load_thesaurus_from_automata_path(&self.config_state, role_name, &mut rolegraphs)
645                    .await;
646
647            // Update the actual config_state with the new rolegraph
648            if result.is_ok() {
649                if let Some(new_rolegraph) = rolegraphs.get(role_name) {
650                    self.config_state
651                        .roles
652                        .insert(role_name.clone(), new_rolegraph.clone());
653                    log::info!(
654                        "Added new rolegraph to config_state for role: {}",
655                        role_name
656                    );
657                }
658            }
659
660            result
661        }
662    }
663
664    /// Preprocess document content to create clickable KG links when terraphim_it is enabled
665    ///
666    /// This function replaces KG terms in the document body with markdown links
667    /// in the format `[term](kg:term)` which can be intercepted by the frontend
668    /// to display KG documents when clicked.
669    pub async fn preprocess_document_content(
670        &mut self,
671        mut document: Document,
672        role: &Role,
673    ) -> Result<Document> {
674        // Only preprocess if terraphim_it is enabled and role has KG configured
675        if !role.terraphim_it {
676            log::info!(
677                "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing",
678                role.name
679            );
680            return Ok(document);
681        }
682
683        let Some(_kg) = &role.kg else {
684            log::info!(
685                "⚠️ No KG configured for role '{}', skipping KG preprocessing",
686                role.name
687            );
688            return Ok(document);
689        };
690
691        log::info!(
692            "🧠 Starting KG preprocessing for document '{}' in role '{}' (terraphim_it enabled)",
693            document.title,
694            role.name
695        );
696        log::debug!(
697            "📄 Document preview: {} characters starting with: {}",
698            document.body.len(),
699            &document.body.chars().take(100).collect::<String>()
700        );
701
702        // Load thesaurus for the role
703        let thesaurus = match self.ensure_thesaurus_loaded(&role.name).await {
704            Ok(thesaurus) => thesaurus,
705            Err(e) => {
706                log::warn!("Failed to load thesaurus for role {}: {:?}", role.name, e);
707                return Ok(document); // Return original document if thesaurus fails to load
708            }
709        };
710
711        // Filter thesaurus to only include meaningful terms and avoid over-linking
712        let mut kg_thesaurus = Thesaurus::new(format!("kg_links_{}", role.name));
713
714        // Prioritize important KG terms while excluding overly generic ones
715        // Key KG concepts should always be included even if they're common
716        let important_kg_terms = [
717            "graph",
718            "haystack",
719            "service",
720            "terraphim",
721            "knowledge",
722            "embedding",
723            "search",
724            "automata",
725            "thesaurus",
726            "rolegraph",
727        ];
728
729        // Exclude only very generic programming/technical terms that don't add value
730        let excluded_common_terms = [
731            "system",
732            "config",
733            "configuration",
734            "type",
735            "method",
736            "function",
737            "class",
738            "component",
739            "module",
740            "library",
741            "framework",
742            "interface",
743            "api",
744            "data",
745            "file",
746            "path",
747            "url",
748            "string",
749            "number",
750            "value",
751            "option",
752            "parameter",
753            "field",
754            "property",
755            "attribute",
756            "element",
757            "item",
758            "object",
759            "array",
760            "list",
761            "map",
762            "set",
763            "collection",
764            "server",
765            "client",
766            "request",
767            "response",
768            "error",
769            "result",
770            "success",
771            "failure",
772            "true",
773            "false",
774            "null",
775            "undefined",
776            "empty",
777            "full",
778            "start",
779            "end",
780            "begin",
781            "finish",
782            "create",
783            "delete",
784            "update",
785            "read",
786            "write",
787            "load",
788            "save",
789            "process",
790            "handle",
791            "manage",
792            "control",
793            "execute",
794            "run",
795            "call",
796            "invoke",
797            "trigger",
798            "event",
799            "action",
800            "command",
801            "query",
802            "search",
803            "filter",
804            "sort",
805            "order",
806            "group",
807            "match",
808            "find",
809            "replace",
810            "insert",
811            "remove",
812            "add",
813            "set",
814            "get",
815            "put",
816            "post",
817            "head",
818            "patch",
819            "delete",
820        ];
821
822        let mut sorted_terms: Vec<_> = (&thesaurus)
823            .into_iter()
824            .filter(|(key, _)| {
825                let term = key.as_str();
826
827                // Always exclude empty or very short terms
828                if term.is_empty() || term.len() < 3 {
829                    return false;
830                }
831
832                // Always include important KG terms, even if they're short
833                if important_kg_terms.contains(&term) {
834                    return true;
835                }
836
837                // Exclude generic technical terms
838                if excluded_common_terms.contains(&term) {
839                    return false;
840                }
841
842                // Include terms that are:
843                // 1. Moderately long (>5 chars) OR
844                // 2. Hyphenated compound terms OR
845                // 3. Underscore-separated compound terms OR
846                // 4. Capitalized terms (likely proper nouns or important concepts)
847                term.len() > 5
848                    || term.contains('-')
849                    || term.contains('_')
850                    || term.chars().next().is_some_and(|c| c.is_uppercase())
851            })
852            .collect();
853
854        // Sort by relevance, but prioritize important KG terms
855        #[allow(clippy::unnecessary_sort_by)]
856        sorted_terms.sort_by(|a, b| {
857            let a_important = important_kg_terms.contains(&a.0.as_str());
858            let b_important = important_kg_terms.contains(&b.0.as_str());
859
860            match (a_important, b_important) {
861                (true, false) => std::cmp::Ordering::Less, // a comes first
862                (false, true) => std::cmp::Ordering::Greater, // b comes first
863                _ => b.1.id.cmp(&a.1.id),                  // Both or neither important, sort by ID
864            }
865        });
866
867        // Take more terms since we're being more selective about quality
868        let max_kg_terms = 8;
869        for (key, value) in sorted_terms.into_iter().take(max_kg_terms) {
870            let mut kg_value = value.clone();
871            // IMPORTANT: Keep the original term (key) as visible text, link to root concept (value.value)
872            // This creates links like: [graph embeddings](kg:terraphim-graph)
873            // where "graph embeddings" stays visible but links to the root concept "terraphim-graph"
874            kg_value.value = key.clone(); // Keep original term as visible text
875            kg_value.url = Some(format!("kg:{}", value.value)); // Link to the root concept
876            kg_thesaurus.insert(key.clone(), kg_value);
877        }
878
879        let kg_terms_count = kg_thesaurus.len();
880        log::info!(
881            "📋 KG thesaurus filtering: {} → {} terms (prioritizing: {}, filters: len>5, hyphenated, or important KG terms)",
882            thesaurus.len(),
883            kg_terms_count,
884            important_kg_terms.join(", ")
885        );
886
887        // Log the actual terms that passed filtering for debugging
888        if kg_terms_count > 0 {
889            let terms: Vec<String> = (&kg_thesaurus)
890                .into_iter()
891                .map(|(k, v)| format!("'{}' → kg:{}", k, v.value))
892                .collect();
893            log::info!("🔍 KG terms selected for linking: {}", terms.join(", "));
894        } else {
895            log::info!(
896                "⚠️ No KG terms passed filtering criteria - document '{}' will have no KG links",
897                document.title
898            );
899        }
900
901        // Apply KG term replacement to document body (only if we have terms to replace)
902        if !kg_thesaurus.is_empty() {
903            // Debug: log what we're about to pass to replace_matches
904            let debug_thesaurus: Vec<String> = (&kg_thesaurus)
905                .into_iter()
906                .map(|(k, v)| format!("'{}' -> '{}' (url: {:?})", k, v.value, v.url))
907                .take(3) // Limit to first 3 entries to avoid spam
908                .collect();
909            log::info!(
910                "🔧 Passing to replace_matches: {} (total terms: {})",
911                debug_thesaurus.join(", "),
912                kg_thesaurus.len()
913            );
914            let preview = if document.body.chars().count() > 200 {
915                document.body.chars().take(200).collect::<String>() + "..."
916            } else {
917                document.body.clone()
918            };
919            log::info!("📝 Document body preview (first 200 chars): {}", preview);
920
921            match replace_matches(&document.body, kg_thesaurus, LinkType::MarkdownLinks) {
922                Ok(processed_bytes) => {
923                    match String::from_utf8(processed_bytes) {
924                        Ok(processed_content) => {
925                            log::info!(
926                                "✅ Successfully preprocessed document '{}' with {} KG terms → created [term](kg:concept) links",
927                                document.title,
928                                kg_terms_count
929                            );
930
931                            // Debug: Check if content actually changed
932                            let content_changed = processed_content != document.body;
933                            log::info!(
934                                "🔄 Content changed: {} (original: {} chars, processed: {} chars)",
935                                content_changed,
936                                document.body.len(),
937                                processed_content.len()
938                            );
939
940                            // Debug: Show actual KG links in the processed content
941                            let kg_links: Vec<&str> = processed_content
942                                .split("[")
943                                .filter_map(|s| s.find("](kg:").map(|closing| &s[..closing]))
944                                .collect();
945
946                            if !kg_links.is_empty() {
947                                log::info!(
948                                    "🔗 Found KG links in processed content: [{}](kg:...)",
949                                    kg_links.join("], [")
950                                );
951
952                                let snippet = snippet_around(&processed_content, "](kg:", 50, 100);
953                                if !snippet.is_empty() {
954                                    log::info!(
955                                        "📄 Content snippet with KG link: ...{}...",
956                                        snippet
957                                    );
958                                }
959                            } else {
960                                log::warn!(
961                                    "⚠️ No KG links found in processed content despite successful replacement"
962                                );
963                            }
964
965                            document.body = processed_content;
966                        }
967                        Err(e) => {
968                            log::warn!(
969                                "Failed to convert processed content to UTF-8 for document '{}': {:?}",
970                                document.title,
971                                e
972                            );
973                        }
974                    }
975                }
976                Err(e) => {
977                    log::warn!(
978                        "Failed to replace KG terms in document '{}': {:?}",
979                        document.title,
980                        e
981                    );
982                }
983            }
984        } else {
985            log::info!(
986                "💭 No specific KG terms found for document '{}' (filters excluded generic terms)",
987                document.title
988            );
989        }
990
991        Ok(document)
992    }
993
994    /// Preprocess document content with both KG linking and search term highlighting
995    pub async fn preprocess_document_content_with_search(
996        &mut self,
997        document: Document,
998        role: &Role,
999        search_query: Option<&SearchQuery>,
1000    ) -> Result<Document> {
1001        // First apply KG preprocessing if enabled
1002        let mut processed_doc = self.preprocess_document_content(document, role).await?;
1003
1004        // Then apply search term highlighting if query is provided
1005        if let Some(query) = search_query {
1006            log::debug!(
1007                "Applying search term highlighting to document '{}'",
1008                processed_doc.title
1009            );
1010            processed_doc.body = Self::highlight_search_terms(&processed_doc.body, query);
1011        }
1012
1013        Ok(processed_doc)
1014    }
1015
1016    /// Create document
1017    pub async fn create_document(&mut self, document: Document) -> Result<Document> {
1018        // Persist the document using the fastest available Operator. The document becomes
1019        // available on all profiles/devices thanks to the Persistable implementation.
1020        document.save().await?;
1021
1022        // Index the freshly-saved document inside all role graphs so it can be discovered via
1023        // search immediately.
1024        self.config_state.add_to_roles(&document).await?;
1025
1026        // 🔄 Persist the updated body back to on-disk Markdown files for every writable
1027        // ripgrep haystack so that subsequent searches (and external tooling) see the
1028        // changes instantly.
1029        use terraphim_config::ServiceType;
1030        use terraphim_middleware::indexer::RipgrepIndexer;
1031
1032        let ripgrep = RipgrepIndexer::default();
1033        let config_snapshot = { self.config_state.config.lock().await.clone() };
1034
1035        for role in config_snapshot.roles.values() {
1036            for haystack in &role.haystacks {
1037                if haystack.service == ServiceType::Ripgrep && !haystack.read_only {
1038                    if let Err(e) = ripgrep.update_document(&document).await {
1039                        log::warn!(
1040                            "Failed to write document {} to haystack {:?}: {:?}",
1041                            document.id,
1042                            haystack.location,
1043                            e
1044                        );
1045                    }
1046                }
1047            }
1048        }
1049
1050        Ok(document)
1051    }
1052
1053    /// Get document by ID
1054    ///
1055    /// This method supports both normalized IDs (e.g., "haystackmd") and original filenames (e.g., "haystack.md").
1056    /// It tries to find the document using the provided ID first, then tries with a normalized version,
1057    /// and finally falls back to searching by title.
1058    pub async fn get_document_by_id(&mut self, document_id: &str) -> Result<Option<Document>> {
1059        log::debug!("Getting document by ID: '{}'", document_id);
1060
1061        // Validate document_id is not empty or whitespace-only
1062        if document_id.trim().is_empty() {
1063            log::warn!("Empty or whitespace-only document_id provided");
1064            return Ok(None);
1065        }
1066
1067        // 1️⃣ Try to load the document directly using the provided ID
1068        let mut placeholder = Document {
1069            id: document_id.to_string(),
1070            ..Default::default()
1071        };
1072        match placeholder.load().await {
1073            Ok(doc) => {
1074                log::debug!("Found document '{}' with direct ID lookup", document_id);
1075                return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1076            }
1077            Err(e) => {
1078                log::debug!(
1079                    "Document '{}' not found with direct lookup: {:?}",
1080                    document_id,
1081                    e
1082                );
1083            }
1084        }
1085
1086        // 2️⃣ If the provided ID looks like a filename, try with normalized ID
1087        if document_id.contains('.') || document_id.contains('-') || document_id.contains('_') {
1088            let normalized_id = normalize_filename_to_id(document_id);
1089            log::debug!(
1090                "Trying normalized ID '{}' for filename '{}'",
1091                normalized_id,
1092                document_id
1093            );
1094
1095            let mut normalized_placeholder = Document {
1096                id: normalized_id.clone(),
1097                ..Default::default()
1098            };
1099            match normalized_placeholder.load().await {
1100                Ok(doc) => {
1101                    log::debug!(
1102                        "Found document '{}' with normalized ID '{}'",
1103                        document_id,
1104                        normalized_id
1105                    );
1106                    return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1107                }
1108                Err(e) => {
1109                    log::debug!(
1110                        "Document '{}' not found with normalized ID '{}': {:?}",
1111                        document_id,
1112                        normalized_id,
1113                        e
1114                    );
1115                }
1116            }
1117        }
1118
1119        // 3️⃣ Fallback: search by title (for documents where title contains the original filename)
1120        log::debug!("Falling back to search for document '{}'", document_id);
1121        let search_query = SearchQuery {
1122            search_term: NormalizedTermValue::new(document_id.to_string()),
1123            search_terms: None,
1124            operator: None,
1125            limit: Some(5), // Get a few results to check titles
1126            skip: None,
1127            role: None,
1128            layer: Layer::default(),
1129            include_pinned: false,
1130            min_quality: None,
1131        };
1132
1133        let documents = self.search(&search_query).await?;
1134
1135        // Look for a document whose title matches the requested ID
1136        for doc in documents {
1137            if doc.title == document_id || doc.id == document_id {
1138                log::debug!("Found document '{}' via search fallback", document_id);
1139                return self.apply_kg_preprocessing_if_needed(doc).await.map(Some);
1140            }
1141        }
1142
1143        log::debug!("Document '{}' not found anywhere", document_id);
1144        Ok(None)
1145    }
1146
1147    /// Apply KG preprocessing to a document if needed based on the current selected role
1148    ///
1149    /// This helper method checks if the selected role has terraphim_it enabled
1150    /// and applies KG term preprocessing accordingly. It prevents double processing
1151    /// by checking if KG links already exist in the document.
1152    async fn apply_kg_preprocessing_if_needed(&mut self, document: Document) -> Result<Document> {
1153        log::debug!(
1154            "🔍 [KG-DEBUG] apply_kg_preprocessing_if_needed called for document: '{}'",
1155            document.title
1156        );
1157        log::debug!(
1158            "🔍 [KG-DEBUG] Document body preview: {}",
1159            document.body.chars().take(100).collect::<String>()
1160        );
1161
1162        let role = {
1163            let config = self.config_state.config.lock().await;
1164            let selected_role = &config.selected_role;
1165
1166            log::debug!("🔍 [KG-DEBUG] Selected role: '{}'", selected_role);
1167
1168            match config.roles.get(selected_role) {
1169                Some(role) => {
1170                    log::debug!(
1171                        "🔍 [KG-DEBUG] Role found: '{}', terraphim_it: {}",
1172                        role.name,
1173                        role.terraphim_it
1174                    );
1175                    role.clone() // Clone to avoid borrowing issues
1176                }
1177                None => {
1178                    log::warn!(
1179                        "❌ [KG-DEBUG] Selected role '{}' not found in config, skipping KG preprocessing",
1180                        selected_role
1181                    );
1182                    return Ok(document);
1183                }
1184            }
1185        }; // Release the lock here
1186
1187        // Only apply preprocessing if role has terraphim_it enabled
1188        if !role.terraphim_it {
1189            log::info!(
1190                "🔍 [KG-DEBUG] terraphim_it disabled for role '{}', skipping KG preprocessing",
1191                role.name
1192            );
1193            return Ok(document);
1194        }
1195
1196        // Check if document already has KG links to prevent double processing
1197        let has_existing_kg_links = document.body.contains("](kg:");
1198        log::debug!(
1199            "🔍 [KG-DEBUG] Document already has KG links: {}",
1200            has_existing_kg_links
1201        );
1202        if has_existing_kg_links {
1203            log::info!(
1204                "🔍 [KG-DEBUG] Document '{}' already has KG links, skipping preprocessing to prevent double processing",
1205                document.title
1206            );
1207            return Ok(document);
1208        }
1209
1210        log::info!(
1211            "🧠 [KG-DEBUG] Starting KG preprocessing for document '{}' with role '{}' (terraphim_it enabled)",
1212            document.title,
1213            role.name
1214        );
1215
1216        // Apply KG preprocessing
1217        let document_title = document.title.clone(); // Save title before moving document
1218        let processed_doc = match self.preprocess_document_content(document, &role).await {
1219            Ok(doc) => {
1220                let links_added = doc.body.contains("](kg:");
1221                log::info!(
1222                    "✅ [KG-DEBUG] KG preprocessing completed for document '{}'. Links added: {}",
1223                    doc.title,
1224                    links_added
1225                );
1226                if links_added {
1227                    log::debug!(
1228                        "🔍 [KG-DEBUG] Processed body preview: {}",
1229                        doc.body.chars().take(200).collect::<String>()
1230                    );
1231                }
1232                doc
1233            }
1234            Err(e) => {
1235                log::error!(
1236                    "❌ [KG-DEBUG] KG preprocessing failed for document '{}': {:?}",
1237                    document_title,
1238                    e
1239                );
1240                return Err(e);
1241            }
1242        };
1243
1244        Ok(processed_doc)
1245    }
1246
1247    /// Enhance document descriptions with AI-generated summaries using OpenRouter
1248    ///
1249    /// This method uses the OpenRouter service to generate intelligent summaries
1250    /// of document content, replacing basic text excerpts with AI-powered descriptions.
1251    #[allow(dead_code)] // Used in 7+ places but compiler can't see due to async/feature boundaries
1252    async fn enhance_descriptions_with_ai(
1253        &self,
1254        mut documents: Vec<Document>,
1255        role: &Role,
1256    ) -> Result<Vec<Document>> {
1257        use crate::llm::{SummarizeOptions, build_llm_from_role};
1258
1259        eprintln!("🤖 Attempting to build LLM client for role: {}", role.name);
1260        let llm = match build_llm_from_role(role) {
1261            Some(client) => {
1262                eprintln!("✅ LLM client successfully created: {}", client.name());
1263                client
1264            }
1265            None => {
1266                eprintln!("❌ No LLM client available for role: {}", role.name);
1267                return Ok(documents);
1268            }
1269        };
1270
1271        log::info!(
1272            "Enhancing {} document descriptions with LLM provider: {}",
1273            documents.len(),
1274            llm.name()
1275        );
1276
1277        let mut enhanced_count = 0;
1278        let mut error_count = 0;
1279
1280        for document in &mut documents {
1281            if self.should_generate_ai_summary(document) {
1282                let summary_length = 250;
1283                match llm
1284                    .summarize(
1285                        &document.body,
1286                        SummarizeOptions {
1287                            max_length: summary_length,
1288                        },
1289                    )
1290                    .await
1291                {
1292                    Ok(ai_summary) => {
1293                        log::debug!(
1294                            "Generated AI summary for '{}': {} characters",
1295                            document.title,
1296                            ai_summary.len()
1297                        );
1298                        document.description = Some(ai_summary);
1299                        enhanced_count += 1;
1300                    }
1301                    Err(e) => {
1302                        log::warn!(
1303                            "Failed to generate AI summary for '{}': {}",
1304                            document.title,
1305                            e
1306                        );
1307                        error_count += 1;
1308                    }
1309                }
1310            }
1311        }
1312
1313        log::info!(
1314            "LLM enhancement complete: {} enhanced, {} errors, {} skipped",
1315            enhanced_count,
1316            error_count,
1317            documents.len() - enhanced_count - error_count
1318        );
1319
1320        Ok(documents)
1321    }
1322
1323    /// Determine if a document should receive an AI-generated summary
1324    ///
1325    /// This helper method checks various criteria to decide whether a document
1326    /// would benefit from AI summarization.
1327    #[allow(dead_code)] // Used by enhance_descriptions_with_ai, compiler can't see due to async boundaries
1328    fn should_generate_ai_summary(&self, document: &Document) -> bool {
1329        // Don't enhance if the document body is too short to summarize meaningfully
1330        if document.body.trim().len() < 200 {
1331            return false;
1332        }
1333
1334        // Don't enhance if we already have a high-quality description
1335        if let Some(ref description) = document.description {
1336            // If the description is substantial and doesn't look like a simple excerpt, keep it
1337            if description.len() > 100 && !description.ends_with("...") {
1338                return false;
1339            }
1340        }
1341
1342        // Don't enhance very large documents (cost control)
1343        if document.body.len() > 8000 {
1344            return false;
1345        }
1346
1347        // Good candidates for AI summarization
1348        true
1349    }
1350
1351    /// Get the role for the given search query
1352    async fn get_search_role(&self, search_query: &SearchQuery) -> Result<Role> {
1353        let search_role = match &search_query.role {
1354            Some(role) => role.clone(),
1355            None => self.config_state.get_default_role().await,
1356        };
1357
1358        log::debug!("Searching for role: {:?}", search_role);
1359        let Some(role) = self.config_state.get_role(&search_role).await else {
1360            return Err(ServiceError::Config(format!(
1361                "Role `{}` not found in config",
1362                search_role
1363            )));
1364        };
1365        Ok(role)
1366    }
1367
1368    /// Check if a character is a word boundary (not alphanumeric or underscore).
1369    /// This provides Unicode-aware word boundary detection.
1370    fn is_word_boundary_char(c: char) -> bool {
1371        !c.is_alphanumeric() && c != '_'
1372    }
1373
1374    /// Check if a match position is at word boundaries in the text.
1375    /// Returns true if the character before start (or start of string) and
1376    /// the character after end (or end of string) are word boundary characters.
1377    fn is_at_word_boundary(text: &str, start: usize, end: usize) -> bool {
1378        let before_ok = if start == 0 {
1379            true
1380        } else {
1381            text[..start]
1382                .chars()
1383                .last()
1384                .map(Self::is_word_boundary_char)
1385                .unwrap_or(true)
1386        };
1387
1388        let after_ok = if end >= text.len() {
1389            true
1390        } else {
1391            text[end..]
1392                .chars()
1393                .next()
1394                .map(Self::is_word_boundary_char)
1395                .unwrap_or(true)
1396        };
1397
1398        before_ok && after_ok
1399    }
1400
1401    /// Match a term against text using unicode-aware word boundaries.
1402    /// Returns true if the term appears as a complete word (not as part of another word).
1403    /// Both inputs should already be lowercase for efficiency.
1404    fn term_matches_with_word_boundaries(term: &str, text: &str) -> bool {
1405        // Find all occurrences of the term in the text
1406        let mut start = 0;
1407        while let Some(pos) = text[start..].find(term) {
1408            let abs_start = start + pos;
1409            let abs_end = abs_start + term.len();
1410
1411            if Self::is_at_word_boundary(text, abs_start, abs_end) {
1412                return true;
1413            }
1414            start = abs_end;
1415        }
1416        false
1417    }
1418
1419    /// Apply logical operators (AND/OR) to filter documents based on multiple search terms
1420    pub async fn apply_logical_operators_to_documents(
1421        &mut self,
1422        search_query: &SearchQuery,
1423        documents: Vec<Document>,
1424    ) -> Result<Vec<Document>> {
1425        use terraphim_types::LogicalOperator;
1426
1427        let all_terms = search_query.get_all_terms();
1428        let operator = search_query.get_operator();
1429
1430        let initial_doc_count = documents.len();
1431
1432        log::debug!(
1433            "Applying {:?} operator to {} documents with {} search terms",
1434            operator,
1435            initial_doc_count,
1436            all_terms.len()
1437        );
1438
1439        // Pre-compute lowercase terms once for efficiency
1440        let terms_lower: Vec<String> = all_terms
1441            .iter()
1442            .map(|t| t.as_str().to_lowercase())
1443            .collect();
1444
1445        let filtered_docs: Vec<Document> = documents
1446            .into_iter()
1447            .filter(|doc| {
1448                // Create searchable text from document
1449                let searchable_text = format!(
1450                    "{} {} {}",
1451                    doc.title.to_lowercase(),
1452                    doc.body.to_lowercase(),
1453                    doc.description
1454                        .as_ref()
1455                        .unwrap_or(&String::new())
1456                        .to_lowercase()
1457                );
1458
1459                match operator {
1460                    LogicalOperator::And => {
1461                        // Document must contain ALL terms as whole words
1462                        terms_lower.iter().all(|term| {
1463                            Self::term_matches_with_word_boundaries(term, &searchable_text)
1464                        })
1465                    }
1466                    LogicalOperator::Or => {
1467                        // Document must contain ANY term as a whole word
1468                        terms_lower.iter().any(|term| {
1469                            Self::term_matches_with_word_boundaries(term, &searchable_text)
1470                        })
1471                    }
1472                }
1473            })
1474            .collect();
1475
1476        log::debug!(
1477            "Logical operator filtering: {} -> {} documents",
1478            initial_doc_count,
1479            filtered_docs.len()
1480        );
1481
1482        // Sort filtered documents by relevance using a combined query
1483        let combined_query_string = terms_lower.join(" ");
1484        let query = Query::new(&combined_query_string);
1485        let sorted_docs = score::sort_documents(&query, filtered_docs);
1486
1487        Ok(sorted_docs)
1488    }
1489
1490    /// search for documents in the haystacks with selected role from the config
1491    /// and return the documents sorted by relevance
1492    pub async fn search_documents_selected_role(
1493        &mut self,
1494        search_term: &NormalizedTermValue,
1495    ) -> Result<Vec<Document>> {
1496        let role = self.config_state.get_selected_role().await;
1497        let documents = self
1498            .search(&SearchQuery {
1499                search_term: search_term.clone(),
1500                search_terms: None,
1501                operator: None,
1502                role: Some(role),
1503                skip: None,
1504                limit: None,
1505                layer: Layer::default(),
1506                include_pinned: false,
1507                min_quality: None,
1508            })
1509            .await?;
1510        Ok(documents)
1511    }
1512
1513    /// Filter documents by minimum composite quality score.
1514    ///
1515    /// Documents without a quality score are excluded when `min_quality` is set.
1516    /// Out-of-range thresholds are clamped to `[0.0, 1.0]` before comparison.
1517    fn apply_min_quality_filter(docs: Vec<Document>, min_quality: Option<f64>) -> Vec<Document> {
1518        let Some(threshold) = min_quality else {
1519            return docs;
1520        };
1521        let threshold = threshold.clamp(0.0, 1.0);
1522        docs.into_iter()
1523            .filter(|doc| {
1524                doc.quality_score
1525                    .as_ref()
1526                    .map(|qs| qs.composite() >= threshold)
1527                    .unwrap_or(false)
1528            })
1529            .collect()
1530    }
1531
1532    /// Search for documents in the haystacks
1533    pub async fn search(&mut self, search_query: &SearchQuery) -> Result<Vec<Document>> {
1534        // Get the role from the config
1535        log::debug!("Role for searching: {:?}", search_query.role);
1536        let role = self.get_search_role(search_query).await?;
1537
1538        log::trace!("Building index for search query: {:?}", search_query);
1539        let index: Index =
1540            terraphim_middleware::search_haystacks(self.config_state.clone(), search_query.clone())
1541                .await?;
1542
1543        let min_quality = search_query.min_quality;
1544
1545        let docs_result: Result<Vec<Document>> = match role.relevance_function {
1546            RelevanceFunction::TitleScorer => {
1547                log::debug!("Searching haystack with title scorer");
1548
1549                let documents = index.get_all_documents();
1550
1551                log::debug!("Sorting documents by relevance");
1552
1553                let documents = if search_query.is_multi_term_query() {
1554                    // Handle multi-term queries with logical operators
1555                    self.apply_logical_operators_to_documents(search_query, documents)
1556                        .await?
1557                } else {
1558                    // Single term query (backward compatibility)
1559                    let query = Query::new(&search_query.search_term.to_string());
1560                    score::sort_documents(&query, documents)
1561                };
1562                let total_length = documents.len();
1563                let mut docs_ranked = Vec::new();
1564                for (idx, doc) in documents.iter().enumerate() {
1565                    let mut document: terraphim_types::Document = doc.clone();
1566                    let rank = (total_length - idx).try_into().unwrap();
1567                    document.rank = Some(rank);
1568
1569                    // 🔄 Enhanced persistence layer integration for both local and Atomic Data documents
1570                    if document.id.starts_with("http://") || document.id.starts_with("https://") {
1571                        // Atomic Data document: Check persistence first, then save for future queries
1572                        log::debug!(
1573                            "Processing Atomic Data document '{}' (URL: {})",
1574                            document.title,
1575                            document.id
1576                        );
1577
1578                        // Try to load from persistence first (for cached Atomic Data documents)
1579                        let mut placeholder = Document {
1580                            id: document.id.clone(),
1581                            ..Default::default()
1582                        };
1583                        match placeholder.load().await {
1584                            Ok(persisted_doc) => {
1585                                // Found in persistence - use cached version
1586                                log::debug!(
1587                                    "Found cached Atomic Data document '{}' in persistence",
1588                                    document.title
1589                                );
1590                                if let Some(better_description) = persisted_doc.description {
1591                                    document.description = Some(better_description);
1592                                }
1593                                // Update body if the persisted version has better content
1594                                // But DO NOT overwrite if this role uses KG preprocessing (terraphim_it)
1595                                // because we need to preserve the processed content with KG links
1596                                if !persisted_doc.body.is_empty() && !role.terraphim_it {
1597                                    log::debug!(
1598                                        "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
1599                                        document.title,
1600                                        role.name,
1601                                        role.terraphim_it
1602                                    );
1603                                    document.body = persisted_doc.body;
1604                                } else if role.terraphim_it {
1605                                    log::debug!(
1606                                        "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
1607                                        document.title,
1608                                        role.name
1609                                    );
1610                                }
1611                            }
1612                            Err(_) => {
1613                                // Not in persistence - save this Atomic Data document for future queries
1614                                log::debug!(
1615                                    "Caching Atomic Data document '{}' to persistence for future queries",
1616                                    document.title
1617                                );
1618
1619                                // Save in background to avoid blocking the response
1620                                let doc_to_save = document.clone();
1621                                tokio::spawn(async move {
1622                                    if let Err(e) = doc_to_save.save().await {
1623                                        log::warn!(
1624                                            "Failed to cache Atomic Data document '{}': {}",
1625                                            doc_to_save.title,
1626                                            e
1627                                        );
1628                                    } else {
1629                                        log::debug!(
1630                                            "Successfully cached Atomic Data document '{}'",
1631                                            doc_to_save.title
1632                                        );
1633                                    }
1634                                });
1635                            }
1636                        }
1637                    } else {
1638                        // Local document: Try direct persistence lookup first
1639                        let should_lookup_persistence = document
1640                            .get_source_haystack()
1641                            .and_then(|source| {
1642                                role.haystacks
1643                                    .iter()
1644                                    .find(|haystack| haystack.location == *source)
1645                            })
1646                            .map(|haystack| haystack.fetch_content)
1647                            .unwrap_or(true);
1648
1649                        if !should_lookup_persistence {
1650                            log::trace!(
1651                                "Skipping persistence lookup for '{}' (haystack fetch_content=false)",
1652                                document.title
1653                            );
1654                        } else {
1655                            let mut placeholder = Document {
1656                                id: document.id.clone(),
1657                                ..Default::default()
1658                            };
1659                            if let Ok(persisted_doc) = placeholder.load().await {
1660                                if let Some(better_description) = persisted_doc.description {
1661                                    log::debug!(
1662                                        "Replaced ripgrep description for '{}' with persistence description",
1663                                        document.title
1664                                    );
1665                                    document.description = Some(better_description);
1666                                }
1667                            } else {
1668                                // Try normalized ID based on document title (filename)
1669                                // For KG files, the title might be "haystack" but persistence ID is "haystackmd"
1670                                let normalized_id = normalize_filename_to_id(&document.title);
1671
1672                                let mut normalized_placeholder = Document {
1673                                    id: normalized_id.clone(),
1674                                    ..Default::default()
1675                                };
1676                                if let Ok(persisted_doc) = normalized_placeholder.load().await {
1677                                    if let Some(better_description) = persisted_doc.description {
1678                                        log::debug!(
1679                                            "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
1680                                            document.title,
1681                                            normalized_id
1682                                        );
1683                                        document.description = Some(better_description);
1684                                    }
1685                                } else {
1686                                    // Try with "md" suffix for KG files (title "haystack" -> ID "haystackmd")
1687                                    let normalized_id_with_md = format!("{}md", normalized_id);
1688                                    let mut md_placeholder = Document {
1689                                        id: normalized_id_with_md.clone(),
1690                                        ..Default::default()
1691                                    };
1692                                    if let Ok(persisted_doc) = md_placeholder.load().await {
1693                                        if let Some(better_description) = persisted_doc.description
1694                                        {
1695                                            log::debug!(
1696                                                "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
1697                                                document.title,
1698                                                normalized_id_with_md
1699                                            );
1700                                            document.description = Some(better_description);
1701                                        }
1702                                    } else {
1703                                        log::debug!(
1704                                            "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
1705                                            document.title,
1706                                            document.id,
1707                                            normalized_id,
1708                                            normalized_id_with_md
1709                                        );
1710                                    }
1711                                }
1712                            }
1713                        }
1714                    }
1715
1716                    docs_ranked.push(document);
1717                }
1718
1719                // Apply OpenRouter AI summarization if enabled for this role and auto-summarize is on
1720                // Apply AI summarization if enabled via OpenRouter or generic LLM config
1721                #[cfg(feature = "openrouter")]
1722                if role.has_llm_config() && role.llm_auto_summarize {
1723                    log::debug!(
1724                        "Applying OpenRouter AI summarization to {} search results for role '{}'",
1725                        docs_ranked.len(),
1726                        role.name
1727                    );
1728                    docs_ranked = self
1729                        .enhance_descriptions_with_ai(docs_ranked, &role)
1730                        .await?;
1731                } else {
1732                    // Always apply LLM AI summarization if LLM client is available
1733                    eprintln!(
1734                        "📋 Entering LLM AI summarization branch for role: {}",
1735                        role.name
1736                    );
1737                    log::debug!(
1738                        "Applying LLM AI summarization to {} search results for role '{}'",
1739                        docs_ranked.len(),
1740                        role.name
1741                    );
1742                    docs_ranked = self
1743                        .enhance_descriptions_with_ai(docs_ranked, &role)
1744                        .await?;
1745                }
1746
1747                // Apply KG preprocessing if enabled for this role (but only once, not in individual document loads)
1748                if role.terraphim_it {
1749                    log::info!(
1750                        "🧠 Applying KG preprocessing to {} TerraphimGraph search results for role '{}'",
1751                        docs_ranked.len(),
1752                        role.name
1753                    );
1754                    let mut processed_docs = Vec::new();
1755                    let mut total_kg_terms = 0;
1756                    let mut docs_with_kg_links = 0;
1757
1758                    for document in docs_ranked {
1759                        let original_body_len = document.body.len();
1760                        let processed_doc =
1761                            self.preprocess_document_content(document, &role).await?;
1762
1763                        // Count KG links added (rough estimate by body size increase)
1764                        let new_body_len = processed_doc.body.len();
1765                        if new_body_len > original_body_len {
1766                            docs_with_kg_links += 1;
1767                            // Rough estimate: each KG link adds ~15-20 chars on average
1768                            let estimated_links = (new_body_len - original_body_len) / 17;
1769                            total_kg_terms += estimated_links;
1770                        }
1771
1772                        processed_docs.push(processed_doc);
1773                    }
1774
1775                    log::info!(
1776                        "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1777                        processed_docs.len(),
1778                        docs_with_kg_links,
1779                        total_kg_terms
1780                    );
1781                    Ok(processed_docs)
1782                } else {
1783                    Ok(docs_ranked)
1784                }
1785            }
1786            RelevanceFunction::BM25 => {
1787                log::debug!("Searching haystack with BM25 scorer");
1788
1789                let documents = index.get_all_documents();
1790
1791                log::debug!("Sorting documents by BM25 relevance");
1792
1793                let documents = if search_query.is_multi_term_query() {
1794                    // Handle multi-term queries with logical operators
1795                    let filtered_docs = self
1796                        .apply_logical_operators_to_documents(search_query, documents)
1797                        .await?;
1798                    // Apply BM25 scoring to filtered documents
1799                    let combined_query_string = search_query
1800                        .get_all_terms()
1801                        .iter()
1802                        .map(|t| t.as_str())
1803                        .collect::<Vec<_>>()
1804                        .join(" ");
1805                    let query =
1806                        Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25);
1807                    score::sort_documents(&query, filtered_docs)
1808                } else {
1809                    // Single term query (backward compatibility)
1810                    let query = Query::new(&search_query.search_term.to_string())
1811                        .name_scorer(score::QueryScorer::BM25);
1812                    score::sort_documents(&query, documents)
1813                };
1814                let total_length = documents.len();
1815                let mut docs_ranked = Vec::new();
1816                for (idx, doc) in documents.iter().enumerate() {
1817                    let mut document: terraphim_types::Document = doc.clone();
1818                    let rank = (total_length - idx).try_into().unwrap();
1819                    document.rank = Some(rank);
1820                    docs_ranked.push(document);
1821                }
1822
1823                // Apply OpenRouter AI summarization if enabled for this role and auto-summarize is on
1824                #[cfg(feature = "openrouter")]
1825                if role.has_llm_config() && role.llm_auto_summarize {
1826                    log::debug!(
1827                        "Applying OpenRouter AI summarization to {} BM25 search results for role '{}'",
1828                        docs_ranked.len(),
1829                        role.name
1830                    );
1831                    docs_ranked = self
1832                        .enhance_descriptions_with_ai(docs_ranked, &role)
1833                        .await?;
1834                } else {
1835                    // Always apply LLM AI summarization if LLM client is available
1836                    log::debug!(
1837                        "Applying LLM AI summarization to {} BM25 search results for role '{}'",
1838                        docs_ranked.len(),
1839                        role.name
1840                    );
1841                    docs_ranked = self
1842                        .enhance_descriptions_with_ai(docs_ranked, &role)
1843                        .await?;
1844                }
1845
1846                // Apply KG preprocessing if enabled for this role
1847                if role.terraphim_it {
1848                    log::info!(
1849                        "🧠 Applying KG preprocessing to {} BM25 search results for role '{}'",
1850                        docs_ranked.len(),
1851                        role.name
1852                    );
1853                    let mut processed_docs = Vec::new();
1854                    let mut total_kg_terms = 0;
1855                    let mut docs_with_kg_links = 0;
1856
1857                    for document in docs_ranked {
1858                        let original_body_len = document.body.len();
1859                        let processed_doc =
1860                            self.preprocess_document_content(document, &role).await?;
1861
1862                        // Count KG links added (rough estimate by body size increase)
1863                        let new_body_len = processed_doc.body.len();
1864                        if new_body_len > original_body_len {
1865                            docs_with_kg_links += 1;
1866                            let estimated_links = (new_body_len - original_body_len) / 17;
1867                            total_kg_terms += estimated_links;
1868                        }
1869
1870                        processed_docs.push(processed_doc);
1871                    }
1872
1873                    log::info!(
1874                        "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1875                        processed_docs.len(),
1876                        docs_with_kg_links,
1877                        total_kg_terms
1878                    );
1879                    Ok(processed_docs)
1880                } else {
1881                    Ok(docs_ranked)
1882                }
1883            }
1884            RelevanceFunction::BM25F => {
1885                log::debug!("Searching haystack with BM25F scorer");
1886
1887                let documents = index.get_all_documents();
1888
1889                log::debug!("Sorting documents by BM25F relevance");
1890
1891                let documents = if search_query.is_multi_term_query() {
1892                    // Handle multi-term queries with logical operators
1893                    let filtered_docs = self
1894                        .apply_logical_operators_to_documents(search_query, documents)
1895                        .await?;
1896                    // Apply BM25F scoring to filtered documents
1897                    let combined_query_string = search_query
1898                        .get_all_terms()
1899                        .iter()
1900                        .map(|t| t.as_str())
1901                        .collect::<Vec<_>>()
1902                        .join(" ");
1903                    let query =
1904                        Query::new(&combined_query_string).name_scorer(score::QueryScorer::BM25F);
1905                    score::sort_documents(&query, filtered_docs)
1906                } else {
1907                    // Single term query (backward compatibility)
1908                    let query = Query::new(&search_query.search_term.to_string())
1909                        .name_scorer(score::QueryScorer::BM25F);
1910                    score::sort_documents(&query, documents)
1911                };
1912                let total_length = documents.len();
1913                let mut docs_ranked = Vec::new();
1914                for (idx, doc) in documents.iter().enumerate() {
1915                    let mut document: terraphim_types::Document = doc.clone();
1916                    let rank = (total_length - idx).try_into().unwrap();
1917                    document.rank = Some(rank);
1918                    docs_ranked.push(document);
1919                }
1920
1921                // Apply OpenRouter AI summarization if enabled for this role and auto-summarize is on
1922                #[cfg(feature = "openrouter")]
1923                if role.has_llm_config() && role.llm_auto_summarize {
1924                    log::debug!(
1925                        "Applying OpenRouter AI summarization to {} BM25F search results for role '{}'",
1926                        docs_ranked.len(),
1927                        role.name
1928                    );
1929                    docs_ranked = self
1930                        .enhance_descriptions_with_ai(docs_ranked, &role)
1931                        .await?;
1932                } else {
1933                    // Always apply LLM AI summarization if LLM client is available
1934                    log::debug!(
1935                        "Applying LLM AI summarization to {} BM25F search results for role '{}'",
1936                        docs_ranked.len(),
1937                        role.name
1938                    );
1939                    docs_ranked = self
1940                        .enhance_descriptions_with_ai(docs_ranked, &role)
1941                        .await?;
1942                }
1943
1944                // Apply KG preprocessing if enabled for this role
1945                if role.terraphim_it {
1946                    log::info!(
1947                        "🧠 Applying KG preprocessing to {} BM25F search results for role '{}'",
1948                        docs_ranked.len(),
1949                        role.name
1950                    );
1951                    let mut processed_docs = Vec::new();
1952                    let mut total_kg_terms = 0;
1953                    let mut docs_with_kg_links = 0;
1954
1955                    for document in docs_ranked {
1956                        let original_body_len = document.body.len();
1957                        let processed_doc =
1958                            self.preprocess_document_content(document, &role).await?;
1959
1960                        // Count KG links added (rough estimate by body size increase)
1961                        let new_body_len = processed_doc.body.len();
1962                        if new_body_len > original_body_len {
1963                            docs_with_kg_links += 1;
1964                            let estimated_links = (new_body_len - original_body_len) / 17;
1965                            total_kg_terms += estimated_links;
1966                        }
1967
1968                        processed_docs.push(processed_doc);
1969                    }
1970
1971                    log::info!(
1972                        "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
1973                        processed_docs.len(),
1974                        docs_with_kg_links,
1975                        total_kg_terms
1976                    );
1977                    Ok(processed_docs)
1978                } else {
1979                    Ok(docs_ranked)
1980                }
1981            }
1982            RelevanceFunction::BM25Plus => {
1983                log::debug!("Searching haystack with BM25Plus scorer");
1984
1985                let documents = index.get_all_documents();
1986
1987                log::debug!("Sorting documents by BM25Plus relevance");
1988
1989                let documents = if search_query.is_multi_term_query() {
1990                    // Handle multi-term queries with logical operators
1991                    let filtered_docs = self
1992                        .apply_logical_operators_to_documents(search_query, documents)
1993                        .await?;
1994                    // Apply BM25Plus scoring to filtered documents
1995                    let combined_query_string = search_query
1996                        .get_all_terms()
1997                        .iter()
1998                        .map(|t| t.as_str())
1999                        .collect::<Vec<_>>()
2000                        .join(" ");
2001                    let query = Query::new(&combined_query_string)
2002                        .name_scorer(score::QueryScorer::BM25Plus);
2003                    score::sort_documents(&query, filtered_docs)
2004                } else {
2005                    // Single term query (backward compatibility)
2006                    let query = Query::new(&search_query.search_term.to_string())
2007                        .name_scorer(score::QueryScorer::BM25Plus);
2008                    score::sort_documents(&query, documents)
2009                };
2010                let total_length = documents.len();
2011                let mut docs_ranked = Vec::new();
2012                for (idx, doc) in documents.iter().enumerate() {
2013                    let mut document: terraphim_types::Document = doc.clone();
2014                    let rank = (total_length - idx).try_into().unwrap();
2015                    document.rank = Some(rank);
2016                    docs_ranked.push(document);
2017                }
2018
2019                // Apply OpenRouter AI summarization if enabled for this role and auto-summarize is on
2020                #[cfg(feature = "openrouter")]
2021                if role.has_llm_config() && role.llm_auto_summarize {
2022                    log::debug!(
2023                        "Applying OpenRouter AI summarization to {} BM25Plus search results for role '{}'",
2024                        docs_ranked.len(),
2025                        role.name
2026                    );
2027                    docs_ranked = self
2028                        .enhance_descriptions_with_ai(docs_ranked, &role)
2029                        .await?;
2030                }
2031
2032                // Apply KG preprocessing if enabled for this role
2033                if role.terraphim_it {
2034                    log::info!(
2035                        "🧠 Applying KG preprocessing to {} BM25Plus search results for role '{}'",
2036                        docs_ranked.len(),
2037                        role.name
2038                    );
2039                    let mut processed_docs = Vec::new();
2040                    let mut total_kg_terms = 0;
2041                    let mut docs_with_kg_links = 0;
2042
2043                    for document in docs_ranked {
2044                        let original_body_len = document.body.len();
2045                        let processed_doc =
2046                            self.preprocess_document_content(document, &role).await?;
2047
2048                        // Count KG links added (rough estimate by body size increase)
2049                        let new_body_len = processed_doc.body.len();
2050                        if new_body_len > original_body_len {
2051                            docs_with_kg_links += 1;
2052                            let estimated_links = (new_body_len - original_body_len) / 17;
2053                            total_kg_terms += estimated_links;
2054                        }
2055
2056                        processed_docs.push(processed_doc);
2057                    }
2058
2059                    log::info!(
2060                        "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2061                        processed_docs.len(),
2062                        docs_with_kg_links,
2063                        total_kg_terms
2064                    );
2065                    Ok(processed_docs)
2066                } else {
2067                    Ok(docs_ranked)
2068                }
2069            }
2070            RelevanceFunction::TerraphimGraph => {
2071                log::debug!("TerraphimGraph search initiated for role: {}", role.name);
2072                self.build_thesaurus(search_query).await?;
2073                let _thesaurus = self.ensure_thesaurus_loaded(&role.name).await?;
2074                let scored_index_docs: Vec<IndexedDocument> = self
2075                    .config_state
2076                    .search_indexed_documents(search_query, &role)
2077                    .await;
2078
2079                log::debug!(
2080                    "TerraphimGraph search found {} indexed documents",
2081                    scored_index_docs.len()
2082                );
2083
2084                // Apply to ripgrep vector of document output
2085                // I.e. use the ranking of thesaurus to rank the documents here
2086                log::debug!("Ranking documents with thesaurus");
2087                let mut documents = index.get_documents(scored_index_docs.clone());
2088
2089                // CRITICAL FIX: Index all haystack documents into rolegraph if not already present
2090                // This ensures TerraphimGraph search can find documents discovered by haystacks
2091                let all_haystack_docs = index.get_all_documents();
2092                log::debug!(
2093                    "Found {} total documents from haystacks, checking which need indexing",
2094                    all_haystack_docs.len()
2095                );
2096                let mut need_reindexing = false;
2097
2098                if let Some(rolegraph_sync) = self.config_state.roles.get(&role.name) {
2099                    let mut rolegraph = rolegraph_sync.lock().await;
2100                    let mut newly_indexed = 0;
2101
2102                    for doc in &all_haystack_docs {
2103                        // Only index documents that aren't already in the rolegraph
2104                        if !rolegraph.has_document(&doc.id) && !doc.body.is_empty() {
2105                            log::debug!(
2106                                "Indexing new document '{}' into rolegraph for TerraphimGraph search",
2107                                doc.id
2108                            );
2109                            rolegraph.insert_document(&doc.id, doc.clone());
2110
2111                            // Save document to persistence to ensure it's available for kg_search
2112                            // Drop the rolegraph lock temporarily to avoid deadlocks during async save
2113                            drop(rolegraph);
2114                            if let Err(e) = doc.save().await {
2115                                log::warn!(
2116                                    "Failed to save document '{}' to persistence: {}",
2117                                    doc.id,
2118                                    e
2119                                );
2120                            } else {
2121                                log::debug!(
2122                                    "Successfully saved document '{}' to persistence",
2123                                    doc.id
2124                                );
2125                            }
2126                            // Re-acquire the lock
2127                            rolegraph = rolegraph_sync.lock().await;
2128
2129                            newly_indexed += 1;
2130                        }
2131                    }
2132
2133                    if newly_indexed > 0 {
2134                        log::info!(
2135                            "✅ Indexed {} new documents into rolegraph for role '{}'",
2136                            newly_indexed,
2137                            role.name
2138                        );
2139                        log::debug!(
2140                            "RoleGraph now has {} nodes, {} edges, {} documents",
2141                            rolegraph.get_node_count(),
2142                            rolegraph.get_edge_count(),
2143                            rolegraph.get_document_count()
2144                        );
2145                        need_reindexing = true; // We'll use the existing re-search logic below
2146                    }
2147                }
2148
2149                // CRITICAL FIX: Ensure documents have body content loaded from persistence
2150                // If documents don't have body content, they won't contribute to graph nodes properly
2151                let mut documents_with_content = Vec::new();
2152
2153                for mut document in documents {
2154                    // Check if document body is empty or missing
2155                    if document.body.is_empty() {
2156                        log::debug!(
2157                            "Document '{}' has empty body, attempting to load from persistence",
2158                            document.id
2159                        );
2160
2161                        // Try to load full document from persistence with fallback
2162                        let mut full_doc = Document::new(document.id.clone());
2163                        match full_doc.load().await {
2164                            Ok(loaded_doc) => {
2165                                if !loaded_doc.body.is_empty() {
2166                                    log::info!(
2167                                        "✅ Loaded body content for document '{}' from persistence",
2168                                        document.id
2169                                    );
2170                                    document.body = loaded_doc.body.clone();
2171                                    if loaded_doc.description.is_some() {
2172                                        document.description = loaded_doc.description.clone();
2173                                    }
2174
2175                                    // Re-index document into rolegraph with proper content
2176                                    if let Some(rolegraph_sync) =
2177                                        self.config_state.roles.get(&role.name)
2178                                    {
2179                                        let mut rolegraph = rolegraph_sync.lock().await;
2180                                        rolegraph.insert_document(&document.id, loaded_doc);
2181                                        need_reindexing = true;
2182                                        log::debug!(
2183                                            "Re-indexed document '{}' into rolegraph with content",
2184                                            document.id
2185                                        );
2186                                    }
2187                                } else {
2188                                    log::warn!(
2189                                        "Document '{}' still has empty body after loading from persistence",
2190                                        document.id
2191                                    );
2192                                }
2193                            }
2194                            Err(e) => {
2195                                log::warn!(
2196                                    "Failed to load document '{}' from persistence: {}",
2197                                    document.id,
2198                                    e
2199                                );
2200
2201                                // Try to read from original file path if it's a local file
2202                                if document.url.starts_with('/')
2203                                    || document.url.starts_with("docs/")
2204                                {
2205                                    match tokio::fs::read_to_string(&document.url).await {
2206                                        Ok(content) => {
2207                                            log::info!(
2208                                                "✅ Loaded content for '{}' from file: {}",
2209                                                document.id,
2210                                                document.url
2211                                            );
2212                                            document.body = content.clone();
2213
2214                                            // Create and save full document
2215                                            let full_doc = Document {
2216                                                id: document.id.clone(),
2217                                                title: document.title.clone(),
2218                                                body: content,
2219                                                url: document.url.clone(),
2220                                                description: document.description.clone(),
2221                                                summarization: document.summarization.clone(),
2222                                                stub: None,
2223                                                tags: document.tags.clone(),
2224                                                rank: document.rank,
2225                                                source_haystack: document.source_haystack.clone(),
2226                                                doc_type: terraphim_types::DocumentType::KgEntry,
2227                                                synonyms: None,
2228                                                route: None,
2229                                                priority: None,
2230                                                quality_score: None,
2231                                            };
2232
2233                                            // Save to persistence for future use
2234                                            if let Err(e) = full_doc.save().await {
2235                                                log::warn!(
2236                                                    "Failed to save document '{}' to persistence: {}",
2237                                                    document.id,
2238                                                    e
2239                                                );
2240                                            }
2241
2242                                            // Re-index into rolegraph
2243                                            if let Some(rolegraph_sync) =
2244                                                self.config_state.roles.get(&role.name)
2245                                            {
2246                                                let mut rolegraph = rolegraph_sync.lock().await;
2247                                                rolegraph.insert_document(&document.id, full_doc);
2248                                                need_reindexing = true;
2249                                                log::debug!(
2250                                                    "Re-indexed document '{}' into rolegraph from file",
2251                                                    document.id
2252                                                );
2253                                            }
2254                                        }
2255                                        Err(file_e) => {
2256                                            log::warn!(
2257                                                "Failed to read file '{}' for document '{}': {}",
2258                                                document.url,
2259                                                document.id,
2260                                                file_e
2261                                            );
2262                                        }
2263                                    }
2264                                }
2265                            }
2266                        }
2267                    }
2268                    documents_with_content.push(document);
2269                }
2270
2271                documents = documents_with_content;
2272
2273                if need_reindexing {
2274                    log::info!("🔄 Re-running TerraphimGraph search after indexing new documents");
2275
2276                    // Re-run the rolegraph search to get updated rankings
2277                    let updated_scored_docs: Vec<IndexedDocument> = self
2278                        .config_state
2279                        .search_indexed_documents(search_query, &role)
2280                        .await;
2281
2282                    if !updated_scored_docs.is_empty() {
2283                        log::debug!(
2284                            "✅ Updated rolegraph search found {} documents",
2285                            updated_scored_docs.len()
2286                        );
2287                        // Update documents with new ranking from rolegraph
2288                        let updated_documents = index.get_documents(updated_scored_docs);
2289                        if !updated_documents.is_empty() {
2290                            documents = updated_documents;
2291                        }
2292                    }
2293                }
2294
2295                if documents.is_empty() && !all_haystack_docs.is_empty() {
2296                    log::info!(
2297                        "TerraphimGraph returned no results for role '{}'; falling back to lexical haystack ranking",
2298                        role.name
2299                    );
2300                    documents = if search_query.is_multi_term_query() {
2301                        let filtered_docs = self
2302                            .apply_logical_operators_to_documents(
2303                                search_query,
2304                                all_haystack_docs.clone(),
2305                            )
2306                            .await?;
2307                        let combined_query_string = search_query
2308                            .get_all_terms()
2309                            .iter()
2310                            .map(|t| t.as_str())
2311                            .collect::<Vec<_>>()
2312                            .join(" ");
2313                        let query = Query::new(&combined_query_string);
2314                        score::sort_documents(&query, filtered_docs)
2315                    } else {
2316                        let query = Query::new(&search_query.search_term.to_string());
2317                        score::sort_documents(&query, all_haystack_docs.clone())
2318                    };
2319                }
2320
2321                // Apply TF-IDF scoring to enhance Terraphim Graph ranking
2322                if !documents.is_empty() {
2323                    log::debug!(
2324                        "Applying TF-IDF scoring to {} documents for enhanced ranking",
2325                        documents.len()
2326                    );
2327
2328                    use crate::score::bm25_additional::TFIDFScorer;
2329                    let mut tfidf_scorer = TFIDFScorer::new();
2330                    tfidf_scorer.initialize(&documents);
2331
2332                    // Re-score documents using TF-IDF
2333                    let query_text = &search_query.search_term.to_string();
2334                    for document in &mut documents {
2335                        let tfidf_score = tfidf_scorer.score(query_text, document);
2336                        // Combine TF-IDF score with existing rank using a weighted approach
2337                        if let Some(rank) = document.rank {
2338                            document.rank = Some(rank + (tfidf_score * 0.3) as u64);
2339                        // 30% weight for TF-IDF
2340                        } else {
2341                            document.rank = Some((tfidf_score * 10.0) as u64); // Scale TF-IDF for ranking
2342                        }
2343                    }
2344
2345                    // Re-sort documents by the new combined rank
2346                    documents.sort_by_key(|d| std::cmp::Reverse(d.rank.unwrap_or(0)));
2347
2348                    log::debug!("TF-IDF scoring applied successfully");
2349                }
2350
2351                // 🔄 Enhanced persistence layer integration for both local and Atomic Data documents
2352                for document in &mut documents {
2353                    if document.id.starts_with("http://") || document.id.starts_with("https://") {
2354                        // Atomic Data document: Check persistence first, then save for future queries
2355                        log::debug!(
2356                            "Processing Atomic Data document '{}' (URL: {})",
2357                            document.title,
2358                            document.id
2359                        );
2360
2361                        // Try to load from persistence first (for cached Atomic Data documents)
2362                        let mut placeholder = Document {
2363                            id: document.id.clone(),
2364                            ..Default::default()
2365                        };
2366                        match placeholder.load().await {
2367                            Ok(persisted_doc) => {
2368                                // Found in persistence - use cached version
2369                                log::debug!(
2370                                    "Found cached Atomic Data document '{}' in persistence",
2371                                    document.title
2372                                );
2373                                if let Some(better_description) = persisted_doc.description {
2374                                    document.description = Some(better_description);
2375                                }
2376                                // Update body if the persisted version has better content
2377                                // But DO NOT overwrite if this role uses KG preprocessing (terraphim_it)
2378                                // because we need to preserve the processed content with KG links
2379                                if !persisted_doc.body.is_empty() && !role.terraphim_it {
2380                                    log::debug!(
2381                                        "Updated body from persistence for Atomic document '{}' (role: '{}', terraphim_it: {})",
2382                                        document.title,
2383                                        role.name,
2384                                        role.terraphim_it
2385                                    );
2386                                    document.body = persisted_doc.body;
2387                                } else if role.terraphim_it {
2388                                    log::debug!(
2389                                        "Keeping search result body for Atomic document '{}' because role '{}' uses KG preprocessing (terraphim_it=true)",
2390                                        document.title,
2391                                        role.name
2392                                    );
2393                                }
2394                            }
2395                            Err(_) => {
2396                                // Not in persistence - save this Atomic Data document for future queries
2397                                log::debug!(
2398                                    "Caching Atomic Data document '{}' to persistence for future queries",
2399                                    document.title
2400                                );
2401
2402                                // Save in background to avoid blocking the response
2403                                let doc_to_save = document.clone();
2404                                tokio::spawn(async move {
2405                                    if let Err(e) = doc_to_save.save().await {
2406                                        log::warn!(
2407                                            "Failed to cache Atomic Data document '{}': {}",
2408                                            doc_to_save.title,
2409                                            e
2410                                        );
2411                                    } else {
2412                                        log::debug!(
2413                                            "Successfully cached Atomic Data document '{}'",
2414                                            doc_to_save.title
2415                                        );
2416                                    }
2417                                });
2418                            }
2419                        }
2420                    } else {
2421                        // Local document: Try direct persistence lookup first
2422                        let mut placeholder = Document {
2423                            id: document.id.clone(),
2424                            ..Default::default()
2425                        };
2426                        if let Ok(persisted_doc) = placeholder.load().await {
2427                            if let Some(better_description) = persisted_doc.description {
2428                                log::debug!(
2429                                    "Replaced ripgrep description for '{}' with persistence description",
2430                                    document.title
2431                                );
2432                                document.description = Some(better_description);
2433                            }
2434                        } else {
2435                            // Try normalized ID based on document title (filename)
2436                            // For KG files, the title might be "haystack" but persistence ID is "haystackmd"
2437                            let normalized_id = normalize_filename_to_id(&document.title);
2438
2439                            let mut normalized_placeholder = Document {
2440                                id: normalized_id.clone(),
2441                                ..Default::default()
2442                            };
2443                            if let Ok(persisted_doc) = normalized_placeholder.load().await {
2444                                if let Some(better_description) = persisted_doc.description {
2445                                    log::debug!(
2446                                        "Replaced ripgrep description for '{}' with persistence description (normalized from title: {})",
2447                                        document.title,
2448                                        normalized_id
2449                                    );
2450                                    document.description = Some(better_description);
2451                                }
2452                            } else {
2453                                // Try with "md" suffix for KG files (title "haystack" -> ID "haystackmd")
2454                                let normalized_id_with_md = format!("{}md", normalized_id);
2455                                let mut md_placeholder = Document {
2456                                    id: normalized_id_with_md.clone(),
2457                                    ..Default::default()
2458                                };
2459                                if let Ok(persisted_doc) = md_placeholder.load().await {
2460                                    if let Some(better_description) = persisted_doc.description {
2461                                        log::debug!(
2462                                            "Replaced ripgrep description for '{}' with persistence description (normalized with md: {})",
2463                                            document.title,
2464                                            normalized_id_with_md
2465                                        );
2466                                        document.description = Some(better_description);
2467                                    }
2468                                } else {
2469                                    log::debug!(
2470                                        "No persistence document found for '{}' (tried ID: '{}', normalized: '{}', with md: '{}')",
2471                                        document.title,
2472                                        document.id,
2473                                        normalized_id,
2474                                        normalized_id_with_md
2475                                    );
2476                                }
2477                            }
2478                        }
2479                    }
2480                }
2481
2482                // Apply OpenRouter AI summarization if enabled for this role
2483                #[cfg(feature = "openrouter")]
2484                if role.has_llm_config() {
2485                    log::debug!(
2486                        "Applying OpenRouter AI summarization to {} search results for role '{}'",
2487                        documents.len(),
2488                        role.name
2489                    );
2490                    documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2491                } else {
2492                    // Always apply LLM AI summarization if LLM client is available
2493                    log::debug!(
2494                        "Applying LLM AI summarization to {} search results for role '{}'",
2495                        documents.len(),
2496                        role.name
2497                    );
2498                    documents = self.enhance_descriptions_with_ai(documents, &role).await?;
2499                }
2500
2501                // Apply KG preprocessing if enabled for this role (but only once, not in individual document loads)
2502                if role.terraphim_it {
2503                    log::debug!(
2504                        "Applying KG preprocessing to {} search results for role '{}'",
2505                        documents.len(),
2506                        role.name
2507                    );
2508                    let mut processed_docs = Vec::new();
2509                    for document in documents {
2510                        let processed_doc =
2511                            self.preprocess_document_content(document, &role).await?;
2512                        processed_docs.push(processed_doc);
2513                    }
2514                    Ok(processed_docs)
2515                } else {
2516                    Ok(documents)
2517                }
2518            }
2519        };
2520        let docs = docs_result?;
2521        Ok(Self::apply_min_quality_filter(docs, min_quality))
2522    }
2523
2524    /// Check if a document ID appears to be hash-based (16 hex characters)
2525    fn is_hash_based_id(id: &str) -> bool {
2526        id.len() == 16 && id.chars().all(|c| c.is_ascii_hexdigit())
2527    }
2528
2529    /// Find documents that contain a given knowledge graph term
2530    ///
2531    /// This method searches for documents that were the source of a knowledge graph term.
2532    /// For example, given "haystack", it will find documents like "haystack.md" that contain
2533    /// this term or its synonyms ("datasource", "service", "agent").
2534    ///
2535    /// For KG protocol resolution, this method also directly looks for KG definition documents
2536    /// when the term appears to be a KG concept (like "terraphim-graph" -> "./docs/src/kg/terraphim-graph.md").
2537    ///
2538    /// Returns a vector of Documents that contain the term, with KG preprocessing applied if enabled for the role.
2539    pub async fn find_documents_for_kg_term(
2540        &mut self,
2541        role_name: &RoleName,
2542        term: &str,
2543    ) -> Result<Vec<Document>> {
2544        log::debug!(
2545            "Finding documents for KG term '{}' in role '{}'",
2546            term,
2547            role_name
2548        );
2549
2550        // Ensure the thesaurus is loaded for this role
2551        let thesaurus = self.ensure_thesaurus_loaded(role_name).await?;
2552
2553        // Get the role configuration to check if KG preprocessing should be applied
2554        let role = self.config_state.get_role(role_name).await.ok_or_else(|| {
2555            ServiceError::Config(format!("Role '{}' not found in config", role_name))
2556        })?;
2557
2558        let mut documents = Vec::new();
2559
2560        // ENHANCEMENT: First, check if this is a direct KG definition document request
2561        // This handles KG protocol resolution like kg:terraphim-graph -> ./docs/src/kg/terraphim-graph.md
2562        // Also handles synonyms like kg:graph -> terraphim-graph -> ./docs/src/kg/terraphim-graph.md
2563        if let Some(kg_config) = &role.kg {
2564            log::debug!("Found KG config for role");
2565            if let Some(kg_local) = &kg_config.knowledge_graph_local {
2566                let mut potential_concepts = vec![term.to_string()];
2567
2568                // Use the loaded thesaurus to resolve synonyms to root concepts
2569                log::debug!("Checking thesaurus for term '{}'", term);
2570
2571                // Create normalized term to look up in thesaurus
2572                let normalized_search_term =
2573                    terraphim_types::NormalizedTermValue::new(term.to_string());
2574
2575                // Look up the term in the thesaurus - this will find the root concept if term is a synonym
2576                if let Some(root_concept) = thesaurus.get(&normalized_search_term) {
2577                    log::debug!("Found root concept for '{}': {:?}", term, root_concept);
2578
2579                    // The root concept's value contains the canonical concept name
2580                    let root_concept_name = root_concept.value.as_str();
2581
2582                    // If we have a URL, extract concept name from it, otherwise use the concept value
2583                    let concept_name = if let Some(url) = &root_concept.url {
2584                        url.split('/')
2585                            .next_back()
2586                            .and_then(|s| s.strip_suffix(".md"))
2587                            .unwrap_or(root_concept_name)
2588                    } else {
2589                        root_concept_name
2590                    };
2591
2592                    if !potential_concepts.contains(&concept_name.to_string()) {
2593                        potential_concepts.push(concept_name.to_string());
2594                        log::debug!(
2595                            "Added concept from thesaurus: {} (root: {})",
2596                            concept_name,
2597                            root_concept_name
2598                        );
2599                    }
2600                } else {
2601                    log::debug!("No direct mapping found for '{}' in thesaurus", term);
2602                }
2603
2604                log::debug!(
2605                    "Trying {} potential concepts: {:?}",
2606                    potential_concepts.len(),
2607                    potential_concepts
2608                );
2609
2610                // Try to find KG definition documents for all potential concepts
2611                for concept in potential_concepts {
2612                    let potential_kg_file = kg_local.path.join(format!("{}.md", concept));
2613                    log::debug!("Looking for KG definition file: {:?}", potential_kg_file);
2614
2615                    if potential_kg_file.exists() {
2616                        log::info!("Found KG definition file: {:?}", potential_kg_file);
2617
2618                        // Check if we already have this document to avoid duplicates
2619                        let file_path = potential_kg_file.to_string_lossy().to_string();
2620                        if documents.iter().any(|d: &Document| d.url == file_path) {
2621                            log::debug!("Skipping duplicate KG document: {}", file_path);
2622                            continue;
2623                        }
2624
2625                        // Load the KG definition document directly from filesystem
2626                        // Don't use Document::load() as it relies on persistence layer
2627                        match std::fs::read_to_string(&potential_kg_file) {
2628                            Ok(content) => {
2629                                let mut kg_doc =
2630                                    Document::new(potential_kg_file.to_string_lossy().to_string());
2631                                kg_doc.url = potential_kg_file.to_string_lossy().to_string();
2632                                kg_doc.body = content.clone();
2633
2634                                // Extract title from markdown content (first # line)
2635                                let title = content
2636                                    .lines()
2637                                    .find(|line| line.starts_with("# "))
2638                                    .map(|line| line.trim_start_matches("# ").trim())
2639                                    .unwrap_or(&concept)
2640                                    .to_string();
2641                                kg_doc.title = title;
2642
2643                                log::debug!(
2644                                    "Successfully loaded KG definition document: {}",
2645                                    kg_doc.title
2646                                );
2647                                documents.push(kg_doc);
2648
2649                                // Found the definition document, no need to check other concepts
2650                                break;
2651                            }
2652                            Err(e) => {
2653                                log::warn!(
2654                                    "Failed to read KG definition file '{}': {}",
2655                                    potential_kg_file.display(),
2656                                    e
2657                                );
2658                            }
2659                        }
2660                    } else {
2661                        log::debug!("KG definition file not found: {:?}", potential_kg_file);
2662                    }
2663                }
2664            } else {
2665                log::debug!("No KG local config found");
2666            }
2667        } else {
2668            log::debug!("No KG config found for role");
2669        }
2670
2671        // Also search through the rolegraph for any documents that contain this term
2672        let rolegraph_sync = self
2673            .config_state
2674            .roles
2675            .get(role_name)
2676            .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))?;
2677
2678        let rolegraph = rolegraph_sync.lock().await;
2679        let document_ids = rolegraph.find_document_ids_for_term(term);
2680        drop(rolegraph); // Release the lock early
2681
2682        log::debug!(
2683            "Found {} document IDs from rolegraph for term '{}'",
2684            document_ids.len(),
2685            term
2686        );
2687
2688        // Load documents found in the rolegraph (if any)
2689        for doc_id in &document_ids {
2690            // Skip if we already have this document from the KG definition lookup
2691            if documents
2692                .iter()
2693                .any(|d| d.id == *doc_id || d.url == *doc_id)
2694            {
2695                log::debug!("Skipping duplicate document from rolegraph: {}", doc_id);
2696                continue;
2697            }
2698
2699            // Load the actual documents using the persistence layer
2700            // Handle both local and Atomic Data documents properly
2701            if doc_id.starts_with("http://") || doc_id.starts_with("https://") {
2702                // Atomic Data document: Try to load from persistence first
2703                log::debug!("Loading Atomic Data document '{}' from persistence", doc_id);
2704                let mut placeholder = Document {
2705                    id: doc_id.clone(),
2706                    ..Default::default()
2707                };
2708                match placeholder.load().await {
2709                    Ok(loaded_doc) => {
2710                        log::debug!(
2711                            "Found cached Atomic Data document '{}' in persistence",
2712                            doc_id
2713                        );
2714                        documents.push(loaded_doc);
2715                    }
2716                    Err(_) => {
2717                        log::warn!(
2718                            "Atomic Data document '{}' not found in persistence - this may indicate the document hasn't been cached yet",
2719                            doc_id
2720                        );
2721                        // Skip this document for now - it will be cached when accessed through search
2722                        // In a production system, you might want to fetch it from the Atomic Server here
2723                    }
2724                }
2725            } else {
2726                // Local document: Use the standard persistence loading
2727                let mut doc = Document::new(doc_id.clone());
2728                match doc.load().await {
2729                    Ok(loaded_doc) => {
2730                        documents.push(loaded_doc);
2731                        log::trace!("Successfully loaded local document: {}", doc_id);
2732                    }
2733                    Err(e) => {
2734                        log::warn!("Failed to load local document '{}': {}", doc_id, e);
2735
2736                        // Check if this might be a hash-based ID from old ripgrep documents
2737                        if Self::is_hash_based_id(doc_id) {
2738                            log::debug!(
2739                                "Document ID '{}' appears to be hash-based (legacy document), skipping for now",
2740                                doc_id
2741                            );
2742                            log::info!(
2743                                "💡 Hash-based document IDs are deprecated. This document will be re-indexed with normalized IDs on next haystack search."
2744                            );
2745                            // Skip legacy hash-based documents - they will be re-indexed with proper normalized IDs
2746                            // when the haystack is searched again
2747                        }
2748
2749                        // Continue processing other documents even if this one fails
2750                    }
2751                }
2752            }
2753        }
2754
2755        // Apply KG preprocessing if enabled for this role
2756        if role.terraphim_it {
2757            log::info!(
2758                "🧠 Applying KG preprocessing to {} KG term documents for role '{}' (terraphim_it enabled)",
2759                documents.len(),
2760                role_name
2761            );
2762            let mut processed_documents = Vec::new();
2763            let mut total_kg_terms = 0;
2764            let mut docs_with_kg_links = 0;
2765
2766            for document in documents {
2767                let original_body_len = document.body.len();
2768                let processed_doc = self.preprocess_document_content(document, &role).await?;
2769
2770                // Count KG links added (rough estimate by body size increase)
2771                let new_body_len = processed_doc.body.len();
2772                if new_body_len > original_body_len {
2773                    docs_with_kg_links += 1;
2774                    let estimated_links = (new_body_len - original_body_len) / 17;
2775                    total_kg_terms += estimated_links;
2776                }
2777
2778                processed_documents.push(processed_doc);
2779            }
2780
2781            log::info!(
2782                "✅ KG preprocessing complete: {} documents processed, {} received KG links (~{} total links)",
2783                processed_documents.len(),
2784                docs_with_kg_links,
2785                total_kg_terms
2786            );
2787            documents = processed_documents;
2788        } else {
2789            log::info!(
2790                "🔍 terraphim_it disabled for role '{}', skipping KG preprocessing for {} documents",
2791                role_name,
2792                documents.len()
2793            );
2794        }
2795
2796        // Assign ranks based on order (same logic as regular search)
2797        // Higher rank for earlier results to maintain consistency
2798        let total_length = documents.len();
2799        for (idx, doc) in documents.iter_mut().enumerate() {
2800            let rank = (total_length - idx) as u64;
2801            doc.rank = Some(rank);
2802            log::trace!("Assigned rank {} to document '{}'", rank, doc.title);
2803        }
2804
2805        log::debug!(
2806            "Successfully loaded and processed {} documents for term '{}', ranks assigned from {} to 1",
2807            documents.len(),
2808            term,
2809            total_length
2810        );
2811        Ok(documents)
2812    }
2813
2814    /// Generate a summary for a document using OpenRouter
2815    ///
2816    /// This method takes a document and generates an AI-powered summary using the OpenRouter service.
2817    /// The summary is generated based on the document's content and can be customized with different
2818    /// models and length constraints.
2819    ///
2820    /// # Arguments
2821    ///
2822    /// * `document` - The document to summarize
2823    /// * `api_key` - The OpenRouter API key
2824    /// * `model` - The model to use for summarization (e.g., "openai/gpt-3.5-turbo")
2825    /// * `max_length` - Maximum length of the summary in characters
2826    ///
2827    /// # Returns
2828    ///
2829    /// Returns a `Result<String>` containing the generated summary or an error if summarization fails.
2830    #[cfg(feature = "openrouter")]
2831    pub async fn generate_document_summary(
2832        &self,
2833        document: &Document,
2834        api_key: &str,
2835        model: &str,
2836        max_length: usize,
2837    ) -> Result<String> {
2838        use crate::openrouter::OpenRouterService;
2839
2840        log::debug!(
2841            "Generating summary for document '{}' using model '{}'",
2842            document.id,
2843            model
2844        );
2845
2846        // Create the OpenRouter service
2847        let openrouter_service =
2848            OpenRouterService::new(api_key, model).map_err(ServiceError::OpenRouter)?;
2849
2850        // Use the document body for summarization
2851        let content = &document.body;
2852
2853        if content.trim().is_empty() {
2854            return Err(ServiceError::Config(
2855                "Document body is empty, cannot generate summary".to_string(),
2856            ));
2857        }
2858
2859        // Generate the summary
2860        let summary = openrouter_service
2861            .generate_summary(content, max_length)
2862            .await
2863            .map_err(ServiceError::OpenRouter)?;
2864
2865        log::info!(
2866            "Generated {}-character summary for document '{}' using model '{}'",
2867            summary.len(),
2868            document.id,
2869            model
2870        );
2871
2872        Ok(summary)
2873    }
2874
2875    /// Generate a summary for a document using OpenRouter (stub when feature is disabled)
2876    #[cfg(not(feature = "openrouter"))]
2877    pub async fn generate_document_summary(
2878        &self,
2879        _document: &Document,
2880        _api_key: &str,
2881        _model: &str,
2882        _max_length: usize,
2883    ) -> Result<String> {
2884        Err(ServiceError::Config(
2885            "OpenRouter feature not enabled during compilation".to_string(),
2886        ))
2887    }
2888
2889    /// Fetch the current config
2890    pub async fn fetch_config(&self) -> terraphim_config::Config {
2891        let current_config = self.config_state.config.lock().await;
2892        current_config.clone()
2893    }
2894
2895    // Test helper methods
2896    #[cfg(test)]
2897    pub async fn get_role(&self, role_name: &RoleName) -> Result<Role> {
2898        let config = self.config_state.config.lock().await;
2899        config
2900            .roles
2901            .get(role_name)
2902            .cloned()
2903            .ok_or_else(|| ServiceError::Config(format!("Role '{}' not found", role_name)))
2904    }
2905
2906    /// Update the config
2907    ///
2908    /// Overwrites the config in the config state and returns the updated
2909    /// config.
2910    pub async fn update_config(
2911        &self,
2912        config: terraphim_config::Config,
2913    ) -> Result<terraphim_config::Config> {
2914        // Lock briefly to swap in the new config, then drop before save so
2915        // the disk write doesn't block other /config endpoints.
2916        {
2917            let mut current_config = self.config_state.config.lock().await;
2918            *current_config = config.clone();
2919        }
2920        config.save().await?;
2921        log::info!("Config updated");
2922        Ok(config)
2923    }
2924
2925    /// Update only the `selected_role` in the config without mutating the rest of the
2926    /// configuration. Returns the up-to-date `Config` object.
2927    pub async fn update_selected_role(
2928        &self,
2929        role_name: terraphim_types::RoleName,
2930    ) -> Result<terraphim_config::Config> {
2931        // Lock briefly: validate, mutate in-memory state, snapshot. Drop the
2932        // lock BEFORE the disk save -- holding the config mutex across an
2933        // async I/O write blocks every other endpoint that touches /config
2934        // (e.g. concurrent search, get_config) for the duration of the save.
2935        let snapshot = {
2936            let mut current_config = self.config_state.config.lock().await;
2937
2938            if !current_config.roles.contains_key(&role_name) {
2939                return Err(ServiceError::Config(format!(
2940                    "Role `{}` not found in config",
2941                    role_name
2942                )));
2943            }
2944
2945            current_config.selected_role = role_name.clone();
2946            current_config.clone()
2947        };
2948        // Persist asynchronously: in-memory update is the source of truth for
2949        // subsequent reads; disk save is best-effort and must not delay the
2950        // HTTP response. save_to_all() can take many seconds depending on the
2951        // configured persistence profiles (sled WAL flush, S3 PUT, etc.) and
2952        // should never block role selection.
2953        let snapshot_for_save = snapshot.clone();
2954        let role_for_log = role_name.clone();
2955        tokio::spawn(async move {
2956            if let Err(e) = snapshot_for_save.save().await {
2957                log::warn!(
2958                    "background persist of selected_role={} failed: {}",
2959                    role_for_log,
2960                    e
2961                );
2962            }
2963        });
2964        // Log role selection from the snapshot (no need to re-lock).
2965        if let Some(role) = snapshot.roles.get(&role_name) {
2966            if role.terraphim_it {
2967                log::info!(
2968                    "🎯 Selected role '{}' → terraphim_it: ENABLED (KG preprocessing will be applied)",
2969                    role_name
2970                );
2971            } else {
2972                log::info!("🎯 Selected role '{}' → terraphim_it: DISABLED", role_name);
2973            }
2974        }
2975
2976        Ok(snapshot)
2977    }
2978
2979    /// Highlight search terms in the given text content
2980    ///
2981    /// This method wraps matching search terms with HTML-style highlighting tags
2982    /// to make them visually distinct in the frontend.
2983    fn highlight_search_terms(content: &str, search_query: &SearchQuery) -> String {
2984        let mut highlighted_content = content.to_string();
2985
2986        // Get all terms from the search query
2987        let terms = search_query.get_all_terms();
2988
2989        // Sort terms by length (longest first) to avoid partial replacements
2990        let mut sorted_terms: Vec<&str> = terms.iter().map(|t| t.as_str()).collect();
2991        sorted_terms.sort_by_key(|term| std::cmp::Reverse(term.len()));
2992
2993        for term in sorted_terms {
2994            if term.trim().is_empty() {
2995                continue;
2996            }
2997
2998            // Create case-insensitive regex for the term
2999            // Escape special regex characters in the search term
3000            let escaped_term = regex::escape(term);
3001
3002            if let Ok(regex) = regex::RegexBuilder::new(&escaped_term)
3003                .case_insensitive(true)
3004                .build()
3005            {
3006                // Replace all matches with highlighted version
3007                // Use a unique delimiter to avoid conflicts with existing HTML
3008                let highlight_open = "<mark class=\"search-highlight\">";
3009                let highlight_close = "</mark>";
3010
3011                highlighted_content = regex
3012                    .replace_all(
3013                        &highlighted_content,
3014                        format!("{}{}{}", highlight_open, "$0", highlight_close),
3015                    )
3016                    .to_string();
3017            }
3018        }
3019
3020        highlighted_content
3021    }
3022}
3023
/// Extract the first occurrence of `marker` in `s` together with up to
/// `before` characters preceding it and up to `after` characters following it.
///
/// Counts are in Unicode scalar values (`char`s), not bytes, so the window
/// never splits a multi-byte character and is sized the same for ASCII and
/// non-ASCII text. The window is clamped to the bounds of `s`.
///
/// Returns an empty string when `marker` does not occur in `s`.
pub(crate) fn snippet_around(s: &str, marker: &str, before: usize, after: usize) -> String {
    let Some(marker_start) = s.find(marker) else {
        return String::new();
    };
    let marker_end = marker_start + marker.len();

    // Walk back `before` characters from the marker; the last one visited is
    // the first character of the window. No characters -> start at the marker.
    let window_start = s[..marker_start]
        .char_indices()
        .rev()
        .take(before)
        .last()
        .map_or(marker_start, |(offset, _)| offset);

    // The character at index `after` past the marker is the first one that is
    // excluded; if the tail is shorter, the window runs to the end of `s`.
    let window_end = s[marker_end..]
        .char_indices()
        .nth(after)
        .map_or(s.len(), |(offset, _)| marker_end + offset);

    s[window_start..window_end].to_string()
}
3043
3044#[cfg(test)]
3045mod tests {
3046    use super::*;
3047    use std::path::PathBuf;
3048    use terraphim_config::ConfigBuilder;
3049    use terraphim_types::NormalizedTermValue;
3050
3051    #[tokio::test]
3052    async fn test_get_config() {
3053        let mut config = ConfigBuilder::new()
3054            .build_default_desktop()
3055            .build()
3056            .unwrap();
3057        let config_state = ConfigState::new(&mut config).await.unwrap();
3058        let service = TerraphimService::new(config_state);
3059        let fetched_config = service.fetch_config().await;
3060        assert_eq!(fetched_config.id, terraphim_config::ConfigId::Desktop);
3061    }
3062
3063    #[tokio::test]
3064    async fn test_search_documents_selected_role() {
3065        // Check if KG directory exists before running test
3066        let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
3067        let kg_path = project_root.join("docs/src/kg");
3068        if !kg_path.exists() {
3069            println!("Skipping test: KG directory not found at {:?}", kg_path);
3070            return;
3071        }
3072
3073        let mut config = ConfigBuilder::new()
3074            .build_default_desktop()
3075            .build()
3076            .unwrap();
3077        let config_state = match ConfigState::new(&mut config).await {
3078            Ok(state) => state,
3079            Err(e) => {
3080                println!("Skipping test: Failed to create config state: {:?}", e);
3081                return;
3082            }
3083        };
3084        let mut service = TerraphimService::new(config_state);
3085        let search_term = NormalizedTermValue::new("terraphim".to_string());
3086        let documents = match service.search_documents_selected_role(&search_term).await {
3087            Ok(docs) => docs,
3088            Err(e) => {
3089                println!(
3090                    "Skipping test: Search failed (expected in some environments): {:?}",
3091                    e
3092                );
3093                return;
3094            }
3095        };
3096        assert!(documents.is_empty() || !documents.is_empty()); // Either empty or has results
3097    }
3098
3099    #[tokio::test]
3100    async fn test_ensure_thesaurus_loaded_terraphim_engineer() {
3101        // Create a fresh config with correct KG path for testing
3102        let project_root = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
3103        let kg_path = project_root.join("docs/src/kg");
3104
3105        // Skip test gracefully if KG directory doesn't exist
3106        if !kg_path.exists() {
3107            println!("⚠️ KG directory not found at {:?}, skipping test", kg_path);
3108            return;
3109        }
3110
3111        let mut config = ConfigBuilder::new()
3112            .build_default_desktop()
3113            .build()
3114            .unwrap();
3115
3116        // Update the Terraphim Engineer role to use project KG directory
3117        if let Some(terr_eng_role) = config.roles.get_mut(&"Terraphim Engineer".into()) {
3118            if let Some(kg) = &mut terr_eng_role.kg {
3119                if let Some(kg_local) = &mut kg.knowledge_graph_local {
3120                    kg_local.path = kg_path;
3121                }
3122            }
3123        }
3124
3125        let config_state = ConfigState::new(&mut config).await.unwrap();
3126        let mut service = TerraphimService::new(config_state);
3127
3128        let role_name = RoleName::new("Terraphim Engineer");
3129        let thesaurus_result = service.ensure_thesaurus_loaded(&role_name).await;
3130
3131        match thesaurus_result {
3132            Ok(thesaurus) => {
3133                println!(
3134                    "✅ Successfully loaded thesaurus with {} entries",
3135                    thesaurus.len()
3136                );
3137                // Verify thesaurus contains expected terms
3138                assert!(!thesaurus.is_empty(), "Thesaurus should not be empty");
3139
3140                // Check for expected terms from docs/src/kg using &thesaurus for iteration
3141                let has_terraphim = (&thesaurus)
3142                    .into_iter()
3143                    .any(|(term, _)| term.as_str().to_lowercase().contains("terraphim"));
3144                let has_graph = (&thesaurus)
3145                    .into_iter()
3146                    .any(|(term, _)| term.as_str().to_lowercase().contains("graph"));
3147
3148                println!("   Contains 'terraphim': {}", has_terraphim);
3149                println!("   Contains 'graph': {}", has_graph);
3150
3151                // At least one of these should be present
3152                assert!(
3153                    has_terraphim || has_graph,
3154                    "Thesaurus should contain expected terms"
3155                );
3156            }
3157            Err(e) => {
3158                println!("❌ Failed to load thesaurus: {:?}", e);
3159                // This might fail if the local KG files don't exist, which is expected in some test environments
3160                // We'll just log the error but not fail the test
3161            }
3162        }
3163    }
3164
3165    #[tokio::test]
3166    #[ignore = "Requires local KG fixtures at ~/.terraphim/kg"]
3167    async fn test_config_building_with_local_kg() {
3168        // Test that config building works correctly with local KG files
3169        let mut config = ConfigBuilder::new()
3170            .build_default_desktop()
3171            .build()
3172            .unwrap();
3173        let config_state_result = ConfigState::new(&mut config).await;
3174
3175        match config_state_result {
3176            Ok(config_state) => {
3177                println!("✅ Successfully built config state");
3178                // Verify that roles were created
3179                assert!(
3180                    !config_state.roles.is_empty(),
3181                    "Config state should have roles"
3182                );
3183
3184                // Check if Terraphim Engineer role was created
3185                let terraphim_engineer_role = RoleName::new("Terraphim Engineer");
3186                let has_terraphim_engineer =
3187                    config_state.roles.contains_key(&terraphim_engineer_role);
3188                println!("   Has Terraphim Engineer role: {}", has_terraphim_engineer);
3189
3190                // The role should exist even if thesaurus building failed
3191                assert!(
3192                    has_terraphim_engineer,
3193                    "Terraphim Engineer role should exist"
3194                );
3195            }
3196            Err(e) => {
3197                println!("❌ Failed to build config state: {:?}", e);
3198                // This might fail if the local KG files don't exist, which is expected in some test environments
3199                // We'll just log the error but not fail the test
3200            }
3201        }
3202    }
3203
3204    #[tokio::test]
3205    async fn test_atomic_data_persistence_skip() {
3206        use ahash::AHashMap;
3207        use terraphim_config::{Config, Haystack, Role, ServiceType};
3208        use terraphim_persistence::DeviceStorage;
3209        use terraphim_types::{NormalizedTermValue, RoleName, SearchQuery};
3210
3211        // Initialize memory-only persistence for testing
3212        DeviceStorage::init_memory_only().await.unwrap();
3213
3214        // Create a test config with a role
3215        let mut config = Config::default();
3216        let role_name = RoleName::new("test_role");
3217        let role = Role {
3218            shortname: None,
3219            name: "test_role".into(),
3220            haystacks: vec![Haystack {
3221                location: "test".to_string(),
3222                service: ServiceType::Ripgrep,
3223                read_only: false,
3224                atomic_server_secret: None,
3225                extra_parameters: std::collections::HashMap::new(),
3226                fetch_content: false,
3227            }],
3228            kg: None,
3229            terraphim_it: false,
3230            theme: "default".to_string(),
3231            relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3232            llm_enabled: false,
3233            llm_api_key: None,
3234            llm_model: None,
3235            llm_auto_summarize: false,
3236            llm_chat_enabled: false,
3237            llm_chat_system_prompt: None,
3238            llm_chat_model: None,
3239            llm_context_window: None,
3240            extra: AHashMap::new(),
3241            llm_router_enabled: false,
3242            llm_router_config: None,
3243        };
3244        config.roles.insert(role_name.clone(), role);
3245
3246        let config_state = ConfigState::new(&mut config).await.unwrap();
3247        let mut service = TerraphimService::new(config_state);
3248
3249        // Create a test search query
3250        let search_query = SearchQuery {
3251            search_term: NormalizedTermValue::new("test".to_string()),
3252            search_terms: None,
3253            operator: None,
3254            limit: Some(10),
3255            skip: None,
3256            role: Some(role_name),
3257            layer: Layer::default(),
3258            include_pinned: false,
3259            min_quality: None,
3260        };
3261
3262        // Test that Atomic Data URLs are skipped during persistence lookup
3263        // This test verifies that the debug message is logged instead of trying to load from persistence
3264        let result = service.search(&search_query).await;
3265
3266        // The search should complete without errors, even though no documents are found
3267        // The important thing is that Atomic Data URLs don't cause persistence lookup errors
3268        assert!(result.is_ok(), "Search should complete without errors");
3269    }
3270
3271    #[tokio::test]
3272    async fn test_atomic_data_caching() {
3273        use ahash::AHashMap;
3274        use terraphim_config::{Config, Haystack, Role, ServiceType};
3275        use terraphim_persistence::DeviceStorage;
3276        use terraphim_types::{Document, NormalizedTermValue, RoleName, SearchQuery};
3277
3278        // Initialize memory-only persistence for testing
3279        DeviceStorage::init_memory_only().await.unwrap();
3280
3281        // Create a test config with a role
3282        let mut config = Config::default();
3283        let role_name = RoleName::new("test_role");
3284        let role = Role {
3285            shortname: None,
3286            name: "test_role".into(),
3287            haystacks: vec![Haystack {
3288                location: "test".to_string(),
3289                service: ServiceType::Ripgrep,
3290                read_only: false,
3291                atomic_server_secret: None,
3292                extra_parameters: std::collections::HashMap::new(),
3293                fetch_content: false,
3294            }],
3295            kg: None,
3296            terraphim_it: false,
3297            theme: "default".to_string(),
3298            relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3299            llm_enabled: false,
3300            llm_api_key: None,
3301            llm_model: None,
3302            llm_auto_summarize: false,
3303            llm_chat_enabled: false,
3304            llm_chat_system_prompt: None,
3305            llm_chat_model: None,
3306            llm_context_window: None,
3307            extra: AHashMap::new(),
3308            llm_router_enabled: false,
3309            llm_router_config: None,
3310        };
3311        config.roles.insert(role_name.clone(), role);
3312
3313        let config_state = ConfigState::new(&mut config).await.unwrap();
3314        let mut service = TerraphimService::new(config_state);
3315
3316        // Create a mock Atomic Data document
3317        let atomic_doc = Document {
3318            id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3319            url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3320            title: "Requested Loan Amount ($)".to_string(),
3321            body: "Form field for Requested Loan Amount ($)".to_string(),
3322            description: Some("Form field for Requested Loan Amount ($)".to_string()),
3323            summarization: None,
3324            stub: None,
3325            tags: None,
3326            rank: None,
3327            source_haystack: None,
3328            doc_type: terraphim_types::DocumentType::KgEntry,
3329            synonyms: None,
3330            route: None,
3331            priority: None,
3332            quality_score: None,
3333        };
3334
3335        // Test 1: Save Atomic Data document to persistence
3336        log::info!("Testing Atomic Data document caching...");
3337        match atomic_doc.save().await {
3338            Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3339            Err(e) => {
3340                log::error!("❌ Failed to save Atomic Data document: {}", e);
3341                panic!("Atomic Data document save failed");
3342            }
3343        }
3344
3345        // Test 2: Verify the document can be loaded from persistence
3346        let mut placeholder = Document {
3347            id: atomic_doc.id.clone(),
3348            ..Default::default()
3349        };
3350        match placeholder.load().await {
3351            Ok(loaded_doc) => {
3352                log::info!("✅ Successfully loaded Atomic Data document from persistence");
3353                assert_eq!(loaded_doc.title, atomic_doc.title);
3354                assert_eq!(loaded_doc.body, atomic_doc.body);
3355                assert_eq!(loaded_doc.description, atomic_doc.description);
3356            }
3357            Err(e) => {
3358                log::error!(
3359                    "❌ Failed to load Atomic Data document from persistence: {}",
3360                    e
3361                );
3362                panic!("Atomic Data document load failed");
3363            }
3364        }
3365
3366        // Test 3: Verify the search logic would find the cached document
3367        let search_query = SearchQuery {
3368            search_term: NormalizedTermValue::new("test".to_string()),
3369            search_terms: None,
3370            operator: None,
3371            limit: Some(10),
3372            skip: None,
3373            role: Some(role_name),
3374            layer: Layer::default(),
3375            include_pinned: false,
3376            min_quality: None,
3377        };
3378
3379        let result = service.search(&search_query).await;
3380        assert!(result.is_ok(), "Search should complete without errors");
3381
3382        log::info!("✅ All Atomic Data caching tests passed!");
3383    }
3384
3385    #[tokio::test]
3386    #[ignore = "Requires local KG fixtures at 'test' directory"]
3387    async fn test_kg_term_search_with_atomic_data() {
3388        use ahash::AHashMap;
3389        use std::path::PathBuf;
3390        use terraphim_config::{
3391            Config, Haystack, KnowledgeGraph, KnowledgeGraphLocal, Role, ServiceType,
3392        };
3393        use terraphim_persistence::DeviceStorage;
3394        use terraphim_types::{Document, KnowledgeGraphInputType, RoleName};
3395
3396        // Initialize memory-only persistence for testing
3397        DeviceStorage::init_memory_only().await.unwrap();
3398
3399        // Create a test config with a role that has KG enabled
3400        let mut config = Config::default();
3401        let role_name = RoleName::new("test_kg_role");
3402        let role = Role {
3403            shortname: None,
3404            name: "test_kg_role".into(),
3405            haystacks: vec![Haystack {
3406                location: "test".to_string(),
3407                service: ServiceType::Ripgrep,
3408                read_only: false,
3409                atomic_server_secret: None,
3410                extra_parameters: std::collections::HashMap::new(),
3411                fetch_content: false,
3412            }],
3413            kg: Some(KnowledgeGraph {
3414                automata_path: None,
3415                knowledge_graph_local: Some(KnowledgeGraphLocal {
3416                    input_type: KnowledgeGraphInputType::Markdown,
3417                    path: PathBuf::from("test"),
3418                }),
3419                public: true,
3420                publish: true,
3421            }),
3422            terraphim_it: true,
3423            theme: "default".to_string(),
3424            relevance_function: terraphim_types::RelevanceFunction::TerraphimGraph,
3425            llm_enabled: false,
3426            llm_api_key: None,
3427            llm_model: None,
3428            llm_auto_summarize: false,
3429            llm_chat_enabled: false,
3430            llm_chat_system_prompt: None,
3431            llm_chat_model: None,
3432            llm_context_window: None,
3433            extra: AHashMap::new(),
3434            llm_router_enabled: false,
3435            llm_router_config: None,
3436        };
3437        config.roles.insert(role_name.clone(), role);
3438
3439        let config_state = ConfigState::new(&mut config).await.unwrap();
3440        let mut service = TerraphimService::new(config_state);
3441
3442        // Create and cache an Atomic Data document
3443        let atomic_doc = Document {
3444            id: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3445            url: "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount".to_string(),
3446            title: "Requested Loan Amount ($)".to_string(),
3447            body: "Form field for Requested Loan Amount ($)".to_string(),
3448            description: Some("Form field for Requested Loan Amount ($)".to_string()),
3449            summarization: None,
3450            stub: None,
3451            tags: None,
3452            rank: None,
3453            source_haystack: None,
3454            doc_type: terraphim_types::DocumentType::KgEntry,
3455            synonyms: None,
3456            route: None,
3457            priority: None,
3458            quality_score: None,
3459        };
3460
3461        // Save the Atomic Data document to persistence
3462        log::info!("Testing KG term search with Atomic Data documents...");
3463        match atomic_doc.save().await {
3464            Ok(_) => log::info!("✅ Successfully saved Atomic Data document to persistence"),
3465            Err(e) => {
3466                log::error!("❌ Failed to save Atomic Data document: {}", e);
3467                panic!("Atomic Data document save failed");
3468            }
3469        }
3470
3471        // Test that find_documents_for_kg_term can handle Atomic Data document IDs
3472        // Note: In a real scenario, the rolegraph would contain the Atomic Data document ID
3473        // For this test, we're verifying that the function can handle Atomic Data URLs properly
3474        let result = service.find_documents_for_kg_term(&role_name, "test").await;
3475
3476        // The function should complete without errors, even if no documents are found
3477        // The important thing is that it doesn't crash when encountering Atomic Data URLs
3478        assert!(
3479            result.is_ok(),
3480            "find_documents_for_kg_term should complete without errors"
3481        );
3482
3483        let documents = result.unwrap();
3484        log::info!(
3485            "✅ KG term search completed successfully, found {} documents",
3486            documents.len()
3487        );
3488
3489        // Verify that the function can handle Atomic Data document loading
3490        // by manually testing the document loading logic
3491        let atomic_doc_id = "http://localhost:9883/borrower-portal/form-field/requestedLoanAmount";
3492        let mut placeholder = Document {
3493            id: atomic_doc_id.to_string(),
3494            ..Default::default()
3495        };
3496
3497        match placeholder.load().await {
3498            Ok(loaded_doc) => {
3499                log::info!(
3500                    "✅ Successfully loaded Atomic Data document from persistence in KG term search context"
3501                );
3502                assert_eq!(loaded_doc.title, atomic_doc.title);
3503                assert_eq!(loaded_doc.body, atomic_doc.body);
3504            }
3505            Err(e) => {
3506                log::error!(
3507                    "❌ Failed to load Atomic Data document in KG term search context: {}",
3508                    e
3509                );
3510                panic!("Atomic Data document load failed in KG term search context");
3511            }
3512        }
3513
3514        log::info!("✅ All KG term search with Atomic Data tests passed!");
3515    }
3516
3517    #[tokio::test]
3518    async fn test_kg_term_search_rank_assignment() -> Result<()> {
3519        use ahash::AHashMap;
3520        use terraphim_config::{Config, Haystack, Role, ServiceType};
3521        use terraphim_persistence::DeviceStorage;
3522        use terraphim_types::{Document, RoleName};
3523
3524        // Initialize memory-only persistence for testing
3525        DeviceStorage::init_memory_only().await.unwrap();
3526
3527        // Create a test config with a role that has KG capabilities
3528        let mut config = Config::default();
3529        let role_name = RoleName::new("Test KG Role");
3530        let role = Role {
3531            shortname: Some("test-kg".to_string()),
3532            name: role_name.clone(),
3533            haystacks: vec![Haystack {
3534                location: "test".to_string(),
3535                service: ServiceType::Ripgrep,
3536                read_only: false,
3537                atomic_server_secret: None,
3538                extra_parameters: std::collections::HashMap::new(),
3539                fetch_content: false,
3540            }],
3541            kg: Some(terraphim_config::KnowledgeGraph {
3542                automata_path: Some(terraphim_automata::AutomataPath::local_example()),
3543                knowledge_graph_local: None,
3544                public: false,
3545                publish: false,
3546            }),
3547            terraphim_it: false,
3548            theme: "default".to_string(),
3549            relevance_function: terraphim_types::RelevanceFunction::TitleScorer,
3550            llm_enabled: false,
3551            llm_api_key: None,
3552            llm_model: None,
3553            llm_auto_summarize: false,
3554            llm_chat_enabled: false,
3555            llm_chat_system_prompt: None,
3556            llm_chat_model: None,
3557            llm_context_window: None,
3558            extra: AHashMap::new(),
3559            llm_router_enabled: false,
3560            llm_router_config: None,
3561        };
3562        config.roles.insert(role_name.clone(), role);
3563
3564        let config_state = ConfigState::new(&mut config).await.unwrap();
3565        let _service = TerraphimService::new(config_state);
3566
3567        // Create test documents and save them to persistence
3568        let test_documents = vec![
3569            Document {
3570                id: "test-doc-1".to_string(),
3571                title: "First Test Document".to_string(),
3572                body: "This is the first test document body".to_string(),
3573                url: "test://doc1".to_string(),
3574                description: Some("First document description".to_string()),
3575                summarization: None,
3576                stub: None,
3577                tags: Some(vec!["test".to_string(), "first".to_string()]),
3578                rank: None, // Should be assigned by the function
3579                source_haystack: None,
3580                doc_type: terraphim_types::DocumentType::KgEntry,
3581                synonyms: None,
3582                route: None,
3583                priority: None,
3584                quality_score: None,
3585            },
3586            Document {
3587                id: "test-doc-2".to_string(),
3588                title: "Second Test Document".to_string(),
3589                body: "This is the second test document body".to_string(),
3590                url: "test://doc2".to_string(),
3591                description: Some("Second document description".to_string()),
3592                summarization: None,
3593                stub: None,
3594                tags: Some(vec!["test".to_string(), "second".to_string()]),
3595                rank: None, // Should be assigned by the function
3596                source_haystack: None,
3597                doc_type: terraphim_types::DocumentType::KgEntry,
3598                synonyms: None,
3599                route: None,
3600                priority: None,
3601                quality_score: None,
3602            },
3603            Document {
3604                id: "test-doc-3".to_string(),
3605                title: "Third Test Document".to_string(),
3606                body: "This is the third test document body".to_string(),
3607                url: "test://doc3".to_string(),
3608                description: Some("Third document description".to_string()),
3609                summarization: None,
3610                stub: None,
3611                tags: Some(vec!["test".to_string(), "third".to_string()]),
3612                rank: None, // Should be assigned by the function
3613                source_haystack: None,
3614                doc_type: terraphim_types::DocumentType::KgEntry,
3615                synonyms: None,
3616                route: None,
3617                priority: None,
3618                quality_score: None,
3619            },
3620        ];
3621
3622        // Save test documents to persistence
3623        for doc in &test_documents {
3624            doc.save().await.expect("Failed to save test document");
3625        }
3626
3627        // The rolegraph will be created automatically by ensure_thesaurus_loaded
3628        // We don't need to manually create it for this test
3629
3630        // Test the rank assignment logic directly
3631        // This validates the core functionality we implemented in find_documents_for_kg_term
3632        let mut simulated_documents = test_documents.clone();
3633
3634        // Apply the same rank assignment logic as in find_documents_for_kg_term
3635        let total_length = simulated_documents.len();
3636        for (idx, doc) in simulated_documents.iter_mut().enumerate() {
3637            let rank = (total_length - idx) as u64;
3638            doc.rank = Some(rank);
3639        }
3640
3641        // Verify rank assignment
3642        assert_eq!(simulated_documents.len(), 3, "Should have 3 test documents");
3643
3644        // Check that all documents have ranks assigned
3645        for doc in &simulated_documents {
3646            assert!(
3647                doc.rank.is_some(),
3648                "Document '{}' should have a rank assigned",
3649                doc.title
3650            );
3651            assert!(
3652                doc.rank.unwrap() > 0,
3653                "Document '{}' should have a positive rank",
3654                doc.title
3655            );
3656        }
3657
3658        // Check that ranks are in descending order (first document has highest rank)
3659        assert_eq!(
3660            simulated_documents[0].rank,
3661            Some(3),
3662            "First document should have highest rank (3)"
3663        );
3664        assert_eq!(
3665            simulated_documents[1].rank,
3666            Some(2),
3667            "Second document should have rank 2"
3668        );
3669        assert_eq!(
3670            simulated_documents[2].rank,
3671            Some(1),
3672            "Third document should have rank 1"
3673        );
3674
3675        // Verify ranks are unique and properly ordered
3676        let mut ranks: Vec<u64> = simulated_documents
3677            .iter()
3678            .map(|doc| doc.rank.unwrap())
3679            .collect();
3680        ranks.sort_by_key(|r| std::cmp::Reverse(*r));
3681        assert_eq!(
3682            ranks,
3683            vec![3, 2, 1],
3684            "Ranks should be unique and in descending order"
3685        );
3686
3687        log::info!("✅ KG term search rank assignment test completed successfully!");
3688        Ok(())
3689    }
3690
3691    // Helper to build a Document with a given composite quality score.
3692    fn doc_with_quality(id: &str, knowledge: f64, logic: f64, structure: f64) -> Document {
3693        Document {
3694            id: id.to_string(),
3695            url: format!("https://example.com/{id}"),
3696            title: id.to_string(),
3697            body: String::new(),
3698            quality_score: Some(terraphim_types::QualityScore {
3699                knowledge: Some(knowledge),
3700                logic: Some(logic),
3701                structure: Some(structure),
3702                last_evaluated: None,
3703            }),
3704            ..Default::default()
3705        }
3706    }
3707
3708    fn doc_without_quality(id: &str) -> Document {
3709        Document {
3710            id: id.to_string(),
3711            url: format!("https://example.com/{id}"),
3712            title: id.to_string(),
3713            body: String::new(),
3714            quality_score: None,
3715            ..Default::default()
3716        }
3717    }
3718
3719    #[test]
3720    fn test_min_quality_none_returns_all_documents() {
3721        // When min_quality is None, all documents are returned unchanged.
3722        let docs = vec![
3723            doc_with_quality("a", 0.9, 0.9, 0.9),
3724            doc_with_quality("b", 0.1, 0.1, 0.1),
3725            doc_without_quality("c"),
3726        ];
3727        let result = TerraphimService::apply_min_quality_filter(docs, None);
3728        assert_eq!(result.len(), 3);
3729    }
3730
3731    #[test]
3732    fn test_min_quality_keeps_documents_at_or_above_threshold() {
3733        // composite = (0.8 + 0.6 + 0.7) / 3 = 0.7
3734        let high = doc_with_quality("high", 0.8, 0.6, 0.7);
3735        // composite = (0.3 + 0.2 + 0.1) / 3 ≈ 0.2
3736        let low = doc_with_quality("low", 0.3, 0.2, 0.1);
3737        let docs = vec![high, low];
3738
3739        let result = TerraphimService::apply_min_quality_filter(docs, Some(0.5));
3740        assert_eq!(result.len(), 1);
3741        assert_eq!(result[0].id, "high");
3742    }
3743
3744    #[test]
3745    fn test_min_quality_excludes_documents_below_threshold() {
3746        // composite = 0.4
3747        let doc = doc_with_quality("below", 0.4, 0.4, 0.4);
3748        let result = TerraphimService::apply_min_quality_filter(vec![doc], Some(0.5));
3749        assert!(result.is_empty());
3750    }
3751
3752    #[test]
3753    fn test_min_quality_excludes_documents_without_quality_score() {
3754        // Documents with no quality_score must be excluded when a threshold is set.
3755        let no_score = doc_without_quality("no-score");
3756        let result = TerraphimService::apply_min_quality_filter(vec![no_score], Some(0.0));
3757        assert!(result.is_empty());
3758    }
3759
3760    #[test]
3761    fn test_min_quality_exact_threshold_is_included() {
3762        // composite = 0.5 exactly — must satisfy >= threshold
3763        let doc = doc_with_quality("exact", 0.5, 0.5, 0.5);
3764        let result = TerraphimService::apply_min_quality_filter(vec![doc], Some(0.5));
3765        assert_eq!(result.len(), 1);
3766    }
3767
3768    #[test]
3769    fn test_min_quality_threshold_zero_excludes_no_score_docs() {
3770        // Threshold 0.0 passes any document that has a score, but not scoreless ones.
3771        let with_score = doc_with_quality("scored", 0.0, 0.0, 0.0);
3772        let no_score = doc_without_quality("unscored");
3773        let result =
3774            TerraphimService::apply_min_quality_filter(vec![with_score, no_score], Some(0.0));
3775        assert_eq!(result.len(), 1);
3776        assert_eq!(result[0].id, "scored");
3777    }
3778
3779    #[test]
3780    fn test_min_quality_empty_input_returns_empty() {
3781        let result = TerraphimService::apply_min_quality_filter(vec![], Some(0.5));
3782        assert!(result.is_empty());
3783    }
3784
3785    #[test]
3786    fn test_min_quality_preserves_document_order() {
3787        // Verify that documents passing the filter are returned in original order.
3788        let a = doc_with_quality("a", 0.9, 0.9, 0.9);
3789        let b = doc_with_quality("b", 0.8, 0.8, 0.8);
3790        let c = doc_with_quality("c", 0.7, 0.7, 0.7);
3791        let result = TerraphimService::apply_min_quality_filter(vec![a, b, c], Some(0.5));
3792        assert_eq!(result.len(), 3);
3793        assert_eq!(result[0].id, "a");
3794        assert_eq!(result[1].id, "b");
3795        assert_eq!(result[2].id, "c");
3796    }
3797
3798    #[test]
3799    fn test_min_quality_negative_threshold_clamped_to_zero() {
3800        // A negative threshold is clamped to 0.0: documents with any score pass,
3801        // documents without a score are still excluded.
3802        let with_score = doc_with_quality("scored", 0.1, 0.1, 0.1);
3803        let no_score = doc_without_quality("unscored");
3804        let result =
3805            TerraphimService::apply_min_quality_filter(vec![with_score, no_score], Some(-0.1));
3806        assert_eq!(result.len(), 1, "only scored document should pass");
3807        assert_eq!(result[0].id, "scored");
3808    }
3809
3810    #[test]
3811    fn test_snippet_around_ascii_simple() {
3812        let s = "Hello World foo](kg:bar Baz";
3813        let result = snippet_around(s, "](kg:", 10, 10);
3814        assert_eq!(result, " World foo](kg:bar Baz");
3815    }
3816
3817    #[test]
3818    fn test_snippet_around_ascii_truncation_left() {
3819        let s = "xyz Hello World foo](kg:bar";
3820        let result = snippet_around(s, "](kg:", 10, 10);
3821        assert_eq!(result, " World foo](kg:bar");
3822    }
3823
3824    #[test]
3825    fn test_snippet_around_ascii_truncation_right() {
3826        let s = "Hello World foo](kg:bar xyz";
3827        let result = snippet_around(s, "](kg:", 10, 10);
3828        assert_eq!(result, " World foo](kg:bar xyz");
3829    }
3830
3831    #[test]
3832    fn test_snippet_around_multibyte_cjk() {
3833        let s = "日本語 Hello](kg:bar 日本語";
3834        let result = snippet_around(s, "](kg:", 5, 5);
3835        assert!(!result.is_empty());
3836        assert!(result.contains("Hello"));
3837        assert!(result.contains("](kg:"));
3838    }
3839
3840    #[test]
3841    fn test_snippet_around_multibyte_emoji() {
3842        let s = "Hello 😂 World](kg:bar";
3843        let result = snippet_around(s, "](kg:", 10, 10);
3844        assert!(!result.is_empty());
3845        assert!(result.contains("😂"));
3846        assert!(result.contains("](kg:"));
3847    }
3848
3849    #[test]
3850    fn test_snippet_around_marker_not_found() {
3851        let s = "Hello World";
3852        let result = snippet_around(s, "](kg:", 10, 10);
3853        assert_eq!(result, "");
3854    }
3855
3856    #[test]
3857    fn test_snippet_around_empty_string() {
3858        let s = "";
3859        let result = snippet_around(s, "](kg:", 10, 10);
3860        assert_eq!(result, "");
3861    }
3862
3863    #[test]
3864    fn test_snippet_around_marker_at_start() {
3865        let s = "](kg:bar Hello";
3866        let result = snippet_around(s, "](kg:", 10, 10);
3867        assert_eq!(result, "](kg:bar Hello");
3868    }
3869
3870    #[test]
3871    fn test_snippet_around_marker_at_end() {
3872        let s = "Hello ](kg:bar";
3873        let result = snippet_around(s, "](kg:", 10, 10);
3874        assert_eq!(result, "Hello ](kg:bar");
3875    }
3876}