rexis_rag/graph_retrieval/
config.rs

1//! # Graph Retrieval Configuration
2//!
3//! Centralized configuration structures for the graph-based retrieval system.
4
5use super::{
6    algorithms::PageRankConfig, entity::EntityExtractionConfig, query_expansion::ExpansionConfig,
7    storage::GraphStorageConfig,
8};
9use serde::{Deserialize, Serialize};
10
11/// Main configuration for graph-based retrieval
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct GraphConfig {
14    /// Entity extraction configuration
15    pub entity_extraction: EntityExtractionConfig,
16
17    /// Graph storage configuration
18    pub storage: GraphStorageConfig,
19
20    /// Query expansion configuration
21    pub query_expansion: ExpansionConfig,
22
23    /// Algorithm configurations
24    pub algorithms: AlgorithmConfig,
25
26    /// Performance configuration
27    pub performance: PerformanceConfig,
28
29    /// Feature flags
30    pub features: FeatureFlags,
31}
32
33/// Algorithm-specific configurations
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct AlgorithmConfig {
36    /// PageRank algorithm configuration
37    pub pagerank: PageRankConfig,
38
39    /// Graph traversal limits
40    pub traversal: TraversalConfig,
41
42    /// Similarity computation settings
43    pub similarity: SimilarityConfig,
44
45    /// Path-finding configuration
46    pub pathfinding: PathFindingConfig,
47}
48
49/// Graph traversal configuration
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct TraversalConfig {
52    /// Maximum depth for graph traversal
53    pub max_depth: usize,
54
55    /// Maximum number of nodes to visit
56    pub max_nodes: usize,
57
58    /// Maximum distance for weighted traversals
59    pub max_distance: f32,
60
61    /// Enable early termination optimizations
62    pub enable_early_termination: bool,
63}
64
65/// Similarity computation configuration
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct SimilarityConfig {
68    /// Default similarity metric
69    pub default_metric: SimilarityMetric,
70
71    /// Threshold for considering nodes similar
72    pub similarity_threshold: f32,
73
74    /// Enable embedding-based similarity
75    pub enable_embedding_similarity: bool,
76
77    /// Enable structural similarity
78    pub enable_structural_similarity: bool,
79
80    /// Weights for different similarity factors
81    pub similarity_weights: SimilarityWeights,
82}
83
84/// Path-finding configuration
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct PathFindingConfig {
87    /// Maximum path length to consider
88    pub max_path_length: usize,
89
90    /// Maximum number of paths to find
91    pub max_paths: usize,
92
93    /// Minimum path score threshold
94    pub min_path_score: f32,
95
96    /// Enable bidirectional search
97    pub enable_bidirectional_search: bool,
98
99    /// Path scoring method
100    pub scoring_method: PathScoringMethod,
101}
102
103/// Performance-related configuration
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct PerformanceConfig {
106    /// Enable parallel processing
107    pub enable_parallel_processing: bool,
108
109    /// Number of worker threads
110    pub num_workers: usize,
111
112    /// Batch size for bulk operations
113    pub batch_size: usize,
114
115    /// Cache size limits
116    pub cache_limits: CacheLimits,
117
118    /// Memory usage limits
119    pub memory_limits: MemoryLimits,
120
121    /// Timeout settings
122    pub timeouts: TimeoutConfig,
123}
124
125/// Feature flags for enabling/disabling functionality
126#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct FeatureFlags {
128    /// Enable entity extraction
129    pub entity_extraction: bool,
130
131    /// Enable relationship extraction
132    pub relationship_extraction: bool,
133
134    /// Enable query expansion
135    pub query_expansion: bool,
136
137    /// Enable PageRank scoring
138    pub pagerank_scoring: bool,
139
140    /// Enable path-based retrieval
141    pub path_based_retrieval: bool,
142
143    /// Enable result diversification
144    pub result_diversification: bool,
145
146    /// Enable semantic search
147    pub semantic_search: bool,
148
149    /// Enable graph-based re-ranking
150    pub graph_reranking: bool,
151
152    /// Enable incremental updates
153    pub incremental_updates: bool,
154
155    /// Enable distributed processing
156    pub distributed_processing: bool,
157}
158
159/// Similarity metrics
160#[derive(Debug, Clone, Serialize, Deserialize)]
161pub enum SimilarityMetric {
162    /// Cosine similarity
163    Cosine,
164
165    /// Euclidean distance (converted to similarity)
166    Euclidean,
167
168    /// Jaccard similarity
169    Jaccard,
170
171    /// Dice coefficient
172    Dice,
173
174    /// Custom similarity function
175    Custom(String),
176}
177
178/// Weights for different similarity factors
179#[derive(Debug, Clone, Serialize, Deserialize)]
180pub struct SimilarityWeights {
181    /// Weight for content similarity
182    pub content: f32,
183
184    /// Weight for structural similarity
185    pub structural: f32,
186
187    /// Weight for semantic similarity
188    pub semantic: f32,
189
190    /// Weight for temporal similarity
191    pub temporal: f32,
192
193    /// Weight for metadata similarity
194    pub metadata: f32,
195}
196
197/// Path scoring methods
198#[derive(Debug, Clone, Serialize, Deserialize)]
199pub enum PathScoringMethod {
200    /// Simple path length-based scoring
201    Length,
202
203    /// Edge weight-based scoring
204    EdgeWeight,
205
206    /// PageRank-based scoring
207    PageRank,
208
209    /// Combined scoring using multiple factors
210    Combined(Vec<PathScoringFactor>),
211}
212
213/// Factors for path scoring
214#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct PathScoringFactor {
216    /// Factor type
217    pub factor_type: PathFactorType,
218
219    /// Weight of this factor
220    pub weight: f32,
221}
222
223/// Types of path scoring factors
224#[derive(Debug, Clone, Serialize, Deserialize)]
225pub enum PathFactorType {
226    /// Path length
227    Length,
228
229    /// Average edge weight
230    AverageEdgeWeight,
231
232    /// Minimum edge weight
233    MinEdgeWeight,
234
235    /// PageRank of nodes in path
236    NodePageRank,
237
238    /// Semantic coherence of path
239    SemanticCoherence,
240}
241
242/// Cache size limits
243#[derive(Debug, Clone, Serialize, Deserialize)]
244pub struct CacheLimits {
245    /// Maximum number of cached queries
246    pub max_cached_queries: usize,
247
248    /// Maximum number of cached PageRank scores
249    pub max_cached_pagerank: usize,
250
251    /// Maximum number of cached entity embeddings
252    pub max_cached_embeddings: usize,
253
254    /// Maximum number of cached paths
255    pub max_cached_paths: usize,
256
257    /// Cache TTL in seconds
258    pub cache_ttl_seconds: u64,
259}
260
261/// Memory usage limits
262#[derive(Debug, Clone, Serialize, Deserialize)]
263pub struct MemoryLimits {
264    /// Maximum graph size in MB
265    pub max_graph_size_mb: usize,
266
267    /// Maximum number of nodes
268    pub max_nodes: usize,
269
270    /// Maximum number of edges
271    pub max_edges: usize,
272
273    /// Memory threshold for triggering cleanup
274    pub cleanup_threshold_mb: usize,
275}
276
277/// Timeout configuration
278#[derive(Debug, Clone, Serialize, Deserialize)]
279pub struct TimeoutConfig {
280    /// Query timeout in seconds
281    pub query_timeout_seconds: u64,
282
283    /// Entity extraction timeout in seconds
284    pub extraction_timeout_seconds: u64,
285
286    /// Graph traversal timeout in seconds
287    pub traversal_timeout_seconds: u64,
288
289    /// PageRank computation timeout in seconds
290    pub pagerank_timeout_seconds: u64,
291}
292
293/// Default implementations
294impl Default for GraphConfig {
295    fn default() -> Self {
296        Self {
297            entity_extraction: EntityExtractionConfig::default(),
298            storage: GraphStorageConfig::default(),
299            query_expansion: ExpansionConfig::default(),
300            algorithms: AlgorithmConfig::default(),
301            performance: PerformanceConfig::default(),
302            features: FeatureFlags::default(),
303        }
304    }
305}
306
307impl Default for AlgorithmConfig {
308    fn default() -> Self {
309        Self {
310            pagerank: PageRankConfig::default(),
311            traversal: TraversalConfig::default(),
312            similarity: SimilarityConfig::default(),
313            pathfinding: PathFindingConfig::default(),
314        }
315    }
316}
317
318impl Default for TraversalConfig {
319    fn default() -> Self {
320        Self {
321            max_depth: 5,
322            max_nodes: 1000,
323            max_distance: 10.0,
324            enable_early_termination: true,
325        }
326    }
327}
328
329impl Default for SimilarityConfig {
330    fn default() -> Self {
331        Self {
332            default_metric: SimilarityMetric::Cosine,
333            similarity_threshold: 0.7,
334            enable_embedding_similarity: true,
335            enable_structural_similarity: true,
336            similarity_weights: SimilarityWeights::default(),
337        }
338    }
339}
340
341impl Default for SimilarityWeights {
342    fn default() -> Self {
343        Self {
344            content: 0.4,
345            structural: 0.2,
346            semantic: 0.3,
347            temporal: 0.05,
348            metadata: 0.05,
349        }
350    }
351}
352
353impl Default for PathFindingConfig {
354    fn default() -> Self {
355        Self {
356            max_path_length: 6,
357            max_paths: 10,
358            min_path_score: 0.1,
359            enable_bidirectional_search: true,
360            scoring_method: PathScoringMethod::Combined(vec![
361                PathScoringFactor {
362                    factor_type: PathFactorType::Length,
363                    weight: 0.3,
364                },
365                PathScoringFactor {
366                    factor_type: PathFactorType::AverageEdgeWeight,
367                    weight: 0.4,
368                },
369                PathScoringFactor {
370                    factor_type: PathFactorType::NodePageRank,
371                    weight: 0.3,
372                },
373            ]),
374        }
375    }
376}
377
378impl Default for PerformanceConfig {
379    fn default() -> Self {
380        Self {
381            enable_parallel_processing: true,
382            num_workers: num_cpus::get(),
383            batch_size: 100,
384            cache_limits: CacheLimits::default(),
385            memory_limits: MemoryLimits::default(),
386            timeouts: TimeoutConfig::default(),
387        }
388    }
389}
390
391impl Default for FeatureFlags {
392    fn default() -> Self {
393        Self {
394            entity_extraction: true,
395            relationship_extraction: true,
396            query_expansion: true,
397            pagerank_scoring: true,
398            path_based_retrieval: true,
399            result_diversification: true,
400            semantic_search: true,
401            graph_reranking: true,
402            incremental_updates: false,
403            distributed_processing: false,
404        }
405    }
406}
407
408impl Default for CacheLimits {
409    fn default() -> Self {
410        Self {
411            max_cached_queries: 1000,
412            max_cached_pagerank: 1,
413            max_cached_embeddings: 10000,
414            max_cached_paths: 5000,
415            cache_ttl_seconds: 3600, // 1 hour
416        }
417    }
418}
419
420impl Default for MemoryLimits {
421    fn default() -> Self {
422        Self {
423            max_graph_size_mb: 1024, // 1 GB
424            max_nodes: 1_000_000,
425            max_edges: 5_000_000,
426            cleanup_threshold_mb: 800,
427        }
428    }
429}
430
431impl Default for TimeoutConfig {
432    fn default() -> Self {
433        Self {
434            query_timeout_seconds: 30,
435            extraction_timeout_seconds: 300, // 5 minutes
436            traversal_timeout_seconds: 60,
437            pagerank_timeout_seconds: 600, // 10 minutes
438        }
439    }
440}
441
442/// Configuration builder for easier setup
443pub struct GraphConfigBuilder {
444    config: GraphConfig,
445}
446
447impl GraphConfigBuilder {
448    /// Create a new configuration builder
449    pub fn new() -> Self {
450        Self {
451            config: GraphConfig::default(),
452        }
453    }
454
455    /// Enable/disable entity extraction
456    pub fn with_entity_extraction(mut self, enabled: bool) -> Self {
457        self.config.features.entity_extraction = enabled;
458        self
459    }
460
461    /// Set entity extraction confidence threshold
462    pub fn with_entity_confidence_threshold(mut self, threshold: f32) -> Self {
463        self.config.entity_extraction.min_confidence = threshold;
464        self
465    }
466
467    /// Enable/disable query expansion
468    pub fn with_query_expansion(mut self, enabled: bool) -> Self {
469        self.config.features.query_expansion = enabled;
470        self
471    }
472
473    /// Set maximum expansion terms
474    pub fn with_max_expansion_terms(mut self, max_terms: usize) -> Self {
475        self.config.query_expansion.max_expansion_terms = max_terms;
476        self
477    }
478
479    /// Enable/disable PageRank scoring
480    pub fn with_pagerank_scoring(mut self, enabled: bool) -> Self {
481        self.config.features.pagerank_scoring = enabled;
482        self
483    }
484
485    /// Set PageRank damping factor
486    pub fn with_pagerank_damping_factor(mut self, damping_factor: f32) -> Self {
487        self.config.algorithms.pagerank.damping_factor = damping_factor;
488        self
489    }
490
491    /// Set graph traversal limits
492    pub fn with_traversal_limits(mut self, max_depth: usize, max_nodes: usize) -> Self {
493        self.config.algorithms.traversal.max_depth = max_depth;
494        self.config.algorithms.traversal.max_nodes = max_nodes;
495        self
496    }
497
498    /// Set similarity threshold
499    pub fn with_similarity_threshold(mut self, threshold: f32) -> Self {
500        self.config.algorithms.similarity.similarity_threshold = threshold;
501        self
502    }
503
504    /// Enable/disable parallel processing
505    pub fn with_parallel_processing(mut self, enabled: bool) -> Self {
506        self.config.performance.enable_parallel_processing = enabled;
507        self
508    }
509
510    /// Set number of worker threads
511    pub fn with_num_workers(mut self, num_workers: usize) -> Self {
512        self.config.performance.num_workers = num_workers;
513        self
514    }
515
516    /// Set batch size
517    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
518        self.config.performance.batch_size = batch_size;
519        self
520    }
521
522    /// Set memory limits
523    pub fn with_memory_limits(
524        mut self,
525        max_graph_size_mb: usize,
526        max_nodes: usize,
527        max_edges: usize,
528    ) -> Self {
529        self.config.performance.memory_limits.max_graph_size_mb = max_graph_size_mb;
530        self.config.performance.memory_limits.max_nodes = max_nodes;
531        self.config.performance.memory_limits.max_edges = max_edges;
532        self
533    }
534
535    /// Set query timeout
536    pub fn with_query_timeout(mut self, timeout_seconds: u64) -> Self {
537        self.config.performance.timeouts.query_timeout_seconds = timeout_seconds;
538        self
539    }
540
541    /// Enable all features (for development/testing)
542    pub fn with_all_features(mut self) -> Self {
543        self.config.features = FeatureFlags {
544            entity_extraction: true,
545            relationship_extraction: true,
546            query_expansion: true,
547            pagerank_scoring: true,
548            path_based_retrieval: true,
549            result_diversification: true,
550            semantic_search: true,
551            graph_reranking: true,
552            incremental_updates: true,
553            distributed_processing: false, // Keep disabled for single-node setup
554        };
555        self
556    }
557
558    /// Enable minimal features (for lightweight deployment)
559    pub fn with_minimal_features(mut self) -> Self {
560        self.config.features = FeatureFlags {
561            entity_extraction: true,
562            relationship_extraction: false,
563            query_expansion: true,
564            pagerank_scoring: false,
565            path_based_retrieval: false,
566            result_diversification: false,
567            semantic_search: true,
568            graph_reranking: false,
569            incremental_updates: false,
570            distributed_processing: false,
571        };
572        self
573    }
574
575    /// Build the configuration
576    pub fn build(self) -> GraphConfig {
577        self.config
578    }
579}
580
581impl Default for GraphConfigBuilder {
582    fn default() -> Self {
583        Self::new()
584    }
585}
586
587/// Validation methods for configuration
588impl GraphConfig {
589    /// Validate the configuration and return warnings/errors
590    pub fn validate(&self) -> Result<Vec<String>, Vec<String>> {
591        let mut warnings = Vec::new();
592        let mut errors = Vec::new();
593
594        // Validate entity extraction settings
595        if self.features.entity_extraction {
596            if self.entity_extraction.min_confidence < 0.0
597                || self.entity_extraction.min_confidence > 1.0
598            {
599                errors.push("Entity extraction confidence must be between 0.0 and 1.0".to_string());
600            }
601
602            if self.entity_extraction.max_entity_length == 0 {
603                errors.push("Maximum entity length must be greater than 0".to_string());
604            }
605        }
606
607        // Validate query expansion settings
608        if self.features.query_expansion {
609            if self.query_expansion.max_expansion_terms == 0 {
610                warnings.push(
611                    "Maximum expansion terms is 0, query expansion will be ineffective".to_string(),
612                );
613            }
614        }
615
616        // Validate algorithm settings
617        if self.algorithms.pagerank.damping_factor < 0.0
618            || self.algorithms.pagerank.damping_factor > 1.0
619        {
620            errors.push("PageRank damping factor must be between 0.0 and 1.0".to_string());
621        }
622
623        if self.algorithms.traversal.max_depth == 0 {
624            errors.push("Maximum traversal depth must be greater than 0".to_string());
625        }
626
627        if self.algorithms.similarity.similarity_threshold < 0.0
628            || self.algorithms.similarity.similarity_threshold > 1.0
629        {
630            errors.push("Similarity threshold must be between 0.0 and 1.0".to_string());
631        }
632
633        // Validate performance settings
634        if self.performance.num_workers == 0 {
635            errors.push("Number of workers must be greater than 0".to_string());
636        }
637
638        if self.performance.batch_size == 0 {
639            errors.push("Batch size must be greater than 0".to_string());
640        }
641
642        // Validate memory limits
643        if self.performance.memory_limits.max_nodes == 0 {
644            errors.push("Maximum number of nodes must be greater than 0".to_string());
645        }
646
647        if self.performance.memory_limits.max_edges == 0 {
648            errors.push("Maximum number of edges must be greater than 0".to_string());
649        }
650
651        // Check for logical inconsistencies
652        if !self.features.entity_extraction && self.features.relationship_extraction {
653            warnings.push(
654                "Relationship extraction requires entity extraction to be enabled".to_string(),
655            );
656        }
657
658        if !self.features.pagerank_scoring
659            && self.algorithms.pathfinding.scoring_method.uses_pagerank()
660        {
661            warnings
662                .push("Path scoring uses PageRank but PageRank scoring is disabled".to_string());
663        }
664
665        if errors.is_empty() {
666            Ok(warnings)
667        } else {
668            Err(errors)
669        }
670    }
671}
672
673impl PathScoringMethod {
674    /// Check if this scoring method uses PageRank
675    pub fn uses_pagerank(&self) -> bool {
676        match self {
677            PathScoringMethod::PageRank => true,
678            PathScoringMethod::Combined(factors) => factors
679                .iter()
680                .any(|f| matches!(f.factor_type, PathFactorType::NodePageRank)),
681            _ => false,
682        }
683    }
684}
685
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration enables the core features and has sane values.
    #[test]
    fn test_default_config() {
        let config = GraphConfig::default();

        // Core features are enabled by default...
        assert!(config.features.entity_extraction);
        assert!(config.features.query_expansion);
        assert!(config.features.pagerank_scoring);

        // ...while incremental updates and distributed processing are opt-in.
        assert!(!config.features.incremental_updates);
        assert!(!config.features.distributed_processing);

        // Verify reasonable default values
        assert!(config.algorithms.pagerank.damping_factor > 0.0);
        assert!(config.algorithms.pagerank.damping_factor < 1.0);
        assert!(config.algorithms.traversal.max_depth > 0);
        assert!(config.performance.batch_size > 0);
    }

    /// Every builder setter writes the field it is named after.
    #[test]
    fn test_config_builder() {
        let config = GraphConfigBuilder::new()
            .with_entity_extraction(true)
            .with_entity_confidence_threshold(0.8)
            .with_query_expansion(true)
            .with_max_expansion_terms(15)
            .with_pagerank_scoring(true)
            .with_pagerank_damping_factor(0.9)
            .with_parallel_processing(true)
            .with_num_workers(4)
            .with_batch_size(50)
            .build();

        assert!(config.features.entity_extraction);
        assert_eq!(config.entity_extraction.min_confidence, 0.8);
        assert!(config.features.query_expansion);
        assert_eq!(config.query_expansion.max_expansion_terms, 15);
        assert!(config.features.pagerank_scoring);
        assert_eq!(config.algorithms.pagerank.damping_factor, 0.9);
        assert!(config.performance.enable_parallel_processing);
        assert_eq!(config.performance.num_workers, 4);
        assert_eq!(config.performance.batch_size, 50);
    }

    /// Validation accepts the defaults and reports the offending setting otherwise.
    #[test]
    fn test_config_validation() {
        let mut config = GraphConfig::default();

        // Valid configuration should pass
        let result = config.validate();
        assert!(result.is_ok());

        // Invalid damping factor should fail, and the error should say so
        config.algorithms.pagerank.damping_factor = 1.5;
        let errors = config
            .validate()
            .expect_err("a damping factor of 1.5 must be rejected");
        assert!(errors.iter().any(|e| e.contains("damping factor")));

        // Reset and test another invalid setting
        config.algorithms.pagerank.damping_factor = 0.85;
        config.performance.num_workers = 0;
        let errors = config
            .validate()
            .expect_err("zero workers must be rejected");
        assert!(errors.iter().any(|e| e.contains("workers")));
        assert!(!errors.iter().any(|e| e.contains("damping factor")));
    }

    /// The two feature presets switch the expected flags on and off.
    #[test]
    fn test_minimal_and_full_features() {
        let minimal_config = GraphConfigBuilder::new().with_minimal_features().build();

        assert!(minimal_config.features.entity_extraction);
        assert!(!minimal_config.features.relationship_extraction);
        assert!(!minimal_config.features.pagerank_scoring);

        let full_config = GraphConfigBuilder::new().with_all_features().build();

        assert!(full_config.features.entity_extraction);
        assert!(full_config.features.relationship_extraction);
        assert!(full_config.features.pagerank_scoring);
        assert!(full_config.features.incremental_updates);
        // The "all features" preset deliberately leaves this one off.
        assert!(!full_config.features.distributed_processing);
    }
}
764        assert!(full_config.features.incremental_updates);
765    }
766}