//! GraphRAG configuration module (`graphrag_core/config/mod.rs`).
use crate::Result;
use std::fs;

/// Enhanced configuration options for GraphRAG
pub mod enhancements;
/// JSON5 configuration support
#[cfg(feature = "json5-support")]
pub mod json5_loader;
/// Configuration file loading utilities
pub mod loader;
/// JSON Schema validation
#[cfg(feature = "json5-support")]
pub mod schema_validator;
/// SetConfig configuration support (TOML, JSON5, YAML, JSON)
pub mod setconfig;
/// Configuration validation utilities
pub mod validation;
pub use setconfig::{
    AlgorithmicEmbeddingsConfig,
    AlgorithmicEntityConfig,
    AlgorithmicGraphConfig,
    // Algorithmic/Classic NLP pipeline
    AlgorithmicPipelineConfig,
    AlgorithmicRetrievalConfig,
    HybridEmbeddingsConfig,
    HybridEntityConfig,
    HybridGraphConfig,
    // Hybrid pipeline
    HybridPipelineConfig,
    HybridRetrievalConfig,
    HybridWeightsConfig,
    // Pipeline approach configuration
    ModeConfig,
    SemanticEmbeddingsConfig,
    SemanticEntityConfig,
    SemanticGraphConfig,
    // Semantic/Neural pipeline
    SemanticPipelineConfig,
    SemanticRetrievalConfig,
    SetConfig,
};
pub use validation::{validate_config_file, Validatable, ValidationResult};
44
45/// Configuration for the GraphRAG system
46#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
47pub struct Config {
48    /// Output directory for storing graphs and data
49    pub output_dir: String,
50
51    /// Chunk size for text processing
52    pub chunk_size: usize,
53
54    /// Overlap between chunks
55    pub chunk_overlap: usize,
56
57    /// Maximum entities per chunk
58    pub max_entities_per_chunk: Option<usize>,
59
60    /// Top-k results for retrieval
61    pub top_k_results: Option<usize>,
62
63    /// Similarity threshold for retrieval
64    pub similarity_threshold: Option<f32>,
65
66    /// Pipeline approach: "semantic", "algorithmic", or "hybrid"
67    /// Determines which implementation strategy to use for entity extraction and retrieval
68    #[serde(default = "default_approach")]
69    pub approach: String,
70
71    /// Vector embedding configuration
72    pub embeddings: EmbeddingConfig,
73
74    /// Graph construction parameters
75    pub graph: GraphConfig,
76
77    /// Text processing settings
78    pub text: TextConfig,
79
80    /// Entity extraction settings
81    pub entities: EntityConfig,
82
83    /// Retrieval system configuration
84    pub retrieval: RetrievalConfig,
85
86    /// Parallel processing configuration
87    pub parallel: ParallelConfig,
88
89    /// Ollama integration configuration
90    pub ollama: crate::ollama::OllamaConfig,
91
92    /// GLiNER-Relex extractor configuration
93    pub gliner: GlinerConfig,
94
95    /// Latest enhancements configuration
96    pub enhancements: enhancements::EnhancementsConfig,
97
98    /// Auto-save configuration for workspace persistence
99    pub auto_save: AutoSaveConfig,
100
101    /// Hierarchical summarization configuration
102    pub summarization: crate::summarization::HierarchicalConfig,
103
104    /// Zero-cost approach configuration
105    pub zero_cost_approach: ZeroCostApproachConfig,
106
107    /// Advanced features configuration (Phases 2-3)
108    #[serde(default)]
109    pub advanced_features: AdvancedFeaturesConfig,
110
111    /// Suppress indicatif progress bars (use hidden draw target).
112    /// Set to `true` when running inside a TUI to avoid corrupting the terminal.
113    #[serde(default)]
114    pub suppress_progress_bars: bool,
115}
116
117/// GLiNER-Relex extractor configuration (joint NER + RE via ONNX Runtime)
118#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
119pub struct GlinerConfig {
120    /// Enable GLiNER-Relex extraction
121    pub enabled: bool,
122    /// Path to the ONNX model file (e.g. "models/gliner-relex-large-v0.5.onnx")
123    pub model_path: String,
124    /// Path to tokenizer.json — defaults to same directory as model_path if empty
125    pub tokenizer_path: String,
126    /// Span-based ("span", default) or token-based ("token") NER pipeline
127    pub mode: String,
128    /// Entity types to extract
129    pub entity_labels: Vec<String>,
130    /// Relation types to extract (empty list disables RE stage)
131    pub relation_labels: Vec<String>,
132    /// Minimum entity confidence threshold (0.0–1.0)
133    pub entity_threshold: f32,
134    /// Minimum relation confidence threshold (0.0–1.0)
135    pub relation_threshold: f32,
136    /// Use GPU (CUDA) for inference
137    pub use_gpu: bool,
138}
139
140impl Default for GlinerConfig {
141    fn default() -> Self {
142        Self {
143            enabled: false,
144            model_path: String::new(),
145            tokenizer_path: String::new(),
146            mode: "span".to_string(),
147            entity_labels: vec![
148                "person".into(),
149                "organization".into(),
150                "location".into(),
151                "concept".into(),
152            ],
153            relation_labels: vec!["related to".into(), "part of".into(), "causes".into()],
154            entity_threshold: 0.4,
155            relation_threshold: 0.5,
156            use_gpu: false,
157        }
158    }
159}
160
/// Configuration for automatic workspace saving
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AutoSaveConfig {
    /// Enable persistent storage. When false (default), the graph lives in memory only.
    /// When true, the graph is saved to disk after `build_graph()` and loaded from disk
    /// on the next `initialize()` call (if the workspace already exists).
    #[serde(default)]
    pub enabled: bool,

    /// Base directory where workspace folders are stored.
    /// Required when `enabled = true`. Example: `"./output"` or `"/data/graphrag"`.
    #[serde(default)]
    pub base_dir: Option<String>,

    /// Auto-save interval in seconds (0 = save after every graph build)
    // NOTE(review): `default_auto_save_interval` is defined outside this chunk — verify.
    #[serde(default = "default_auto_save_interval")]
    pub interval_seconds: u64,

    /// Workspace name — the sub-folder inside `base_dir` (default: "default").
    #[serde(default)]
    pub workspace_name: Option<String>,

    /// Maximum number of auto-save versions to keep (0 = unlimited)
    // NOTE(review): `default_max_versions` is defined outside this chunk — verify.
    #[serde(default = "default_max_versions")]
    pub max_versions: usize,
}
187
/// Configuration for zero-cost GraphRAG approaches
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ZeroCostApproachConfig {
    /// Which zero-cost approach to use
    /// (defaults to "pure_algorithmic", see `default_zero_cost_approach`)
    #[serde(default = "default_zero_cost_approach")]
    pub approach: String,

    /// LazyGraphRAG-style configuration
    #[serde(default)]
    pub lazy_graphrag: LazyGraphRAGConfig,

    /// E2GraphRAG-style configuration
    #[serde(default)]
    pub e2_graphrag: E2GraphRAGConfig,

    /// Pure algorithmic configuration
    #[serde(default)]
    pub pure_algorithmic: PureAlgorithmicConfig,

    /// Hybrid strategy configuration
    #[serde(default)]
    pub hybrid_strategy: HybridStrategyConfig,
}
211
/// Configuration for LazyGraphRAG, an efficient approach for large-scale knowledge graphs.
/// This configuration enables lazy loading and processing of graph components.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct LazyGraphRAGConfig {
    /// Whether LazyGraphRAG is enabled (derived `Default` leaves this `false`)
    pub enabled: bool,
    /// Configuration for concept extraction from text
    pub concept_extraction: ConceptExtractionConfig,
    /// Configuration for co-occurrence analysis of concepts
    pub co_occurrence: CoOccurrenceConfig,
    /// Configuration for lazy indexing of graph components
    pub indexing: LazyIndexingConfig,
    /// Configuration for query expansion strategies
    pub query_expansion: LazyQueryExpansionConfig,
    /// Configuration for relevance scoring of results
    pub relevance_scoring: LazyRelevanceScoringConfig,
}
229
/// Configuration for extracting concepts from text documents.
/// This configuration controls how key concepts are identified and extracted from text.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ConceptExtractionConfig {
    /// Minimum length of a concept in characters (default: 3)
    pub min_concept_length: usize,
    /// Maximum number of words in a multi-word concept (default: 5)
    pub max_concept_words: usize,
    /// Whether to extract noun phrases as concepts
    pub use_noun_phrases: bool,
    /// Whether to consider capitalized words as potential concepts
    pub use_capitalization: bool,
    /// Whether to consider title-cased phrases as potential concepts
    pub use_title_case: bool,
    /// Whether to use TF-IDF scoring for concept importance
    pub use_tf_idf_scoring: bool,
    /// Minimum term frequency for a term to be considered a concept
    pub min_term_frequency: usize,
    /// Maximum number of concepts to extract per document chunk (default: 10)
    pub max_concepts_per_chunk: usize,
    /// Minimum score threshold for a term to be considered a concept
    pub min_concept_score: f32,
    /// Whether to exclude common stopwords from concept extraction
    pub exclude_stopwords: bool,
    /// Custom list of stopwords to exclude from concept extraction
    pub custom_stopwords: Vec<String>,
}

/// Configuration for co-occurrence analysis of concepts in documents.
/// This determines how relationships between concepts are identified based on their co-occurrence.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CoOccurrenceConfig {
    /// Size of the sliding window (in words) to consider for co-occurrence (default: 50)
    pub window_size: usize,
    /// Minimum number of co-occurrences required to create an edge between concepts
    pub min_co_occurrence: usize,
    /// Jaccard similarity threshold for merging similar concepts
    pub jaccard_threshold: f32,
    /// Maximum number of edges allowed per node in the co-occurrence graph
    pub max_edges_per_node: usize,
}

/// Configuration for lazy indexing of graph components.
/// Controls how graph components are indexed for efficient retrieval.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyIndexingConfig {
    /// Whether to use bidirectional indexing for faster lookups
    pub use_bidirectional_index: bool,
    /// Whether to enable HNSW (Hierarchical Navigable Small World) index for approximate nearest neighbor search
    pub enable_hnsw_index: bool,
    /// Maximum number of items to keep in the index cache (default: 10000)
    pub cache_size: usize,
}

/// Configuration for lazy query expansion in the retrieval process.
/// Controls how queries are expanded to improve search results.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyQueryExpansionConfig {
    /// Whether query expansion is enabled
    pub enabled: bool,
    /// Maximum number of query expansions to generate (default: 3)
    pub max_expansions: usize,
    /// Name of the model to use for query expansion (default: "llama3.1:8b")
    pub expansion_model: String,
    /// Temperature parameter for controlling randomness in expansion generation
    pub expansion_temperature: f32,
    /// Maximum number of tokens to generate per expansion
    pub max_tokens_per_expansion: usize,
}

/// Configuration for lazy relevance scoring of search results.
/// Controls how search results are scored for relevance to the query.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyRelevanceScoringConfig {
    /// Whether relevance scoring is enabled
    pub enabled: bool,
    /// Name of the model to use for relevance scoring (default: "llama3.1:8b")
    pub scoring_model: String,
    /// Number of items to score in a single batch
    pub batch_size: usize,
    /// Temperature parameter for controlling randomness in scoring
    pub temperature: f32,
    /// Maximum number of tokens to consider for each score calculation
    pub max_tokens_per_score: usize,
}
315
/// End-to-End GraphRAG configuration for comprehensive knowledge graph construction.
/// This configuration enables fine-grained control over the entire pipeline from text to knowledge graph.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct E2GraphRAGConfig {
    /// Whether the E2E GraphRAG pipeline is enabled (derived `Default` leaves this `false`)
    pub enabled: bool,

    /// Configuration for Named Entity Recognition (NER) extraction
    pub ner_extraction: NERExtractionConfig,

    /// Configuration for keyword extraction from text
    pub keyword_extraction: KeywordExtractionConfig,

    /// Configuration for graph construction parameters
    pub graph_construction: E2GraphConstructionConfig,

    /// Configuration for indexing strategies
    pub indexing: E2IndexingConfig,
}
335
336/// Configuration for Named Entity Recognition (NER) extraction from text.
337/// Controls how named entities are identified and extracted from documents.
338#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
339pub struct NERExtractionConfig {
340    /// List of entity types to recognize (e.g., ["PERSON", "ORG", "LOCATION"])
341    pub entity_types: Vec<String>,
342
343    /// Whether to recognize capitalized words as potential named entities
344    pub use_capitalized_patterns: bool,
345
346    /// Whether to recognize title-cased phrases as potential named entities
347    pub use_title_case_patterns: bool,
348
349    /// Whether to recognize quoted phrases as potential named entities
350    pub use_quoted_patterns: bool,
351
352    /// Whether to recognize common abbreviations as entities
353    pub use_abbreviations: bool,
354
355    /// Whether to use contextual disambiguation to resolve entity ambiguity
356    pub use_contextual_disambiguation: bool,
357
358    /// Minimum number of context words to consider for disambiguation
359    pub min_context_words: usize,
360
361    /// Minimum confidence score (0.0-1.0) required for an entity to be included
362    pub min_confidence: f32,
363
364    /// Whether to apply positional boost to entities based on their position in the text
365    pub use_positional_boost: bool,
366
367    /// Whether to apply frequency boost to entities based on their frequency in the text
368    pub use_frequency_boost: bool,
369}
370
371impl Default for NERExtractionConfig {
372    fn default() -> Self {
373        Self {
374            entity_types: vec![
375                "PERSON".to_string(),
376                "ORG".to_string(),
377                "LOCATION".to_string(),
378            ],
379            use_capitalized_patterns: true,
380            use_title_case_patterns: true,
381            use_quoted_patterns: true,
382            use_abbreviations: true,
383            use_contextual_disambiguation: true,
384            min_context_words: 5,
385            min_confidence: 0.7,
386            use_positional_boost: true,
387            use_frequency_boost: true,
388        }
389    }
390}
391
392/// Configuration for keyword extraction from text documents.
393/// Controls how keywords are identified and extracted from text content.
394#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
395pub struct KeywordExtractionConfig {
396    /// List of algorithms to use for keyword extraction (e.g., ["tfidf", "yake", "textrank"])
397    pub algorithms: Vec<String>,
398
399    /// Maximum number of keywords to extract per document chunk
400    pub max_keywords_per_chunk: usize,
401
402    /// Minimum length of a keyword in characters
403    pub min_keyword_length: usize,
404
405    /// Whether to combine results from multiple algorithms
406    pub combine_algorithms: bool,
407}
408
409impl Default for KeywordExtractionConfig {
410    fn default() -> Self {
411        Self {
412            algorithms: vec!["tfidf".to_string(), "yake".to_string()],
413            max_keywords_per_chunk: 10,
414            min_keyword_length: 3,
415            combine_algorithms: true,
416        }
417    }
418}
419
420/// Configuration for graph construction in the E2E GraphRAG pipeline.
421/// Controls how entities and their relationships are organized into a knowledge graph.
422#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
423pub struct E2GraphConstructionConfig {
424    /// Types of relationships to extract between entities (e.g., ["CO_OCCURS_WITH", "RELATED_TO"])
425    pub relationship_types: Vec<String>,
426
427    /// Minimum score required to establish a relationship between entities (0.0-1.0)
428    pub min_relationship_score: f32,
429
430    /// Maximum number of relationships to maintain per entity
431    pub max_relationships_per_entity: usize,
432
433    /// Whether to use mutual information for relationship scoring
434    pub use_mutual_information: bool,
435}
436
437impl Default for E2GraphConstructionConfig {
438    fn default() -> Self {
439        Self {
440            relationship_types: vec!["CO_OCCURS_WITH".to_string(), "RELATED_TO".to_string()],
441            min_relationship_score: 0.5,
442            max_relationships_per_entity: 20,
443            use_mutual_information: true,
444        }
445    }
446}
447
448/// Configuration for indexing in the E2E GraphRAG pipeline.
449/// Controls how entities, relationships, and their embeddings are indexed for efficient retrieval.
450#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
451pub struct E2IndexingConfig {
452    /// Number of items to process in a single batch during indexing
453    pub batch_size: usize,
454
455    /// Whether to enable parallel processing during indexing
456    pub enable_parallel_processing: bool,
457
458    /// Whether to cache concept vectors for faster retrieval
459    pub cache_concept_vectors: bool,
460
461    /// Whether to use hash embeddings for more efficient storage
462    pub use_hash_embeddings: bool,
463}
464
465impl Default for E2IndexingConfig {
466    fn default() -> Self {
467        Self {
468            batch_size: 32,
469            enable_parallel_processing: true,
470            cache_concept_vectors: true,
471            use_hash_embeddings: false,
472        }
473    }
474}
475
/// Configuration for pure algorithmic GraphRAG approach without LLM dependencies.
///
/// This configuration enables cost-effective graph construction and analysis
/// using only algorithmic methods for pattern extraction, keyword analysis,
/// and relationship discovery.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PureAlgorithmicConfig {
    /// Whether the pure algorithmic approach is enabled (default: `true`)
    pub enabled: bool,
    /// Configuration for extracting linguistic patterns from text
    pub pattern_extraction: PatternExtractionConfig,
    /// Configuration for keyword extraction using statistical methods
    pub keyword_extraction: PureKeywordExtractionConfig,
    /// Configuration for discovering relationships between entities
    pub relationship_discovery: RelationshipDiscoveryConfig,
    /// Configuration for search result ranking algorithms
    pub search_ranking: SearchRankingConfig,
}
494
/// Configuration for pattern extraction from text using regex and linguistic rules.
///
/// Pattern extraction identifies consistent linguistic structures that can indicate
/// entities, relationships, and semantic patterns without requiring LLM processing.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PatternExtractionConfig {
    /// Regex patterns for identifying capitalized entities (proper nouns, acronyms)
    pub capitalized_patterns: Vec<String>,
    /// Regex patterns for technical terms, jargon, and specialized language
    pub technical_patterns: Vec<String>,
    /// Regex patterns for contextual relationships and semantic structures
    pub context_patterns: Vec<String>,
}

/// Configuration for keyword extraction using statistical algorithms.
///
/// This configuration enables extraction of important terms from text using
/// algorithms like TF-IDF, RAKE, or YAKE without requiring LLM processing.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PureKeywordExtractionConfig {
    /// Algorithm to use for keyword extraction (e.g., "tfidf", "rake", "yake";
    /// default: "tf_idf")
    pub algorithm: String,
    /// Maximum number of keywords to extract per document
    pub max_keywords: usize,
    /// Minimum word length to consider for keywords
    pub min_word_length: usize,
    /// Whether to boost keywords based on their position in text
    pub use_positional_boost: bool,
    /// Whether to filter keywords based on frequency thresholds
    pub use_frequency_filter: bool,
    /// Minimum term frequency for a word to be considered a keyword
    pub min_term_frequency: usize,
    /// Maximum term frequency ratio to filter out overly common terms
    pub max_term_frequency_ratio: f32,
}

/// Configuration for discovering relationships between entities using co-occurrence analysis.
///
/// This configuration enables algorithmic relationship discovery by analyzing
/// word co-occurrence patterns and statistical measures without LLM inference.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RelationshipDiscoveryConfig {
    /// Window size for co-occurrence analysis (number of words to check around entities)
    pub window_size: usize,
    /// Minimum co-occurrence count to establish a relationship
    pub min_co_occurrence: usize,
    /// Whether to use mutual information scoring for relationship strength
    pub use_mutual_information: bool,
    /// Types of relationships to identify (e.g., "causal", "hierarchical", "temporal")
    pub relationship_types: Vec<String>,
    /// Scoring method for relationship ranking (e.g., "frequency", "mi", "pmi")
    pub scoring_method: String,
    /// Minimum similarity score threshold for valid relationships
    pub min_similarity_score: f32,
}
550
/// Configuration for search result ranking across multiple retrieval strategies.
///
/// This configuration enables combining different search approaches (vector, keyword,
/// graph traversal) and fusing their results for optimal relevance ranking.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SearchRankingConfig {
    /// Configuration for vector-based similarity search
    pub vector_search: VectorSearchConfig,
    /// Configuration for keyword-based search algorithms (e.g., BM25)
    pub keyword_search: KeywordSearchConfig,
    /// Configuration for graph-based traversal and ranking
    pub graph_traversal: GraphTraversalConfig,
    /// Configuration for hybrid fusion of multiple search strategies
    pub hybrid_fusion: HybridFusionConfig,
}

/// Configuration for vector-based similarity search.
///
/// Enables semantic search using embeddings and similarity scoring
/// for finding conceptually related content.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct VectorSearchConfig {
    /// Whether vector similarity search is enabled (derived `Default` is `false`)
    pub enabled: bool,
}

/// Configuration for keyword-based search algorithms.
///
/// Enables traditional information retrieval algorithms like BM25
/// for keyword matching and scoring.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct KeywordSearchConfig {
    /// Whether keyword-based search is enabled
    pub enabled: bool,
    /// Search algorithm to use (e.g., "bm25", "tfidf", "dirichlet")
    pub algorithm: String,
    /// BM25 parameter k1: controls term frequency saturation (typically 1.2-2.0)
    pub k1: f32,
    /// BM25 parameter b: controls document length normalization (typically 0.0-1.0)
    pub b: f32,
}

/// Configuration for graph-based traversal and ranking algorithms.
///
/// Enables graph algorithms like PageRank and personalized search
/// for navigating and ranking content in the knowledge graph.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphTraversalConfig {
    /// Whether graph traversal algorithms are enabled
    pub enabled: bool,
    /// Algorithm to use for graph traversal (e.g., "pagerank", "hits", "random_walk")
    pub algorithm: String,
    /// Damping factor for PageRank algorithm (typically 0.85)
    pub damping_factor: f32,
    /// Maximum iterations for graph algorithms
    pub max_iterations: usize,
    /// Whether to use personalized graph traversal
    pub personalized: bool,
}

/// Configuration for hybrid fusion of multiple search strategies.
///
/// Enables combining results from different search approaches (vector, keyword,
/// graph) using weighted scoring for improved relevance.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HybridFusionConfig {
    /// Whether hybrid fusion of search results is enabled
    pub enabled: bool,
    /// Weight configuration for different search strategies
    pub weights: FusionWeights,
}

/// Weight configuration for combining different search strategies.
///
/// Defines the relative importance of each search approach in the
/// hybrid fusion algorithm. Weights should typically sum to 1.0.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FusionWeights {
    /// Weight for keyword-based search results
    pub keywords: f32,
    /// Weight for graph traversal-based search results
    pub graph: f32,
    /// Weight for BM25/TF-IDF statistical search results
    pub bm25: f32,
}
636
/// Configuration for hybrid GraphRAG strategies combining algorithmic and LLM approaches.
///
/// This configuration enables different hybrid strategies for balancing cost,
/// performance, and quality through intelligent LLM usage.
// NOTE(review): `ZeroCostApproachConfig` relies on `HybridStrategyConfig::default()`,
// but no `Default` impl for this type (or its field types) is visible in this chunk —
// confirm one exists elsewhere in the file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HybridStrategyConfig {
    /// Configuration for lazy algorithmic approach with selective LLM enhancement
    pub lazy_algorithmic: LazyAlgorithmicConfig,
    /// Configuration for progressive multi-level LLM usage
    pub progressive: ProgressiveConfig,
    /// Configuration for budget-aware LLM optimization
    pub budget_aware: BudgetAwareConfig,
}

/// Configuration for lazy algorithmic approach with selective LLM enhancement.
///
/// This strategy primarily uses algorithmic methods and only invokes LLMs
/// when necessary to improve quality or handle complex cases.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyAlgorithmicConfig {
    /// Indexing strategy (e.g., "algorithmic_first", "llm_assisted", "hybrid")
    pub indexing_approach: String,
    /// Query processing strategy (e.g., "algorithmic_only", "selective_llm", "adaptive")
    pub query_approach: String,
    /// Cost optimization strategy (e.g., "aggressive", "balanced", "quality_first")
    pub cost_optimization: String,
}

/// Configuration for progressive multi-level LLM usage strategy.
///
/// This strategy uses different levels of LLM involvement based on
/// query complexity, budget, and quality requirements.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ProgressiveConfig {
    /// Level 0: Pure algorithmic processing (no LLM usage)
    pub level_0: String,
    /// Level 1: Minimal LLM usage (entity extraction only)
    pub level_1: String,
    /// Level 2: Moderate LLM usage (entity + relationship extraction)
    pub level_2: String,
    /// Level 3: Heavy LLM usage (full semantic analysis)
    pub level_3: String,
    /// Level 4+: Maximum LLM usage (comprehensive processing)
    pub level_4_plus: String,
}

/// Configuration for budget-aware LLM optimization strategy.
///
/// This strategy dynamically adjusts LLM usage based on budget constraints,
/// query costs, and daily spending limits to ensure cost control.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BudgetAwareConfig {
    /// Daily budget limit in USD for LLM operations
    pub daily_budget_usd: f64,
    /// Maximum number of queries allowed per day
    pub queries_per_day: usize,
    /// Maximum LLM cost allowed per individual query
    pub max_llm_cost_per_query: f64,
    /// Budget management strategy (e.g., "throttle", "degrade", "stop")
    pub strategy: String,
    /// Whether to fall back to pure algorithmic processing when budget is exceeded
    pub fallback_to_algorithmic: bool,
}
700
// Default functions for zero-cost approach
/// Serde default for `ZeroCostApproachConfig::approach`: the pure-algorithmic pipeline.
fn default_zero_cost_approach() -> String {
    String::from("pure_algorithmic")
}
705
706impl Default for ZeroCostApproachConfig {
707    fn default() -> Self {
708        Self {
709            approach: default_zero_cost_approach(),
710            lazy_graphrag: LazyGraphRAGConfig::default(),
711            e2_graphrag: E2GraphRAGConfig::default(),
712            pure_algorithmic: PureAlgorithmicConfig::default(),
713            hybrid_strategy: HybridStrategyConfig::default(),
714        }
715    }
716}
717
718// Default implementations for sub-configs (simplified for now)
719impl Default for ConceptExtractionConfig {
720    fn default() -> Self {
721        Self {
722            min_concept_length: 3,
723            max_concept_words: 5,
724            use_noun_phrases: true,
725            use_capitalization: true,
726            use_title_case: true,
727            use_tf_idf_scoring: true,
728            min_term_frequency: 2,
729            max_concepts_per_chunk: 10,
730            min_concept_score: 0.1,
731            exclude_stopwords: true,
732            custom_stopwords: vec!["the".to_string(), "and".to_string(), "or".to_string()],
733        }
734    }
735}
736impl Default for CoOccurrenceConfig {
737    fn default() -> Self {
738        Self {
739            window_size: 50,
740            min_co_occurrence: 2,
741            jaccard_threshold: 0.2,
742            max_edges_per_node: 25,
743        }
744    }
745}
746impl Default for LazyIndexingConfig {
747    fn default() -> Self {
748        Self {
749            use_bidirectional_index: true,
750            enable_hnsw_index: false,
751            cache_size: 10000,
752        }
753    }
754}
755impl Default for LazyQueryExpansionConfig {
756    fn default() -> Self {
757        Self {
758            enabled: true,
759            max_expansions: 3,
760            expansion_model: "llama3.1:8b".to_string(),
761            expansion_temperature: 0.1,
762            max_tokens_per_expansion: 50,
763        }
764    }
765}
766impl Default for LazyRelevanceScoringConfig {
767    fn default() -> Self {
768        Self {
769            enabled: true,
770            scoring_model: "llama3.1:8b".to_string(),
771            batch_size: 10,
772            temperature: 0.2,
773            max_tokens_per_score: 30,
774        }
775    }
776}
impl Default for PureAlgorithmicConfig {
    /// Enables the pure-algorithmic pipeline, with every sub-stage
    /// (pattern extraction, keyword extraction, relationship discovery,
    /// search ranking) taking its own `Default` via type inference.
    fn default() -> Self {
        Self {
            enabled: true,
            pattern_extraction: Default::default(),
            keyword_extraction: Default::default(),
            relationship_discovery: Default::default(),
            search_ranking: Default::default(),
        }
    }
}
788impl Default for PatternExtractionConfig {
789    fn default() -> Self {
790        Self {
791            capitalized_patterns: vec![r"[A-Z][a-z]+".to_string()],
792            technical_patterns: vec![r"[a-z]+-[a-z]+".to_string()],
793            context_patterns: vec![r"\b(the|this)\s+(\w+)".to_string()],
794        }
795    }
796}
797impl Default for PureKeywordExtractionConfig {
798    fn default() -> Self {
799        Self {
800            algorithm: "tf_idf".to_string(),
801            max_keywords: 20,
802            min_word_length: 4,
803            use_positional_boost: true,
804            use_frequency_filter: true,
805            min_term_frequency: 2,
806            max_term_frequency_ratio: 0.8,
807        }
808    }
809}
810impl Default for RelationshipDiscoveryConfig {
811    fn default() -> Self {
812        Self {
813            window_size: 30,
814            min_co_occurrence: 2,
815            use_mutual_information: true,
816            relationship_types: vec!["co_occurs_with".to_string()],
817            scoring_method: "jaccard_similarity".to_string(),
818            min_similarity_score: 0.1,
819        }
820    }
821}
822impl Default for SearchRankingConfig {
823    fn default() -> Self {
824        Self {
825            vector_search: VectorSearchConfig { enabled: false },
826            keyword_search: KeywordSearchConfig {
827                enabled: true,
828                algorithm: "bm25".to_string(),
829                k1: 1.2,
830                b: 0.75,
831            },
832            graph_traversal: GraphTraversalConfig {
833                enabled: true,
834                algorithm: "pagerank".to_string(),
835                damping_factor: 0.85,
836                max_iterations: 20,
837                personalized: true,
838            },
839            hybrid_fusion: HybridFusionConfig {
840                enabled: true,
841                weights: FusionWeights {
842                    keywords: 0.4,
843                    graph: 0.4,
844                    bm25: 0.2,
845                },
846            },
847        }
848    }
849}
850impl Default for HybridStrategyConfig {
851    fn default() -> Self {
852        Self {
853            lazy_algorithmic: LazyAlgorithmicConfig {
854                indexing_approach: "e2_graphrag".to_string(),
855                query_approach: "lazy_graphrag".to_string(),
856                cost_optimization: "indexing".to_string(),
857            },
858            progressive: ProgressiveConfig {
859                level_0: "pure_algorithmic".to_string(),
860                level_1: "pure_algorithmic".to_string(),
861                level_2: "e2_graphrag".to_string(),
862                level_3: "lazy_graphrag".to_string(),
863                level_4_plus: "lazy_graphrag".to_string(),
864            },
865            budget_aware: BudgetAwareConfig {
866                daily_budget_usd: 1.0,
867                queries_per_day: 1000,
868                max_llm_cost_per_query: 0.002,
869                strategy: "lazy_graphrag".to_string(),
870                fallback_to_algorithmic: true,
871            },
872        }
873    }
874}
875impl Default for KeywordSearchConfig {
876    fn default() -> Self {
877        Self {
878            enabled: true,
879            algorithm: "bm25".to_string(),
880            k1: 1.2,
881            b: 0.75,
882        }
883    }
884}
885impl Default for GraphTraversalConfig {
886    fn default() -> Self {
887        Self {
888            enabled: true,
889            algorithm: "pagerank".to_string(),
890            damping_factor: 0.85,
891            max_iterations: 20,
892            personalized: true,
893        }
894    }
895}
896impl Default for HybridFusionConfig {
897    fn default() -> Self {
898        Self {
899            enabled: true,
900            weights: FusionWeights {
901                keywords: 0.4,
902                graph: 0.4,
903                bm25: 0.2,
904            },
905        }
906    }
907}
908impl Default for FusionWeights {
909    fn default() -> Self {
910        Self {
911            keywords: 0.4,
912            graph: 0.4,
913            bm25: 0.2,
914        }
915    }
916}
917impl Default for LazyAlgorithmicConfig {
918    fn default() -> Self {
919        Self {
920            indexing_approach: "e2_graphrag".to_string(),
921            query_approach: "lazy_graphrag".to_string(),
922            cost_optimization: "indexing".to_string(),
923        }
924    }
925}
926impl Default for ProgressiveConfig {
927    fn default() -> Self {
928        Self {
929            level_0: "pure_algorithmic".to_string(),
930            level_1: "pure_algorithmic".to_string(),
931            level_2: "e2_graphrag".to_string(),
932            level_3: "lazy_graphrag".to_string(),
933            level_4_plus: "lazy_graphrag".to_string(),
934        }
935    }
936}
937impl Default for BudgetAwareConfig {
938    fn default() -> Self {
939        Self {
940            daily_budget_usd: 1.0,
941            queries_per_day: 1000,
942            max_llm_cost_per_query: 0.002,
943            strategy: "lazy_graphrag".to_string(),
944            fallback_to_algorithmic: true,
945        }
946    }
947}
948
/// Configuration for embedding generation
///
/// `model`, `cache_dir`, and `batch_size` carry serde defaults, so they may
/// be omitted from config files; the remaining fields are required.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EmbeddingConfig {
    /// Dimension of the embedding vectors
    pub dimension: usize,

    /// Embedding backend: "hash", "ollama", "huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "onnx", "candle"
    pub backend: String,

    /// Model identifier (provider-specific)
    /// - HuggingFace: "sentence-transformers/all-MiniLM-L6-v2"
    /// - OpenAI: "text-embedding-3-small"
    /// - Voyage: "voyage-3-large"
    /// - Cohere: "embed-english-v3.0"
    /// - Jina: "jina-embeddings-v3"
    /// - Mistral: "mistral-embed"
    /// - Together: "BAAI/bge-large-en-v1.5"
    /// - Ollama: "nomic-embed-text"
    #[serde(default)]
    pub model: Option<String>,

    /// Whether to fallback to hash-based embeddings if primary backend fails
    pub fallback_to_hash: bool,

    /// API endpoint for embeddings (if using external service)
    pub api_endpoint: Option<String>,

    /// API key for external embedding service
    /// Can also be set via environment variables (OPENAI_API_KEY, VOYAGE_API_KEY, etc.)
    pub api_key: Option<String>,

    /// Cache directory for downloaded models (HuggingFace)
    #[serde(default)]
    pub cache_dir: Option<String>,

    /// Batch size for processing multiple texts (defaults to 32)
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
988
/// Serde default for [`EmbeddingConfig::batch_size`].
fn default_batch_size() -> usize { 32 }
992
/// Configuration for graph construction
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphConfig {
    /// Maximum number of connections per node
    pub max_connections: usize,

    /// Similarity threshold for creating edges
    pub similarity_threshold: f32,

    /// Whether to extract relationships between entities
    /// (defaults to `true` when omitted from the config file)
    #[serde(default = "default_true")]
    pub extract_relationships: bool,

    /// Confidence threshold for relationships (defaults to 0.5)
    #[serde(default = "default_relationship_confidence")]
    pub relationship_confidence_threshold: f32,

    /// Graph traversal configuration
    /// (falls back to `TraversalConfigParams::default()` when omitted)
    #[serde(default)]
    pub traversal: TraversalConfigParams,
}
1014
/// Configuration for graph traversal algorithms
///
/// Every field has a serde default, so the whole table may be omitted.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TraversalConfigParams {
    /// Maximum depth for traversal algorithms (BFS, DFS); defaults to 3
    #[serde(default = "default_max_traversal_depth")]
    pub max_depth: usize,

    /// Maximum number of paths to find (for pathfinding algorithms); defaults to 10
    #[serde(default = "default_max_paths")]
    pub max_paths: usize,

    /// Whether to use edge weights in traversal; defaults to true
    #[serde(default = "default_true")]
    pub use_edge_weights: bool,

    /// Minimum relationship strength to consider in traversal; defaults to 0.3
    #[serde(default = "default_min_relationship_strength")]
    pub min_relationship_strength: f32,
}
1034
1035impl Default for TraversalConfigParams {
1036    fn default() -> Self {
1037        Self {
1038            max_depth: default_max_traversal_depth(),
1039            max_paths: default_max_paths(),
1040            use_edge_weights: true,
1041            min_relationship_strength: default_min_relationship_strength(),
1042        }
1043    }
1044}
1045
/// Configuration for text processing
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextConfig {
    /// Maximum chunk size for text processing
    // NOTE(review): units (characters vs. tokens) are decided by the
    // chunker, not here — confirm against the text-processing module.
    pub chunk_size: usize,

    /// Overlap between chunks (same unit as `chunk_size`)
    pub chunk_overlap: usize,

    /// Languages to support for text processing (e.g. "en")
    pub languages: Vec<String>,
}
1058
/// Configuration for entity extraction
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EntityConfig {
    /// Minimum confidence score for entity extraction
    pub min_confidence: f32,

    /// Types of entities to extract (e.g. "PERSON", "ORG", "LOCATION")
    pub entity_types: Vec<String>,

    /// Whether to use LLM-based gleaning for entity extraction
    /// (off when omitted from the config file)
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum number of gleaning rounds for refinement (defaults to 3)
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Enable triple reflection validation (DEG-RAG methodology)
    /// Validates extracted relationships against source text using LLM
    #[serde(default)]
    pub enable_triple_reflection: bool,

    /// Minimum confidence score for relationship validation (defaults to 0.7)
    /// Relationships below this threshold will be filtered out
    #[serde(default = "default_validation_confidence")]
    pub validation_min_confidence: f32,

    /// Enable ATOM atomic fact extraction (Phase 1.3)
    /// Extracts self-contained facts as 5-tuples for better granularity
    #[serde(default)]
    pub use_atomic_facts: bool,

    /// Maximum tokens per atomic fact (defaults to 400)
    /// Facts longer than this will be rejected
    #[serde(default = "default_max_fact_tokens")]
    pub max_fact_tokens: usize,
}
1096
/// Configuration for advanced GraphRAG features (Phases 2-3)
///
/// Every field carries `#[serde(default)]`, so the whole section — or any
/// individual feature group — may be omitted from config files.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AdvancedFeaturesConfig {
    /// Phase 2.1: Symbolic Anchoring (CatRAG)
    /// Automatically applied for conceptual queries - no config needed
    #[serde(default)]
    pub symbolic_anchoring: SymbolicAnchoringConfig,

    /// Phase 2.2: Dynamic Edge Weighting
    /// Query-aware relationship weight adjustment
    #[serde(default)]
    pub dynamic_weighting: DynamicWeightingConfig,

    /// Phase 2.3: Causal Chain Analysis
    /// Multi-step causal reasoning
    #[serde(default)]
    pub causal_analysis: CausalAnalysisConfig,

    /// Phase 3.1: Hierarchical Relationship Clustering
    /// Multi-level relationship organization
    #[serde(default)]
    pub hierarchical_clustering: HierarchicalClusteringConfig,

    /// Phase 3.2: Graph Weight Optimization (DW-GRPO)
    /// Heuristic optimization of relationship weights
    #[serde(default)]
    pub weight_optimization: WeightOptimizationConfig,
}
1125
/// Configuration for Symbolic Anchoring (Phase 2.1)
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SymbolicAnchoringConfig {
    /// Minimum relevance score to keep an anchor (0.0-1.0); defaults to 0.3
    #[serde(default = "default_anchor_min_relevance")]
    pub min_relevance: f32,

    /// Maximum number of anchors to extract per query; defaults to 5
    #[serde(default = "default_max_anchors")]
    pub max_anchors: usize,

    /// Maximum entities per anchor; defaults to 10
    #[serde(default = "default_max_entities_per_anchor")]
    pub max_entities_per_anchor: usize,
}
1141
/// Configuration for Dynamic Edge Weighting (Phase 2.2)
///
/// All boosts default to enabled when omitted from the config file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct DynamicWeightingConfig {
    /// Enable semantic boost using embeddings
    #[serde(default = "default_true")]
    pub enable_semantic_boost: bool,

    /// Enable temporal boost for recent relationships
    #[serde(default = "default_true")]
    pub enable_temporal_boost: bool,

    /// Enable conceptual boost for matching concepts
    #[serde(default = "default_true")]
    pub enable_concept_boost: bool,

    /// Enable causal boost for strong causal relationships
    #[serde(default = "default_true")]
    pub enable_causal_boost: bool,
}
1161
/// Configuration for Causal Chain Analysis (Phase 2.3)
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CausalAnalysisConfig {
    /// Minimum confidence for causal chains (0.0-1.0); defaults to 0.3
    #[serde(default = "default_causal_min_confidence")]
    pub min_confidence: f32,

    /// Minimum causal strength to consider (0.0-1.0); defaults to 0.5
    #[serde(default = "default_causal_min_strength")]
    pub min_causal_strength: f32,

    /// Maximum chain depth to search; defaults to 5
    #[serde(default = "default_max_chain_depth")]
    pub max_chain_depth: usize,

    /// Require temporal consistency in chains; defaults to true
    #[serde(default = "default_true")]
    pub require_temporal_consistency: bool,
}
1181
/// Configuration for Hierarchical Relationship Clustering (Phase 3.1)
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HierarchicalClusteringConfig {
    /// Number of hierarchy levels (2-5); defaults to 3
    #[serde(default = "default_num_levels")]
    pub num_levels: usize,

    /// Resolution parameters for each level (higher = more clusters)
    /// Length should match num_levels; defaults to [1.0, 0.5, 0.2]
    #[serde(default = "default_resolutions")]
    pub resolutions: Vec<f32>,

    /// Minimum relationships per cluster; defaults to 2
    #[serde(default = "default_min_cluster_size")]
    pub min_cluster_size: usize,

    /// Generate LLM summaries for clusters (requires Ollama); defaults to true
    #[serde(default = "default_true")]
    pub generate_summaries: bool,
}
1202
/// Configuration for Graph Weight Optimization (Phase 3.2)
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WeightOptimizationConfig {
    /// Learning rate for weight adjustments (0.01-0.5); defaults to 0.1
    #[serde(default = "default_learning_rate")]
    pub learning_rate: f32,

    /// Maximum optimization iterations; defaults to 20
    #[serde(default = "default_max_iterations")]
    pub max_iterations: usize,

    /// Window size for slope calculation; defaults to 3
    #[serde(default = "default_slope_window")]
    pub slope_window: usize,

    /// Minimum slope to avoid stagnation; defaults to 0.01
    #[serde(default = "default_stagnation_threshold")]
    pub stagnation_threshold: f32,

    /// Use LLM for quality evaluation; defaults to true
    #[serde(default = "default_true")]
    pub use_llm_eval: bool,

    /// Objective weights (relevance, faithfulness, conciseness)
    #[serde(default)]
    pub objective_weights: ObjectiveWeightsConfig,
}
1230
/// Configuration for optimization objective weights
///
/// The three defaults (0.4 / 0.4 / 0.2) sum to 1.0.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ObjectiveWeightsConfig {
    /// Weight for relevance objective (0.0-1.0); defaults to 0.4
    #[serde(default = "default_relevance_weight")]
    pub relevance: f32,

    /// Weight for faithfulness objective (0.0-1.0); defaults to 0.4
    #[serde(default = "default_faithfulness_weight")]
    pub faithfulness: f32,

    /// Weight for conciseness objective (0.0-1.0); defaults to 0.2
    #[serde(default = "default_conciseness_weight")]
    pub conciseness: f32,
}
1246
/// Configuration for retrieval operations
///
/// No serde defaults here; both fields must be present when deserializing.
/// `Config::default()` fills in 10 and "cosine".
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RetrievalConfig {
    /// Number of top results to return
    pub top_k: usize,

    /// Search algorithm to use
    pub search_algorithm: String,
}
1256
/// Configuration for parallel processing
///
/// No serde defaults on these fields; all of them must be present when
/// deserializing this section from a config file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ParallelConfig {
    /// Number of threads to use for parallel processing (0 = auto-detect)
    pub num_threads: usize,

    /// Enable parallel processing
    pub enabled: bool,

    /// Minimum batch size for parallel processing
    pub min_batch_size: usize,

    /// Chunk size for parallel text processing
    pub chunk_batch_size: usize,

    /// Parallel processing for embeddings
    pub parallel_embeddings: bool,

    /// Parallel graph construction
    pub parallel_graph_ops: bool,

    /// Parallel vector operations
    pub parallel_vector_ops: bool,
}
1281
// Default value functions.
//
// Referenced both from `#[serde(default = "...")]` attributes on the config
// structs and from the hand-written `Default` impls, so each default value
// is defined in exactly one place.

fn default_embedding_dim() -> usize { 384 }
fn default_embedding_backend() -> String { String::from("hash") }
fn default_max_connections() -> usize { 10 }
fn default_similarity_threshold() -> f32 { 0.8 }
fn default_chunk_size() -> usize { 1000 }
fn default_chunk_overlap() -> usize { 200 }
fn default_languages() -> Vec<String> { vec![String::from("en")] }
fn default_min_confidence() -> f32 { 0.7 }
fn default_entity_types() -> Vec<String> {
    ["PERSON", "ORG", "LOCATION"]
        .iter()
        .map(|s| s.to_string())
        .collect()
}
fn default_top_k() -> usize { 10 }
fn default_search_algorithm() -> String { String::from("cosine") }
/// 0 means auto-detect the thread count.
fn default_num_threads() -> usize { 0 }
fn default_min_batch_size() -> usize { 10 }
fn default_chunk_batch_size() -> usize { 100 }
/// Shared helper for `bool` fields whose serde default is `true`.
fn default_true() -> bool { true }
fn default_relationship_confidence() -> f32 { 0.5 }
fn default_max_gleaning_rounds() -> usize { 3 }

fn default_validation_confidence() -> f32 { 0.7 }
1342
// Advanced features defaults (Phases 2-3)

// Phase 2.1: Symbolic Anchoring
fn default_anchor_min_relevance() -> f32 { 0.3 }
fn default_max_anchors() -> usize { 5 }
fn default_max_entities_per_anchor() -> usize { 10 }

// Phase 2.3: Causal Analysis
fn default_causal_min_confidence() -> f32 { 0.3 }
fn default_causal_min_strength() -> f32 { 0.5 }
fn default_max_chain_depth() -> usize { 5 }

// Phase 3.1: Hierarchical Clustering
fn default_num_levels() -> usize { 3 }
/// One resolution parameter per hierarchy level; length matches
/// `default_num_levels()`.
fn default_resolutions() -> Vec<f32> { vec![1.0, 0.5, 0.2] }
fn default_min_cluster_size() -> usize { 2 }

// Phase 3.2: Weight Optimization
fn default_learning_rate() -> f32 { 0.1 }
fn default_max_iterations() -> usize { 20 }
fn default_slope_window() -> usize { 3 }
fn default_stagnation_threshold() -> f32 { 0.01 }
/// The three objective weights (0.4 + 0.4 + 0.2) sum to 1.0.
fn default_relevance_weight() -> f32 { 0.4 }
fn default_faithfulness_weight() -> f32 { 0.4 }
fn default_conciseness_weight() -> f32 { 0.2 }

/// Token budget for a single atomic fact (ATOM, Phase 1.3).
fn default_max_fact_tokens() -> usize { 400 }
1416
/// Default pipeline approach identifier.
fn default_approach() -> String { String::from("semantic") }
fn default_max_traversal_depth() -> usize { 3 }
fn default_max_paths() -> usize { 10 }
fn default_min_relationship_strength() -> f32 { 0.3 }
/// Auto-save interval in seconds (5 minutes).
fn default_auto_save_interval() -> u64 { 300 }
/// Number of saved versions retained by default.
fn default_max_versions() -> usize { 5 }
1435
1436impl Default for Config {
1437    fn default() -> Self {
1438        Self {
1439            output_dir: "./output".to_string(),
1440            chunk_size: default_chunk_size(),
1441            chunk_overlap: default_chunk_overlap(),
1442            max_entities_per_chunk: Some(10),
1443            top_k_results: Some(default_top_k()),
1444            similarity_threshold: Some(default_similarity_threshold()),
1445            approach: default_approach(),
1446            embeddings: EmbeddingConfig {
1447                dimension: default_embedding_dim(),
1448                backend: default_embedding_backend(),
1449                model: Some("sentence-transformers/all-MiniLM-L6-v2".to_string()),
1450                fallback_to_hash: true,
1451                api_endpoint: None,
1452                api_key: None,
1453                cache_dir: None,
1454                batch_size: default_batch_size(),
1455            },
1456            graph: GraphConfig {
1457                max_connections: default_max_connections(),
1458                similarity_threshold: default_similarity_threshold(),
1459                extract_relationships: default_true(),
1460                relationship_confidence_threshold: default_relationship_confidence(),
1461                traversal: TraversalConfigParams::default(),
1462            },
1463            text: TextConfig {
1464                chunk_size: default_chunk_size(),
1465                chunk_overlap: default_chunk_overlap(),
1466                languages: default_languages(),
1467            },
1468            entities: EntityConfig {
1469                min_confidence: default_min_confidence(),
1470                entity_types: default_entity_types(),
1471                use_gleaning: false,
1472                max_gleaning_rounds: default_max_gleaning_rounds(),
1473                enable_triple_reflection: false,
1474                validation_min_confidence: default_validation_confidence(),
1475                use_atomic_facts: false,
1476                max_fact_tokens: default_max_fact_tokens(),
1477            },
1478            retrieval: RetrievalConfig {
1479                top_k: default_top_k(),
1480                search_algorithm: default_search_algorithm(),
1481            },
1482            parallel: ParallelConfig {
1483                num_threads: default_num_threads(),
1484                enabled: true,
1485                min_batch_size: default_min_batch_size(),
1486                chunk_batch_size: default_chunk_batch_size(),
1487                parallel_embeddings: true,
1488                parallel_graph_ops: true,
1489                parallel_vector_ops: true,
1490            },
1491            ollama: crate::ollama::OllamaConfig::default(),
1492            gliner: GlinerConfig::default(),
1493            enhancements: enhancements::EnhancementsConfig::default(),
1494            auto_save: AutoSaveConfig {
1495                enabled: false,
1496                base_dir: None,
1497                interval_seconds: default_auto_save_interval(),
1498                workspace_name: None,
1499                max_versions: default_max_versions(),
1500            },
1501            summarization: crate::summarization::HierarchicalConfig::default(),
1502            zero_cost_approach: ZeroCostApproachConfig::default(),
1503            advanced_features: AdvancedFeaturesConfig::default(),
1504            suppress_progress_bars: false,
1505        }
1506    }
1507}
1508
1509impl Default for AutoSaveConfig {
1510    fn default() -> Self {
1511        Self {
1512            enabled: false,
1513            base_dir: None,
1514            interval_seconds: default_auto_save_interval(),
1515            workspace_name: None,
1516            max_versions: default_max_versions(),
1517        }
1518    }
1519}
1520
1521impl Default for AdvancedFeaturesConfig {
1522    fn default() -> Self {
1523        Self {
1524            symbolic_anchoring: SymbolicAnchoringConfig::default(),
1525            dynamic_weighting: DynamicWeightingConfig::default(),
1526            causal_analysis: CausalAnalysisConfig::default(),
1527            hierarchical_clustering: HierarchicalClusteringConfig::default(),
1528            weight_optimization: WeightOptimizationConfig::default(),
1529        }
1530    }
1531}
1532
1533impl Default for SymbolicAnchoringConfig {
1534    fn default() -> Self {
1535        Self {
1536            min_relevance: default_anchor_min_relevance(),
1537            max_anchors: default_max_anchors(),
1538            max_entities_per_anchor: default_max_entities_per_anchor(),
1539        }
1540    }
1541}
1542
1543impl Default for DynamicWeightingConfig {
1544    fn default() -> Self {
1545        Self {
1546            enable_semantic_boost: default_true(),
1547            enable_temporal_boost: default_true(),
1548            enable_concept_boost: default_true(),
1549            enable_causal_boost: default_true(),
1550        }
1551    }
1552}
1553
1554impl Default for CausalAnalysisConfig {
1555    fn default() -> Self {
1556        Self {
1557            min_confidence: default_causal_min_confidence(),
1558            min_causal_strength: default_causal_min_strength(),
1559            max_chain_depth: default_max_chain_depth(),
1560            require_temporal_consistency: default_true(),
1561        }
1562    }
1563}
1564
1565impl Default for HierarchicalClusteringConfig {
1566    fn default() -> Self {
1567        Self {
1568            num_levels: default_num_levels(),
1569            resolutions: default_resolutions(),
1570            min_cluster_size: default_min_cluster_size(),
1571            generate_summaries: default_true(),
1572        }
1573    }
1574}
1575
1576impl Default for WeightOptimizationConfig {
1577    fn default() -> Self {
1578        Self {
1579            learning_rate: default_learning_rate(),
1580            max_iterations: default_max_iterations(),
1581            slope_window: default_slope_window(),
1582            stagnation_threshold: default_stagnation_threshold(),
1583            use_llm_eval: default_true(),
1584            objective_weights: ObjectiveWeightsConfig::default(),
1585        }
1586    }
1587}
1588
1589impl Default for ObjectiveWeightsConfig {
1590    fn default() -> Self {
1591        Self {
1592            relevance: default_relevance_weight(),
1593            faithfulness: default_faithfulness_weight(),
1594            conciseness: default_conciseness_weight(),
1595        }
1596    }
1597}
1598
1599impl Config {
1600    /// Load configuration with hierarchical merging (requires `hierarchical-config` feature)
1601    ///
1602    /// Configuration sources are merged in order of priority (lowest to highest):
1603    /// 1. Built-in defaults
1604    /// 2. User config: `~/.graphrag/config.toml`
1605    /// 3. Project config: `./graphrag.toml`
1606    /// 4. Environment variables: `GRAPHRAG_*` (e.g., `GRAPHRAG_OLLAMA_HOST`)
1607    ///
1608    /// # Example
1609    /// ```rust,no_run
1610    /// use graphrag_core::Config;
1611    ///
1612    /// // Auto-loads from all sources
1613    /// let config = Config::load()?;
1614    /// # Ok::<(), graphrag_core::GraphRAGError>(())
1615    /// ```
1616    #[cfg(feature = "hierarchical-config")]
1617    pub fn load() -> Result<Self> {
1618        use figment::{
1619            providers::{Env, Format, Serialized, Toml},
1620            Figment,
1621        };
1622
1623        // Build the configuration chain
1624        let mut figment = Figment::new()
1625            // 1. Start with defaults
1626            .merge(Serialized::defaults(Config::default()));
1627
1628        // 2. User-level config (~/.graphrag/config.toml)
1629        if let Some(home) = dirs::home_dir() {
1630            let user_config = home.join(".graphrag").join("config.toml");
1631            if user_config.exists() {
1632                figment = figment.merge(Toml::file(user_config));
1633            }
1634        }
1635
1636        // 3. Project-level config (./graphrag.toml)
1637        let project_config = std::path::Path::new("graphrag.toml");
1638        if project_config.exists() {
1639            figment = figment.merge(Toml::file(project_config));
1640        }
1641
1642        // 4. Environment variables (GRAPHRAG_*)
1643        // Maps GRAPHRAG_OLLAMA_HOST -> ollama.host
1644        figment = figment.merge(Env::prefixed("GRAPHRAG_").split("_"));
1645
1646        figment
1647            .extract()
1648            .map_err(|e| crate::core::GraphRAGError::Config {
1649                message: format!("Failed to load hierarchical configuration: {}", e),
1650            })
1651    }
1652
1653    /// Load configuration with hierarchical merging (stub for when feature is disabled)
1654    ///
1655    /// When the `hierarchical-config` feature is not enabled, this falls back to `Config::default()`.
1656    #[cfg(not(feature = "hierarchical-config"))]
1657    pub fn load() -> Result<Self> {
1658        Ok(Config::default())
1659    }
1660
1661    /// Load configuration from a TOML file with environment variable overrides
1662    ///
1663    /// This is the preferred method for loading configuration from a specific file
1664    /// while still allowing environment variable overrides.
1665    ///
1666    /// # Example
1667    /// ```rust,no_run
1668    /// use graphrag_core::Config;
1669    ///
1670    /// let config = Config::from_toml_file("./my-config.toml")?;
1671    /// # Ok::<(), graphrag_core::GraphRAGError>(())
1672    /// ```
1673    pub fn from_toml_file<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
1674        let content = fs::read_to_string(path.as_ref())?;
1675        let config: Config =
1676            toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1677                message: format!("Failed to parse TOML config: {}", e),
1678            })?;
1679        Ok(config)
1680    }
1681
    /// Load configuration from a JSON file.
    ///
    /// Every field in the JSON document is optional: missing or
    /// wrongly-typed values fall back to the crate's `default_*` helpers
    /// or the literal defaults used below, so a partial config file is
    /// accepted. Only reading the file and top-level JSON parsing can
    /// fail.
    ///
    /// # Errors
    /// Returns an error when the file cannot be read or its content is
    /// not valid JSON.
    pub fn from_file(path: &str) -> Result<Self> {
        let content = fs::read_to_string(path)?;
        let parsed = json::parse(&content)?;

        let config = Config {
            // --- Top-level scalar options ---
            output_dir: parsed["output_dir"]
                .as_str()
                .unwrap_or("./output")
                .to_string(),
            suppress_progress_bars: parsed["suppress_progress_bars"].as_bool().unwrap_or(false),
            chunk_size: parsed["chunk_size"]
                .as_usize()
                .unwrap_or(default_chunk_size()),
            chunk_overlap: parsed["chunk_overlap"]
                .as_usize()
                .unwrap_or(default_chunk_overlap()),
            // These stay `None` when absent so downstream code can tell
            // "unset" apart from an explicit value.
            max_entities_per_chunk: parsed["max_entities_per_chunk"].as_usize(),
            top_k_results: parsed["top_k_results"].as_usize(),
            similarity_threshold: parsed["similarity_threshold"].as_f32(),
            approach: parsed["approach"]
                .as_str()
                .unwrap_or(&default_approach())
                .to_string(),
            // --- Embedding backend settings ---
            embeddings: EmbeddingConfig {
                dimension: parsed["embeddings"]["dimension"]
                    .as_usize()
                    .unwrap_or(default_embedding_dim()),
                backend: parsed["embeddings"]["backend"]
                    .as_str()
                    .unwrap_or(&default_embedding_backend())
                    .to_string(),
                model: parsed["embeddings"]["model"]
                    .as_str()
                    .map(|s| s.to_string()),
                fallback_to_hash: parsed["embeddings"]["fallback_to_hash"]
                    .as_bool()
                    .unwrap_or(true),
                api_endpoint: parsed["embeddings"]["api_endpoint"]
                    .as_str()
                    .map(|s| s.to_string()),
                api_key: parsed["embeddings"]["api_key"]
                    .as_str()
                    .map(|s| s.to_string()),
                cache_dir: parsed["embeddings"]["cache_dir"]
                    .as_str()
                    .map(|s| s.to_string()),
                batch_size: parsed["embeddings"]["batch_size"]
                    .as_usize()
                    .unwrap_or(default_batch_size()),
            },
            // --- Knowledge-graph construction and traversal ---
            graph: GraphConfig {
                max_connections: parsed["graph"]["max_connections"]
                    .as_usize()
                    .unwrap_or(default_max_connections()),
                similarity_threshold: parsed["graph"]["similarity_threshold"]
                    .as_f32()
                    .unwrap_or(default_similarity_threshold()),
                extract_relationships: parsed["graph"]["extract_relationships"]
                    .as_bool()
                    .unwrap_or(default_true()),
                relationship_confidence_threshold: parsed["graph"]
                    ["relationship_confidence_threshold"]
                    .as_f32()
                    .unwrap_or(default_relationship_confidence()),
                traversal: TraversalConfigParams {
                    max_depth: parsed["graph"]["traversal"]["max_depth"]
                        .as_usize()
                        .unwrap_or(default_max_traversal_depth()),
                    max_paths: parsed["graph"]["traversal"]["max_paths"]
                        .as_usize()
                        .unwrap_or(default_max_paths()),
                    use_edge_weights: parsed["graph"]["traversal"]["use_edge_weights"]
                        .as_bool()
                        .unwrap_or(default_true()),
                    min_relationship_strength: parsed["graph"]["traversal"]
                        ["min_relationship_strength"]
                        .as_f32()
                        .unwrap_or(default_min_relationship_strength()),
                },
            },
            // --- Text chunking ---
            text: TextConfig {
                chunk_size: parsed["text"]["chunk_size"]
                    .as_usize()
                    .unwrap_or(default_chunk_size()),
                chunk_overlap: parsed["text"]["chunk_overlap"]
                    .as_usize()
                    .unwrap_or(default_chunk_overlap()),
                // A non-array (or missing) value falls back to the default
                // language list; invalid array entries degrade to "en".
                languages: if parsed["text"]["languages"].is_array() {
                    parsed["text"]["languages"]
                        .members()
                        .map(|v| v.as_str().unwrap_or("en").to_string())
                        .collect()
                } else {
                    default_languages()
                },
            },
            // --- Entity extraction ---
            entities: EntityConfig {
                min_confidence: parsed["entities"]["min_confidence"]
                    .as_f32()
                    .unwrap_or(default_min_confidence()),
                // Invalid array entries degrade to "PERSON" rather than
                // dropping the element.
                entity_types: if parsed["entities"]["entity_types"].is_array() {
                    parsed["entities"]["entity_types"]
                        .members()
                        .map(|v| v.as_str().unwrap_or("PERSON").to_string())
                        .collect()
                } else {
                    default_entity_types()
                },
                use_gleaning: parsed["entities"]["use_gleaning"]
                    .as_bool()
                    .unwrap_or(false),
                max_gleaning_rounds: parsed["entities"]["max_gleaning_rounds"]
                    .as_usize()
                    .unwrap_or(default_max_gleaning_rounds()),
                enable_triple_reflection: parsed["entities"]["enable_triple_reflection"]
                    .as_bool()
                    .unwrap_or(false),
                validation_min_confidence: parsed["entities"]["validation_min_confidence"]
                    .as_f32()
                    .unwrap_or(default_validation_confidence()),
                use_atomic_facts: parsed["entities"]["use_atomic_facts"]
                    .as_bool()
                    .unwrap_or(false),
                max_fact_tokens: parsed["entities"]["max_fact_tokens"]
                    .as_usize()
                    .unwrap_or(default_max_fact_tokens()),
            },
            // --- Retrieval ---
            retrieval: RetrievalConfig {
                top_k: parsed["retrieval"]["top_k"]
                    .as_usize()
                    .unwrap_or(default_top_k()),
                search_algorithm: parsed["retrieval"]["search_algorithm"]
                    .as_str()
                    .unwrap_or(&default_search_algorithm())
                    .to_string(),
            },
            // --- Parallelism toggles ---
            parallel: ParallelConfig {
                num_threads: parsed["parallel"]["num_threads"]
                    .as_usize()
                    .unwrap_or(default_num_threads()),
                enabled: parsed["parallel"]["enabled"].as_bool().unwrap_or(true),
                min_batch_size: parsed["parallel"]["min_batch_size"]
                    .as_usize()
                    .unwrap_or(default_min_batch_size()),
                chunk_batch_size: parsed["parallel"]["chunk_batch_size"]
                    .as_usize()
                    .unwrap_or(default_chunk_batch_size()),
                parallel_embeddings: parsed["parallel"]["parallel_embeddings"]
                    .as_bool()
                    .unwrap_or(true),
                parallel_graph_ops: parsed["parallel"]["parallel_graph_ops"]
                    .as_bool()
                    .unwrap_or(true),
                parallel_vector_ops: parsed["parallel"]["parallel_vector_ops"]
                    .as_bool()
                    .unwrap_or(true),
            },
            // --- Ollama server integration (disabled unless enabled here) ---
            ollama: crate::ollama::OllamaConfig {
                enabled: parsed["ollama"]["enabled"].as_bool().unwrap_or(false),
                host: parsed["ollama"]["host"]
                    .as_str()
                    .unwrap_or("http://localhost")
                    .to_string(),
                port: parsed["ollama"]["port"].as_u16().unwrap_or(11434),
                embedding_model: parsed["ollama"]["embedding_model"]
                    .as_str()
                    .unwrap_or("nomic-embed-text")
                    .to_string(),
                chat_model: parsed["ollama"]["chat_model"]
                    .as_str()
                    .unwrap_or("llama3.2:3b")
                    .to_string(),
                timeout_seconds: parsed["ollama"]["timeout_seconds"].as_u64().unwrap_or(30),
                max_retries: parsed["ollama"]["max_retries"].as_u32().unwrap_or(3),
                fallback_to_hash: parsed["ollama"]["fallback_to_hash"]
                    .as_bool()
                    .unwrap_or(true),
                max_tokens: parsed["ollama"]["max_tokens"].as_u32(),
                temperature: parsed["ollama"]["temperature"].as_f32(),
                enable_caching: parsed["ollama"]["enable_caching"].as_bool().unwrap_or(true),
                keep_alive: parsed["ollama"]["keep_alive"]
                    .as_str()
                    .map(|s| s.to_string()),
                num_ctx: parsed["ollama"]["num_ctx"].as_u32(),
            },
            // --- GLiNER model settings (disabled unless enabled here) ---
            gliner: GlinerConfig {
                enabled: parsed["gliner"]["enabled"].as_bool().unwrap_or(false),
                model_path: parsed["gliner"]["model_path"]
                    .as_str()
                    .unwrap_or("")
                    .to_string(),
                tokenizer_path: parsed["gliner"]["tokenizer_path"]
                    .as_str()
                    .unwrap_or("")
                    .to_string(),
                mode: parsed["gliner"]["mode"]
                    .as_str()
                    .unwrap_or("span")
                    .to_string(),
                // Unlike `entity_types` above, non-string entries are
                // silently dropped here (filter_map), not defaulted.
                entity_labels: if parsed["gliner"]["entity_labels"].is_array() {
                    parsed["gliner"]["entity_labels"]
                        .members()
                        .filter_map(|v| v.as_str().map(|s| s.to_string()))
                        .collect()
                } else {
                    vec!["person".into(), "organization".into(), "location".into()]
                },
                relation_labels: if parsed["gliner"]["relation_labels"].is_array() {
                    parsed["gliner"]["relation_labels"]
                        .members()
                        .filter_map(|v| v.as_str().map(|s| s.to_string()))
                        .collect()
                } else {
                    vec!["related to".into(), "part of".into()]
                },
                entity_threshold: parsed["gliner"]["entity_threshold"].as_f32().unwrap_or(0.4),
                relation_threshold: parsed["gliner"]["relation_threshold"]
                    .as_f32()
                    .unwrap_or(0.5),
                use_gpu: parsed["gliner"]["use_gpu"].as_bool().unwrap_or(false),
            },
            // --- Optional enhancement subsystems ---
            // Several sub-sections below exist only when the matching
            // Cargo feature is compiled in; their JSON keys are ignored
            // in builds without the feature.
            enhancements: enhancements::EnhancementsConfig {
                enabled: parsed["enhancements"]["enabled"].as_bool().unwrap_or(true),
                query_analysis: enhancements::QueryAnalysisConfig {
                    enabled: parsed["enhancements"]["query_analysis"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    min_confidence: parsed["enhancements"]["query_analysis"]["min_confidence"]
                        .as_f32()
                        .unwrap_or(0.6),
                    enable_strategy_suggestion: parsed["enhancements"]["query_analysis"]
                        ["enable_strategy_suggestion"]
                        .as_bool()
                        .unwrap_or(true),
                    enable_keyword_analysis: parsed["enhancements"]["query_analysis"]
                        ["enable_keyword_analysis"]
                        .as_bool()
                        .unwrap_or(true),
                    enable_complexity_scoring: parsed["enhancements"]["query_analysis"]
                        ["enable_complexity_scoring"]
                        .as_bool()
                        .unwrap_or(true),
                },
                adaptive_retrieval: enhancements::AdaptiveRetrievalConfig {
                    enabled: parsed["enhancements"]["adaptive_retrieval"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    use_query_analysis: parsed["enhancements"]["adaptive_retrieval"]
                        ["use_query_analysis"]
                        .as_bool()
                        .unwrap_or(true),
                    enable_cross_strategy_fusion: parsed["enhancements"]["adaptive_retrieval"]
                        ["enable_cross_strategy_fusion"]
                        .as_bool()
                        .unwrap_or(true),
                    diversity_threshold: parsed["enhancements"]["adaptive_retrieval"]
                        ["diversity_threshold"]
                        .as_f32()
                        .unwrap_or(0.8),
                    enable_diversity_selection: parsed["enhancements"]["adaptive_retrieval"]
                        ["enable_diversity_selection"]
                        .as_bool()
                        .unwrap_or(true),
                    enable_confidence_weighting: parsed["enhancements"]["adaptive_retrieval"]
                        ["enable_confidence_weighting"]
                        .as_bool()
                        .unwrap_or(true),
                },
                // Benchmarking defaults to off; it is opt-in per run.
                performance_benchmarking: enhancements::BenchmarkingConfig {
                    enabled: parsed["enhancements"]["performance_benchmarking"]["enabled"]
                        .as_bool()
                        .unwrap_or(false),
                    auto_recommendations: parsed["enhancements"]["performance_benchmarking"]
                        ["auto_recommendations"]
                        .as_bool()
                        .unwrap_or(true),
                    comprehensive_testing: parsed["enhancements"]["performance_benchmarking"]
                        ["comprehensive_testing"]
                        .as_bool()
                        .unwrap_or(false),
                    iterations: parsed["enhancements"]["performance_benchmarking"]["iterations"]
                        .as_usize()
                        .unwrap_or(3),
                    include_parallel: parsed["enhancements"]["performance_benchmarking"]
                        ["include_parallel"]
                        .as_bool()
                        .unwrap_or(true),
                    enable_memory_profiling: parsed["enhancements"]["performance_benchmarking"]
                        ["enable_memory_profiling"]
                        .as_bool()
                        .unwrap_or(false),
                },
                enhanced_function_registry: enhancements::FunctionRegistryConfig {
                    enabled: parsed["enhancements"]["enhanced_function_registry"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    categorization: parsed["enhancements"]["enhanced_function_registry"]
                        ["categorization"]
                        .as_bool()
                        .unwrap_or(true),
                    usage_statistics: parsed["enhancements"]["enhanced_function_registry"]
                        ["usage_statistics"]
                        .as_bool()
                        .unwrap_or(true),
                    dynamic_registration: parsed["enhancements"]["enhanced_function_registry"]
                        ["dynamic_registration"]
                        .as_bool()
                        .unwrap_or(true),
                    performance_monitoring: parsed["enhancements"]["enhanced_function_registry"]
                        ["performance_monitoring"]
                        .as_bool()
                        .unwrap_or(false),
                    recommendation_system: parsed["enhancements"]["enhanced_function_registry"]
                        ["recommendation_system"]
                        .as_bool()
                        .unwrap_or(true),
                },
                // Compiled in only with the "lightrag" feature.
                #[cfg(feature = "lightrag")]
                lightrag: enhancements::LightRAGConfig {
                    enabled: parsed["enhancements"]["lightrag"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    max_keywords: parsed["enhancements"]["lightrag"]["max_keywords"]
                        .as_usize()
                        .unwrap_or(20),
                    high_level_weight: parsed["enhancements"]["lightrag"]["high_level_weight"]
                        .as_f32()
                        .unwrap_or(0.6),
                    low_level_weight: parsed["enhancements"]["lightrag"]["low_level_weight"]
                        .as_f32()
                        .unwrap_or(0.4),
                    merge_strategy: parsed["enhancements"]["lightrag"]["merge_strategy"]
                        .as_str()
                        .unwrap_or("weighted")
                        .to_string(),
                    language: parsed["enhancements"]["lightrag"]["language"]
                        .as_str()
                        .unwrap_or("English")
                        .to_string(),
                    enable_cache: parsed["enhancements"]["lightrag"]["enable_cache"]
                        .as_bool()
                        .unwrap_or(true),
                },
                // Compiled in only with the "leiden" feature.
                #[cfg(feature = "leiden")]
                leiden: enhancements::LeidenCommunitiesConfig {
                    enabled: parsed["enhancements"]["leiden"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    max_cluster_size: parsed["enhancements"]["leiden"]["max_cluster_size"]
                        .as_usize()
                        .unwrap_or(10),
                    use_lcc: parsed["enhancements"]["leiden"]["use_lcc"]
                        .as_bool()
                        .unwrap_or(true),
                    // `None` seed means non-deterministic clustering.
                    seed: parsed["enhancements"]["leiden"]["seed"].as_u64(),
                    resolution: parsed["enhancements"]["leiden"]["resolution"]
                        .as_f32()
                        .unwrap_or(1.0),
                    max_levels: parsed["enhancements"]["leiden"]["max_levels"]
                        .as_usize()
                        .unwrap_or(5),
                    min_improvement: parsed["enhancements"]["leiden"]["min_improvement"]
                        .as_f32()
                        .unwrap_or(0.001),
                    enable_hierarchical: parsed["enhancements"]["leiden"]["enable_hierarchical"]
                        .as_bool()
                        .unwrap_or(true),
                    generate_summaries: parsed["enhancements"]["leiden"]["generate_summaries"]
                        .as_bool()
                        .unwrap_or(true),
                    max_summary_length: parsed["enhancements"]["leiden"]["max_summary_length"]
                        .as_usize()
                        .unwrap_or(5),
                    use_extractive_summary: parsed["enhancements"]["leiden"]
                        ["use_extractive_summary"]
                        .as_bool()
                        .unwrap_or(true),
                    adaptive_routing: enhancements::AdaptiveRoutingConfig {
                        enabled: parsed["enhancements"]["leiden"]["adaptive_routing"]["enabled"]
                            .as_bool()
                            .unwrap_or(true),
                        default_level: parsed["enhancements"]["leiden"]["adaptive_routing"]
                            ["default_level"]
                            .as_usize()
                            .unwrap_or(1),
                        keyword_weight: parsed["enhancements"]["leiden"]["adaptive_routing"]
                            ["keyword_weight"]
                            .as_f32()
                            .unwrap_or(0.5),
                        length_weight: parsed["enhancements"]["leiden"]["adaptive_routing"]
                            ["length_weight"]
                            .as_f32()
                            .unwrap_or(0.3),
                        entity_weight: parsed["enhancements"]["leiden"]["adaptive_routing"]
                            ["entity_weight"]
                            .as_f32()
                            .unwrap_or(0.2),
                    },
                },
                // Compiled in only with the "cross-encoder" feature.
                #[cfg(feature = "cross-encoder")]
                cross_encoder: enhancements::CrossEncoderConfig {
                    enabled: parsed["enhancements"]["cross_encoder"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    model_name: parsed["enhancements"]["cross_encoder"]["model_name"]
                        .as_str()
                        .unwrap_or("cross-encoder/ms-marco-MiniLM-L-6-v2")
                        .to_string(),
                    max_length: parsed["enhancements"]["cross_encoder"]["max_length"]
                        .as_usize()
                        .unwrap_or(512),
                    batch_size: parsed["enhancements"]["cross_encoder"]["batch_size"]
                        .as_usize()
                        .unwrap_or(32),
                    top_k: parsed["enhancements"]["cross_encoder"]["top_k"]
                        .as_usize()
                        .unwrap_or(10),
                    min_confidence: parsed["enhancements"]["cross_encoder"]["min_confidence"]
                        .as_f32()
                        .unwrap_or(0.0),
                    normalize_scores: parsed["enhancements"]["cross_encoder"]["normalize_scores"]
                        .as_bool()
                        .unwrap_or(true),
                },
                // Compiled in only with the "lazygraphrag" feature.
                #[cfg(feature = "lazygraphrag")]
                concept_selection: enhancements::ConceptSelectionConfig {
                    enabled: parsed["enhancements"]["concept_selection"]["enabled"]
                        .as_bool()
                        .unwrap_or(true),
                    top_k: parsed["enhancements"]["concept_selection"]["top_k"]
                        .as_usize()
                        .unwrap_or(20),
                    min_score: parsed["enhancements"]["concept_selection"]["min_score"]
                        .as_f32()
                        .unwrap_or(0.1),
                    degree_weight: parsed["enhancements"]["concept_selection"]["degree_weight"]
                        .as_f32()
                        .unwrap_or(0.4),
                    pagerank_weight: parsed["enhancements"]["concept_selection"]["pagerank_weight"]
                        .as_f32()
                        .unwrap_or(0.4),
                    idf_weight: parsed["enhancements"]["concept_selection"]["idf_weight"]
                        .as_f32()
                        .unwrap_or(0.2),
                    use_semantic_matching: parsed["enhancements"]["concept_selection"]
                        ["use_semantic_matching"]
                        .as_bool()
                        .unwrap_or(true),
                    max_query_concepts: parsed["enhancements"]["concept_selection"]
                        ["max_query_concepts"]
                        .as_usize()
                        .unwrap_or(10),
                },
            },
            // --- Auto-save (disabled unless enabled here) ---
            auto_save: AutoSaveConfig {
                enabled: parsed["auto_save"]["enabled"].as_bool().unwrap_or(false),
                base_dir: parsed["auto_save"]["base_dir"]
                    .as_str()
                    .map(|s| s.to_string()),
                interval_seconds: parsed["auto_save"]["interval_seconds"]
                    .as_u64()
                    .unwrap_or(default_auto_save_interval()),
                workspace_name: parsed["auto_save"]["workspace_name"]
                    .as_str()
                    .map(|s| s.to_string()),
                max_versions: parsed["auto_save"]["max_versions"]
                    .as_usize()
                    .unwrap_or(default_max_versions()),
            },
            // --- Hierarchical summarization ---
            // The whole section falls back to its Default impl when the
            // key is absent or not a JSON object.
            summarization: if parsed["summarization"].is_object() {
                crate::summarization::HierarchicalConfig {
                    merge_size: parsed["summarization"]["merge_size"]
                        .as_usize()
                        .unwrap_or(3),
                    max_summary_length: parsed["summarization"]["max_summary_length"]
                        .as_usize()
                        .unwrap_or(250),
                    min_node_size: parsed["summarization"]["min_node_size"]
                        .as_usize()
                        .unwrap_or(50),
                    overlap_sentences: parsed["summarization"]["overlap_sentences"]
                        .as_usize()
                        .unwrap_or(2),
                    llm_config: if parsed["summarization"]["llm_config"].is_object() {
                        crate::summarization::LLMConfig {
                            enabled: parsed["summarization"]["llm_config"]["enabled"]
                                .as_bool()
                                .unwrap_or(false),
                            model_name: parsed["summarization"]["llm_config"]["model_name"]
                                .as_str()
                                .unwrap_or("llama3.1:8b")
                                .to_string(),
                            temperature: parsed["summarization"]["llm_config"]["temperature"]
                                .as_f32()
                                .unwrap_or(0.3),
                            max_tokens: parsed["summarization"]["llm_config"]["max_tokens"]
                                .as_usize()
                                .unwrap_or(180),
                            // Unknown strategy strings fall through to
                            // Progressive rather than erroring.
                            strategy: match parsed["summarization"]["llm_config"]["strategy"]
                                .as_str()
                                .unwrap_or("progressive")
                            {
                                "uniform" => crate::summarization::LLMStrategy::Uniform,
                                "adaptive" => crate::summarization::LLMStrategy::Adaptive,
                                "progressive" => crate::summarization::LLMStrategy::Progressive,
                                _ => crate::summarization::LLMStrategy::Progressive,
                            },
                            // Not parsed from JSON yet: per-level configs
                            // would need more complex parsing.
                            level_configs: std::collections::HashMap::new(), // Would need more complex parsing
                        }
                    } else {
                        crate::summarization::LLMConfig::default()
                    },
                }
            } else {
                crate::summarization::HierarchicalConfig::default()
            },
            // --- Zero-cost approach selection ---
            // NOTE(review): only `approach` and `lazy_graphrag.enabled` are
            // read from JSON here; the remaining sub-configs always use
            // their defaults.
            zero_cost_approach: if parsed["zero_cost_approach"].is_object() {
                ZeroCostApproachConfig {
                    approach: parsed["zero_cost_approach"]["approach"]
                        .as_str()
                        .unwrap_or("pure_algorithmic")
                        .to_string(),
                    lazy_graphrag: if parsed["zero_cost_approach"]["lazy_graphrag"].is_object() {
                        LazyGraphRAGConfig {
                            enabled: parsed["zero_cost_approach"]["lazy_graphrag"]["enabled"]
                                .as_bool()
                                .unwrap_or(false),
                            concept_extraction: ConceptExtractionConfig::default(),
                            co_occurrence: CoOccurrenceConfig::default(),
                            indexing: LazyIndexingConfig::default(),
                            query_expansion: LazyQueryExpansionConfig::default(),
                            relevance_scoring: LazyRelevanceScoringConfig::default(),
                        }
                    } else {
                        LazyGraphRAGConfig::default()
                    },
                    e2_graphrag: E2GraphRAGConfig::default(),
                    pure_algorithmic: PureAlgorithmicConfig::default(),
                    hybrid_strategy: HybridStrategyConfig::default(),
                }
            } else {
                ZeroCostApproachConfig::default()
            },
            // Advanced features are not configurable via JSON yet.
            advanced_features: AdvancedFeaturesConfig::default(),
        };

        Ok(config)
    }
2231
2232    /// Save configuration to a JSON file
2233    pub fn to_file(&self, path: &str) -> Result<()> {
2234        let mut config_json = json::JsonValue::new_object();
2235
2236        // Embeddings
2237        let mut embeddings = json::JsonValue::new_object();
2238        embeddings["dimension"] = json::JsonValue::from(self.embeddings.dimension);
2239        if let Some(endpoint) = &self.embeddings.api_endpoint {
2240            embeddings["api_endpoint"] = json::JsonValue::from(endpoint.as_str());
2241        }
2242        if let Some(key) = &self.embeddings.api_key {
2243            embeddings["api_key"] = json::JsonValue::from(key.as_str());
2244        }
2245        config_json["embeddings"] = embeddings;
2246
2247        // Graph
2248        let mut graph = json::JsonValue::new_object();
2249        graph["max_connections"] = json::JsonValue::from(self.graph.max_connections);
2250        graph["similarity_threshold"] = json::JsonValue::from(self.graph.similarity_threshold);
2251        graph["extract_relationships"] = json::JsonValue::from(self.graph.extract_relationships);
2252        graph["relationship_confidence_threshold"] =
2253            json::JsonValue::from(self.graph.relationship_confidence_threshold);
2254
2255        let mut traversal = json::JsonValue::new_object();
2256        traversal["max_depth"] = json::JsonValue::from(self.graph.traversal.max_depth);
2257        traversal["max_paths"] = json::JsonValue::from(self.graph.traversal.max_paths);
2258        traversal["use_edge_weights"] =
2259            json::JsonValue::from(self.graph.traversal.use_edge_weights);
2260        traversal["min_relationship_strength"] =
2261            json::JsonValue::from(self.graph.traversal.min_relationship_strength);
2262        graph["traversal"] = traversal;
2263
2264        config_json["graph"] = graph;
2265
2266        // Text
2267        let mut text = json::JsonValue::new_object();
2268        text["chunk_size"] = json::JsonValue::from(self.text.chunk_size);
2269        text["chunk_overlap"] = json::JsonValue::from(self.text.chunk_overlap);
2270        let languages_array: Vec<json::JsonValue> = self
2271            .text
2272            .languages
2273            .iter()
2274            .map(|s| json::JsonValue::from(s.as_str()))
2275            .collect();
2276        text["languages"] = json::JsonValue::from(languages_array);
2277        config_json["text"] = text;
2278
2279        // Entities
2280        let mut entities = json::JsonValue::new_object();
2281        entities["min_confidence"] = json::JsonValue::from(self.entities.min_confidence);
2282        let entity_types_array: Vec<json::JsonValue> = self
2283            .entities
2284            .entity_types
2285            .iter()
2286            .map(|s| json::JsonValue::from(s.as_str()))
2287            .collect();
2288        entities["entity_types"] = json::JsonValue::from(entity_types_array);
2289        entities["use_gleaning"] = json::JsonValue::from(self.entities.use_gleaning);
2290        entities["max_gleaning_rounds"] = json::JsonValue::from(self.entities.max_gleaning_rounds);
2291        config_json["entities"] = entities;
2292
2293        // Retrieval
2294        let mut retrieval = json::JsonValue::new_object();
2295        retrieval["top_k"] = json::JsonValue::from(self.retrieval.top_k);
2296        retrieval["search_algorithm"] =
2297            json::JsonValue::from(self.retrieval.search_algorithm.as_str());
2298        config_json["retrieval"] = retrieval;
2299
2300        // Parallel
2301        let mut parallel = json::JsonValue::new_object();
2302        parallel["num_threads"] = json::JsonValue::from(self.parallel.num_threads);
2303        parallel["enabled"] = json::JsonValue::from(self.parallel.enabled);
2304        parallel["min_batch_size"] = json::JsonValue::from(self.parallel.min_batch_size);
2305        parallel["chunk_batch_size"] = json::JsonValue::from(self.parallel.chunk_batch_size);
2306        parallel["parallel_embeddings"] = json::JsonValue::from(self.parallel.parallel_embeddings);
2307        parallel["parallel_graph_ops"] = json::JsonValue::from(self.parallel.parallel_graph_ops);
2308        parallel["parallel_vector_ops"] = json::JsonValue::from(self.parallel.parallel_vector_ops);
2309        config_json["parallel"] = parallel;
2310
2311        // Enhancements
2312        let mut enhancements = json::JsonValue::new_object();
2313        enhancements["enabled"] = json::JsonValue::from(self.enhancements.enabled);
2314
2315        let mut query_analysis = json::JsonValue::new_object();
2316        query_analysis["enabled"] = json::JsonValue::from(self.enhancements.query_analysis.enabled);
2317        query_analysis["min_confidence"] =
2318            json::JsonValue::from(self.enhancements.query_analysis.min_confidence);
2319        query_analysis["enable_strategy_suggestion"] =
2320            json::JsonValue::from(self.enhancements.query_analysis.enable_strategy_suggestion);
2321        query_analysis["enable_keyword_analysis"] =
2322            json::JsonValue::from(self.enhancements.query_analysis.enable_keyword_analysis);
2323        query_analysis["enable_complexity_scoring"] =
2324            json::JsonValue::from(self.enhancements.query_analysis.enable_complexity_scoring);
2325        enhancements["query_analysis"] = query_analysis;
2326
2327        let mut adaptive_retrieval = json::JsonValue::new_object();
2328        adaptive_retrieval["enabled"] =
2329            json::JsonValue::from(self.enhancements.adaptive_retrieval.enabled);
2330        adaptive_retrieval["use_query_analysis"] =
2331            json::JsonValue::from(self.enhancements.adaptive_retrieval.use_query_analysis);
2332        adaptive_retrieval["enable_cross_strategy_fusion"] = json::JsonValue::from(
2333            self.enhancements
2334                .adaptive_retrieval
2335                .enable_cross_strategy_fusion,
2336        );
2337        adaptive_retrieval["diversity_threshold"] =
2338            json::JsonValue::from(self.enhancements.adaptive_retrieval.diversity_threshold);
2339        adaptive_retrieval["enable_diversity_selection"] = json::JsonValue::from(
2340            self.enhancements
2341                .adaptive_retrieval
2342                .enable_diversity_selection,
2343        );
2344        adaptive_retrieval["enable_confidence_weighting"] = json::JsonValue::from(
2345            self.enhancements
2346                .adaptive_retrieval
2347                .enable_confidence_weighting,
2348        );
2349        enhancements["adaptive_retrieval"] = adaptive_retrieval;
2350
2351        let mut performance_benchmarking = json::JsonValue::new_object();
2352        performance_benchmarking["enabled"] =
2353            json::JsonValue::from(self.enhancements.performance_benchmarking.enabled);
2354        performance_benchmarking["auto_recommendations"] = json::JsonValue::from(
2355            self.enhancements
2356                .performance_benchmarking
2357                .auto_recommendations,
2358        );
2359        performance_benchmarking["comprehensive_testing"] = json::JsonValue::from(
2360            self.enhancements
2361                .performance_benchmarking
2362                .comprehensive_testing,
2363        );
2364        performance_benchmarking["iterations"] =
2365            json::JsonValue::from(self.enhancements.performance_benchmarking.iterations);
2366        performance_benchmarking["include_parallel"] =
2367            json::JsonValue::from(self.enhancements.performance_benchmarking.include_parallel);
2368        performance_benchmarking["enable_memory_profiling"] = json::JsonValue::from(
2369            self.enhancements
2370                .performance_benchmarking
2371                .enable_memory_profiling,
2372        );
2373        enhancements["performance_benchmarking"] = performance_benchmarking;
2374
2375        let mut enhanced_function_registry = json::JsonValue::new_object();
2376        enhanced_function_registry["enabled"] =
2377            json::JsonValue::from(self.enhancements.enhanced_function_registry.enabled);
2378        enhanced_function_registry["categorization"] =
2379            json::JsonValue::from(self.enhancements.enhanced_function_registry.categorization);
2380        enhanced_function_registry["usage_statistics"] = json::JsonValue::from(
2381            self.enhancements
2382                .enhanced_function_registry
2383                .usage_statistics,
2384        );
2385        enhanced_function_registry["dynamic_registration"] = json::JsonValue::from(
2386            self.enhancements
2387                .enhanced_function_registry
2388                .dynamic_registration,
2389        );
2390        enhanced_function_registry["performance_monitoring"] = json::JsonValue::from(
2391            self.enhancements
2392                .enhanced_function_registry
2393                .performance_monitoring,
2394        );
2395        enhanced_function_registry["recommendation_system"] = json::JsonValue::from(
2396            self.enhancements
2397                .enhanced_function_registry
2398                .recommendation_system,
2399        );
2400        enhancements["enhanced_function_registry"] = enhanced_function_registry;
2401
2402        config_json["enhancements"] = enhancements;
2403
2404        // Summarization
2405        let mut summarization = json::JsonValue::new_object();
2406        summarization["merge_size"] = json::JsonValue::from(self.summarization.merge_size);
2407        summarization["max_summary_length"] =
2408            json::JsonValue::from(self.summarization.max_summary_length);
2409        summarization["min_node_size"] = json::JsonValue::from(self.summarization.min_node_size);
2410        summarization["overlap_sentences"] =
2411            json::JsonValue::from(self.summarization.overlap_sentences);
2412
2413        let mut llm_config = json::JsonValue::new_object();
2414        llm_config["enabled"] = json::JsonValue::from(self.summarization.llm_config.enabled);
2415        llm_config["model_name"] =
2416            json::JsonValue::from(self.summarization.llm_config.model_name.as_str());
2417        llm_config["temperature"] =
2418            json::JsonValue::from(self.summarization.llm_config.temperature);
2419        llm_config["max_tokens"] = json::JsonValue::from(self.summarization.llm_config.max_tokens);
2420        let strategy_str = match self.summarization.llm_config.strategy {
2421            crate::summarization::LLMStrategy::Uniform => "uniform",
2422            crate::summarization::LLMStrategy::Adaptive => "adaptive",
2423            crate::summarization::LLMStrategy::Progressive => "progressive",
2424        };
2425        llm_config["strategy"] = json::JsonValue::from(strategy_str);
2426
2427        summarization["llm_config"] = llm_config;
2428        config_json["summarization"] = summarization;
2429
2430        let content = json::stringify_pretty(config_json, 2);
2431        fs::write(path, content)?;
2432        Ok(())
2433    }
2434}