graphrag_core/config/
mod.rs

1//! Configuration types for GraphRAG.
2//!
3//! Holds the runtime [`Config`] struct used by `GraphRAG`, plus the serde-friendly
4//! `SetConfig` schema (TOML / JSON5) that maps onto it. Both must stay in sync when
5//! adding fields — see the crate `CLAUDE.md` for the update checklist.
6
7use crate::Result;
8use std::fs;
9
10/// Enhanced configuration options for GraphRAG
11pub mod enhancements;
12/// JSON5 configuration support
13#[cfg(feature = "json5-support")]
14pub mod json5_loader;
15/// Hand-rolled JSON loader/writer for `Config` (extracted Phase 4 split).
16mod json_parser;
17/// Configuration file loading utilities
18pub mod loader;
19/// JSON Schema validation
20#[cfg(feature = "json5-support")]
21pub mod schema_validator;
22/// SetConfig configuration support (TOML, JSON5, YAML, JSON)
23pub mod setconfig;
24/// Configuration validation utilities
25pub mod validation;
26
27pub use setconfig::{
28    AlgorithmicEmbeddingsConfig,
29    AlgorithmicEntityConfig,
30    AlgorithmicGraphConfig,
31    // Algorithmic/Classic NLP pipeline
32    AlgorithmicPipelineConfig,
33    AlgorithmicRetrievalConfig,
34    HybridEmbeddingsConfig,
35    HybridEntityConfig,
36    HybridGraphConfig,
37    // Hybrid pipeline
38    HybridPipelineConfig,
39    HybridRetrievalConfig,
40    HybridWeightsConfig,
41    // Pipeline approach configuration
42    ModeConfig,
43    SemanticEmbeddingsConfig,
44    SemanticEntityConfig,
45    SemanticGraphConfig,
46    // Semantic/Neural pipeline
47    SemanticPipelineConfig,
48    SemanticRetrievalConfig,
49    SetConfig,
50};
51pub use validation::{validate_config_file, Validatable, ValidationResult};
52
53/// Configuration for the GraphRAG system
54#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
55pub struct Config {
56    /// Output directory for storing graphs and data
57    pub output_dir: String,
58
59    /// Chunk size for text processing
60    pub chunk_size: usize,
61
62    /// Overlap between chunks
63    pub chunk_overlap: usize,
64
65    /// Maximum entities per chunk
66    pub max_entities_per_chunk: Option<usize>,
67
68    /// Top-k results for retrieval
69    pub top_k_results: Option<usize>,
70
71    /// Similarity threshold for retrieval
72    pub similarity_threshold: Option<f32>,
73
74    /// Pipeline approach: "semantic", "algorithmic", or "hybrid"
75    /// Determines which implementation strategy to use for entity extraction and retrieval
76    #[serde(default = "default_approach")]
77    pub approach: String,
78
79    /// Vector embedding configuration
80    pub embeddings: EmbeddingConfig,
81
82    /// Graph construction parameters
83    pub graph: GraphConfig,
84
85    /// Text processing settings
86    pub text: TextConfig,
87
88    /// Entity extraction settings
89    pub entities: EntityConfig,
90
91    /// Retrieval system configuration
92    pub retrieval: RetrievalConfig,
93
94    /// Parallel processing configuration
95    pub parallel: ParallelConfig,
96
97    /// Ollama integration configuration
98    pub ollama: crate::ollama::OllamaConfig,
99
100    /// GLiNER-Relex extractor configuration
101    pub gliner: GlinerConfig,
102
103    /// Latest enhancements configuration
104    pub enhancements: enhancements::EnhancementsConfig,
105
106    /// Auto-save configuration for workspace persistence
107    pub auto_save: AutoSaveConfig,
108
109    /// Hierarchical summarization configuration
110    pub summarization: crate::summarization::HierarchicalConfig,
111
112    /// Zero-cost approach configuration
113    pub zero_cost_approach: ZeroCostApproachConfig,
114
115    /// Advanced features configuration (Phases 2-3)
116    #[serde(default)]
117    pub advanced_features: AdvancedFeaturesConfig,
118
119    /// Suppress indicatif progress bars (use hidden draw target).
120    /// Set to `true` when running inside a TUI to avoid corrupting the terminal.
121    #[serde(default)]
122    pub suppress_progress_bars: bool,
123}
124
125/// GLiNER-Relex extractor configuration (joint NER + RE via ONNX Runtime)
126#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
127pub struct GlinerConfig {
128    /// Enable GLiNER-Relex extraction
129    pub enabled: bool,
130    /// Path to the ONNX model file (e.g. "models/gliner-relex-large-v0.5.onnx")
131    pub model_path: String,
132    /// Path to tokenizer.json — defaults to same directory as model_path if empty
133    pub tokenizer_path: String,
134    /// Span-based ("span", default) or token-based ("token") NER pipeline
135    pub mode: String,
136    /// Entity types to extract
137    pub entity_labels: Vec<String>,
138    /// Relation types to extract (empty list disables RE stage)
139    pub relation_labels: Vec<String>,
140    /// Minimum entity confidence threshold (0.0–1.0)
141    pub entity_threshold: f32,
142    /// Minimum relation confidence threshold (0.0–1.0)
143    pub relation_threshold: f32,
144    /// Use GPU (CUDA) for inference
145    pub use_gpu: bool,
146    /// Max concurrent chunk inferences. `None` → default 4. Set to 1 to force
147    /// sequential. Cap matches CPU cores or GPU stream count for best throughput.
148    #[serde(default)]
149    pub max_concurrent_chunks: Option<usize>,
150}
151
152impl Default for GlinerConfig {
153    fn default() -> Self {
154        Self {
155            enabled: false,
156            model_path: String::new(),
157            tokenizer_path: String::new(),
158            mode: "span".to_string(),
159            entity_labels: vec![
160                "person".into(),
161                "organization".into(),
162                "location".into(),
163                "concept".into(),
164            ],
165            relation_labels: vec!["related to".into(), "part of".into(), "causes".into()],
166            entity_threshold: 0.4,
167            relation_threshold: 0.5,
168            use_gpu: false,
169            max_concurrent_chunks: None,
170        }
171    }
172}
173
174/// Configuration for automatic workspace saving
175#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
176pub struct AutoSaveConfig {
177    /// Enable persistent storage. When false (default), the graph lives in memory only.
178    /// When true, the graph is saved to disk after `build_graph()` and loaded from disk
179    /// on the next `initialize()` call (if the workspace already exists).
180    #[serde(default)]
181    pub enabled: bool,
182
183    /// Base directory where workspace folders are stored.
184    /// Required when `enabled = true`. Example: `"./output"` or `"/data/graphrag"`.
185    #[serde(default)]
186    pub base_dir: Option<String>,
187
188    /// Auto-save interval in seconds (0 = save after every graph build)
189    #[serde(default = "default_auto_save_interval")]
190    pub interval_seconds: u64,
191
192    /// Workspace name — the sub-folder inside `base_dir` (default: "default").
193    #[serde(default)]
194    pub workspace_name: Option<String>,
195
196    /// Maximum number of auto-save versions to keep (0 = unlimited)
197    #[serde(default = "default_max_versions")]
198    pub max_versions: usize,
199}
200
201/// Configuration for zero-cost GraphRAG approaches
202#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
203pub struct ZeroCostApproachConfig {
204    /// Which zero-cost approach to use
205    #[serde(default = "default_zero_cost_approach")]
206    pub approach: String,
207
208    /// LazyGraphRAG-style configuration
209    #[serde(default)]
210    pub lazy_graphrag: LazyGraphRAGConfig,
211
212    /// E2GraphRAG-style configuration
213    #[serde(default)]
214    pub e2_graphrag: E2GraphRAGConfig,
215
216    /// Pure algorithmic configuration
217    #[serde(default)]
218    pub pure_algorithmic: PureAlgorithmicConfig,
219
220    /// Hybrid strategy configuration
221    #[serde(default)]
222    pub hybrid_strategy: HybridStrategyConfig,
223}
224
225/// Configuration for LazyGraphRAG, an efficient approach for large-scale knowledge graphs.
226/// This configuration enables lazy loading and processing of graph components.
227#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
228pub struct LazyGraphRAGConfig {
229    /// Whether LazyGraphRAG is enabled
230    pub enabled: bool,
231    /// Configuration for concept extraction from text
232    pub concept_extraction: ConceptExtractionConfig,
233    /// Configuration for co-occurrence analysis of concepts
234    pub co_occurrence: CoOccurrenceConfig,
235    /// Configuration for lazy indexing of graph components
236    pub indexing: LazyIndexingConfig,
237    /// Configuration for query expansion strategies
238    pub query_expansion: LazyQueryExpansionConfig,
239    /// Configuration for relevance scoring of results
240    pub relevance_scoring: LazyRelevanceScoringConfig,
241}
242
243/// Configuration for extracting concepts from text documents.
244/// This configuration controls how key concepts are identified and extracted from text.
245#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
246pub struct ConceptExtractionConfig {
247    /// Minimum length of a concept in characters
248    pub min_concept_length: usize,
249    /// Maximum number of words in a multi-word concept
250    pub max_concept_words: usize,
251    /// Whether to extract noun phrases as concepts
252    pub use_noun_phrases: bool,
253    /// Whether to consider capitalized words as potential concepts
254    pub use_capitalization: bool,
255    /// Whether to consider title-cased phrases as potential concepts
256    pub use_title_case: bool,
257    /// Whether to use TF-IDF scoring for concept importance
258    pub use_tf_idf_scoring: bool,
259    /// Minimum term frequency for a term to be considered a concept
260    pub min_term_frequency: usize,
261    /// Maximum number of concepts to extract per document chunk
262    pub max_concepts_per_chunk: usize,
263    /// Minimum score threshold for a term to be considered a concept
264    pub min_concept_score: f32,
265    /// Whether to exclude common stopwords from concept extraction
266    pub exclude_stopwords: bool,
267    /// Custom list of stopwords to exclude from concept extraction
268    pub custom_stopwords: Vec<String>,
269}
270
271/// Configuration for co-occurrence analysis of concepts in documents.
272/// This determines how relationships between concepts are identified based on their co-occurrence.
273#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
274pub struct CoOccurrenceConfig {
275    /// Size of the sliding window (in words) to consider for co-occurrence
276    pub window_size: usize,
277    /// Minimum number of co-occurrences required to create an edge between concepts
278    pub min_co_occurrence: usize,
279    /// Jaccard similarity threshold for merging similar concepts
280    pub jaccard_threshold: f32,
281    /// Maximum number of edges allowed per node in the co-occurrence graph
282    pub max_edges_per_node: usize,
283}
284
285/// Configuration for lazy indexing of graph components.
286/// Controls how graph components are indexed for efficient retrieval.
287#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
288pub struct LazyIndexingConfig {
289    /// Whether to use bidirectional indexing for faster lookups
290    pub use_bidirectional_index: bool,
291    /// Whether to enable HNSW (Hierarchical Navigable Small World) index for approximate nearest neighbor search
292    pub enable_hnsw_index: bool,
293    /// Maximum number of items to keep in the index cache
294    pub cache_size: usize,
295}
296
297/// Configuration for lazy query expansion in the retrieval process.
298/// Controls how queries are expanded to improve search results.
299#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
300pub struct LazyQueryExpansionConfig {
301    /// Whether query expansion is enabled
302    pub enabled: bool,
303    /// Maximum number of query expansions to generate
304    pub max_expansions: usize,
305    /// Name of the model to use for query expansion
306    pub expansion_model: String,
307    /// Temperature parameter for controlling randomness in expansion generation
308    pub expansion_temperature: f32,
309    /// Maximum number of tokens to generate per expansion
310    pub max_tokens_per_expansion: usize,
311}
312
313/// Configuration for lazy relevance scoring of search results.
314/// Controls how search results are scored for relevance to the query.
315#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
316pub struct LazyRelevanceScoringConfig {
317    /// Whether relevance scoring is enabled
318    pub enabled: bool,
319    /// Name of the model to use for relevance scoring
320    pub scoring_model: String,
321    /// Number of items to score in a single batch
322    pub batch_size: usize,
323    /// Temperature parameter for controlling randomness in scoring
324    pub temperature: f32,
325    /// Maximum number of tokens to consider for each score calculation
326    pub max_tokens_per_score: usize,
327}
328
329/// End-to-End GraphRAG configuration for comprehensive knowledge graph construction.
330/// This configuration enables fine-grained control over the entire pipeline from text to knowledge graph.
331#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
332pub struct E2GraphRAGConfig {
333    /// Whether the E2E GraphRAG pipeline is enabled
334    pub enabled: bool,
335
336    /// Configuration for Named Entity Recognition (NER) extraction
337    pub ner_extraction: NERExtractionConfig,
338
339    /// Configuration for keyword extraction from text
340    pub keyword_extraction: KeywordExtractionConfig,
341
342    /// Configuration for graph construction parameters
343    pub graph_construction: E2GraphConstructionConfig,
344
345    /// Configuration for indexing strategies
346    pub indexing: E2IndexingConfig,
347}
348
349/// Configuration for Named Entity Recognition (NER) extraction from text.
350/// Controls how named entities are identified and extracted from documents.
351#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
352pub struct NERExtractionConfig {
353    /// List of entity types to recognize (e.g., ["PERSON", "ORG", "LOCATION"])
354    pub entity_types: Vec<String>,
355
356    /// Whether to recognize capitalized words as potential named entities
357    pub use_capitalized_patterns: bool,
358
359    /// Whether to recognize title-cased phrases as potential named entities
360    pub use_title_case_patterns: bool,
361
362    /// Whether to recognize quoted phrases as potential named entities
363    pub use_quoted_patterns: bool,
364
365    /// Whether to recognize common abbreviations as entities
366    pub use_abbreviations: bool,
367
368    /// Whether to use contextual disambiguation to resolve entity ambiguity
369    pub use_contextual_disambiguation: bool,
370
371    /// Minimum number of context words to consider for disambiguation
372    pub min_context_words: usize,
373
374    /// Minimum confidence score (0.0-1.0) required for an entity to be included
375    pub min_confidence: f32,
376
377    /// Whether to apply positional boost to entities based on their position in the text
378    pub use_positional_boost: bool,
379
380    /// Whether to apply frequency boost to entities based on their frequency in the text
381    pub use_frequency_boost: bool,
382}
383
384impl Default for NERExtractionConfig {
385    fn default() -> Self {
386        Self {
387            entity_types: vec![
388                "PERSON".to_string(),
389                "ORG".to_string(),
390                "LOCATION".to_string(),
391            ],
392            use_capitalized_patterns: true,
393            use_title_case_patterns: true,
394            use_quoted_patterns: true,
395            use_abbreviations: true,
396            use_contextual_disambiguation: true,
397            min_context_words: 5,
398            min_confidence: 0.7,
399            use_positional_boost: true,
400            use_frequency_boost: true,
401        }
402    }
403}
404
405/// Configuration for keyword extraction from text documents.
406/// Controls how keywords are identified and extracted from text content.
407#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
408pub struct KeywordExtractionConfig {
409    /// List of algorithms to use for keyword extraction (e.g., ["tfidf", "yake", "textrank"])
410    pub algorithms: Vec<String>,
411
412    /// Maximum number of keywords to extract per document chunk
413    pub max_keywords_per_chunk: usize,
414
415    /// Minimum length of a keyword in characters
416    pub min_keyword_length: usize,
417
418    /// Whether to combine results from multiple algorithms
419    pub combine_algorithms: bool,
420}
421
422impl Default for KeywordExtractionConfig {
423    fn default() -> Self {
424        Self {
425            algorithms: vec!["tfidf".to_string(), "yake".to_string()],
426            max_keywords_per_chunk: 10,
427            min_keyword_length: 3,
428            combine_algorithms: true,
429        }
430    }
431}
432
433/// Configuration for graph construction in the E2E GraphRAG pipeline.
434/// Controls how entities and their relationships are organized into a knowledge graph.
435#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
436pub struct E2GraphConstructionConfig {
437    /// Types of relationships to extract between entities (e.g., ["CO_OCCURS_WITH", "RELATED_TO"])
438    pub relationship_types: Vec<String>,
439
440    /// Minimum score required to establish a relationship between entities (0.0-1.0)
441    pub min_relationship_score: f32,
442
443    /// Maximum number of relationships to maintain per entity
444    pub max_relationships_per_entity: usize,
445
446    /// Whether to use mutual information for relationship scoring
447    pub use_mutual_information: bool,
448}
449
450impl Default for E2GraphConstructionConfig {
451    fn default() -> Self {
452        Self {
453            relationship_types: vec!["CO_OCCURS_WITH".to_string(), "RELATED_TO".to_string()],
454            min_relationship_score: 0.5,
455            max_relationships_per_entity: 20,
456            use_mutual_information: true,
457        }
458    }
459}
460
461/// Configuration for indexing in the E2E GraphRAG pipeline.
462/// Controls how entities, relationships, and their embeddings are indexed for efficient retrieval.
463#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
464pub struct E2IndexingConfig {
465    /// Number of items to process in a single batch during indexing
466    pub batch_size: usize,
467
468    /// Whether to enable parallel processing during indexing
469    pub enable_parallel_processing: bool,
470
471    /// Whether to cache concept vectors for faster retrieval
472    pub cache_concept_vectors: bool,
473
474    /// Whether to use hash embeddings for more efficient storage
475    pub use_hash_embeddings: bool,
476}
477
478impl Default for E2IndexingConfig {
479    fn default() -> Self {
480        Self {
481            batch_size: 32,
482            enable_parallel_processing: true,
483            cache_concept_vectors: true,
484            use_hash_embeddings: false,
485        }
486    }
487}
488
489/// Configuration for pure algorithmic GraphRAG approach without LLM dependencies.
490///
491/// This configuration enables cost-effective graph construction and analysis
492/// using only algorithmic methods for pattern extraction, keyword analysis,
493/// and relationship discovery.
494#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
495pub struct PureAlgorithmicConfig {
496    /// Whether the pure algorithmic approach is enabled
497    pub enabled: bool,
498    /// Configuration for extracting linguistic patterns from text
499    pub pattern_extraction: PatternExtractionConfig,
500    /// Configuration for keyword extraction using statistical methods
501    pub keyword_extraction: PureKeywordExtractionConfig,
502    /// Configuration for discovering relationships between entities
503    pub relationship_discovery: RelationshipDiscoveryConfig,
504    /// Configuration for search result ranking algorithms
505    pub search_ranking: SearchRankingConfig,
506}
507
508/// Configuration for pattern extraction from text using regex and linguistic rules.
509///
510/// Pattern extraction identifies consistent linguistic structures that can indicate
511/// entities, relationships, and semantic patterns without requiring LLM processing.
512#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
513pub struct PatternExtractionConfig {
514    /// Regex patterns for identifying capitalized entities (proper nouns, acronyms)
515    pub capitalized_patterns: Vec<String>,
516    /// Regex patterns for technical terms, jargon, and specialized language
517    pub technical_patterns: Vec<String>,
518    /// Regex patterns for contextual relationships and semantic structures
519    pub context_patterns: Vec<String>,
520}
521
522/// Configuration for keyword extraction using statistical algorithms.
523///
524/// This configuration enables extraction of important terms from text using
525/// algorithms like TF-IDF, RAKE, or YAKE without requiring LLM processing.
526#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
527pub struct PureKeywordExtractionConfig {
528    /// Algorithm to use for keyword extraction (e.g., "tfidf", "rake", "yake")
529    pub algorithm: String,
530    /// Maximum number of keywords to extract per document
531    pub max_keywords: usize,
532    /// Minimum word length to consider for keywords
533    pub min_word_length: usize,
534    /// Whether to boost keywords based on their position in text
535    pub use_positional_boost: bool,
536    /// Whether to filter keywords based on frequency thresholds
537    pub use_frequency_filter: bool,
538    /// Minimum term frequency for a word to be considered a keyword
539    pub min_term_frequency: usize,
540    /// Maximum term frequency ratio to filter out overly common terms
541    pub max_term_frequency_ratio: f32,
542}
543
544/// Configuration for discovering relationships between entities using co-occurrence analysis.
545///
546/// This configuration enables algorithmic relationship discovery by analyzing
547/// word co-occurrence patterns and statistical measures without LLM inference.
548#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
549pub struct RelationshipDiscoveryConfig {
550    /// Window size for co-occurrence analysis (number of words to check around entities)
551    pub window_size: usize,
552    /// Minimum co-occurrence count to establish a relationship
553    pub min_co_occurrence: usize,
554    /// Whether to use mutual information scoring for relationship strength
555    pub use_mutual_information: bool,
556    /// Types of relationships to identify (e.g., "causal", "hierarchical", "temporal")
557    pub relationship_types: Vec<String>,
558    /// Scoring method for relationship ranking (e.g., "frequency", "mi", "pmi")
559    pub scoring_method: String,
560    /// Minimum similarity score threshold for valid relationships
561    pub min_similarity_score: f32,
562}
563
564/// Configuration for search result ranking across multiple retrieval strategies.
565///
566/// This configuration enables combining different search approaches (vector, keyword,
567/// graph traversal) and fusing their results for optimal relevance ranking.
568#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
569pub struct SearchRankingConfig {
570    /// Configuration for vector-based similarity search
571    pub vector_search: VectorSearchConfig,
572    /// Configuration for keyword-based search algorithms (e.g., BM25)
573    pub keyword_search: KeywordSearchConfig,
574    /// Configuration for graph-based traversal and ranking
575    pub graph_traversal: GraphTraversalConfig,
576    /// Configuration for hybrid fusion of multiple search strategies
577    pub hybrid_fusion: HybridFusionConfig,
578}
579
580/// Configuration for vector-based similarity search.
581///
582/// Enables semantic search using embeddings and similarity scoring
583/// for finding conceptually related content.
584#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
585pub struct VectorSearchConfig {
586    /// Whether vector similarity search is enabled
587    pub enabled: bool,
588}
589
590/// Configuration for keyword-based search algorithms.
591///
592/// Enables traditional information retrieval algorithms like BM25
593/// for keyword matching and scoring.
594#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
595pub struct KeywordSearchConfig {
596    /// Whether keyword-based search is enabled
597    pub enabled: bool,
598    /// Search algorithm to use (e.g., "bm25", "tfidf", "dirichlet")
599    pub algorithm: String,
600    /// BM25 parameter k1: controls term frequency saturation (typically 1.2-2.0)
601    pub k1: f32,
602    /// BM25 parameter b: controls document length normalization (typically 0.0-1.0)
603    pub b: f32,
604}
605
606/// Configuration for graph-based traversal and ranking algorithms.
607///
608/// Enables graph algorithms like PageRank and personalized search
609/// for navigating and ranking content in the knowledge graph.
610#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
611pub struct GraphTraversalConfig {
612    /// Whether graph traversal algorithms are enabled
613    pub enabled: bool,
614    /// Algorithm to use for graph traversal (e.g., "pagerank", "hits", "random_walk")
615    pub algorithm: String,
616    /// Damping factor for PageRank algorithm (typically 0.85)
617    pub damping_factor: f32,
618    /// Maximum iterations for graph algorithms
619    pub max_iterations: usize,
620    /// Whether to use personalized graph traversal
621    pub personalized: bool,
622}
623
624/// Configuration for hybrid fusion of multiple search strategies.
625///
626/// Enables combining results from different search approaches (vector, keyword,
627/// graph) using weighted scoring for improved relevance.
628#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
629pub struct HybridFusionConfig {
630    /// Whether hybrid fusion of search results is enabled
631    pub enabled: bool,
632    /// Weight configuration for different search strategies
633    pub weights: FusionWeights,
634}
635
636/// Weight configuration for combining different search strategies.
637///
638/// Defines the relative importance of each search approach in the
639/// hybrid fusion algorithm. Weights should typically sum to 1.0.
640#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
641pub struct FusionWeights {
642    /// Weight for keyword-based search results
643    pub keywords: f32,
644    /// Weight for graph traversal-based search results
645    pub graph: f32,
646    /// Weight for BM25/TF-IDF statistical search results
647    pub bm25: f32,
648}
649
650/// Configuration for hybrid GraphRAG strategies combining algorithmic and LLM approaches.
651///
652/// This configuration enables different hybrid strategies for balancing cost,
653/// performance, and quality through intelligent LLM usage.
654#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
655pub struct HybridStrategyConfig {
656    /// Configuration for lazy algorithmic approach with selective LLM enhancement
657    pub lazy_algorithmic: LazyAlgorithmicConfig,
658    /// Configuration for progressive multi-level LLM usage
659    pub progressive: ProgressiveConfig,
660    /// Configuration for budget-aware LLM optimization
661    pub budget_aware: BudgetAwareConfig,
662}
663
664/// Configuration for lazy algorithmic approach with selective LLM enhancement.
665///
666/// This strategy primarily uses algorithmic methods and only invokes LLMs
667/// when necessary to improve quality or handle complex cases.
668#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
669pub struct LazyAlgorithmicConfig {
670    /// Indexing strategy (e.g., "algorithmic_first", "llm_assisted", "hybrid")
671    pub indexing_approach: String,
672    /// Query processing strategy (e.g., "algorithmic_only", "selective_llm", "adaptive")
673    pub query_approach: String,
674    /// Cost optimization strategy (e.g., "aggressive", "balanced", "quality_first")
675    pub cost_optimization: String,
676}
677
678/// Configuration for progressive multi-level LLM usage strategy.
679///
680/// This strategy uses different levels of LLM involvement based on
681/// query complexity, budget, and quality requirements.
682#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
683pub struct ProgressiveConfig {
684    /// Level 0: Pure algorithmic processing (no LLM usage)
685    pub level_0: String,
686    /// Level 1: Minimal LLM usage (entity extraction only)
687    pub level_1: String,
688    /// Level 2: Moderate LLM usage (entity + relationship extraction)
689    pub level_2: String,
690    /// Level 3: Heavy LLM usage (full semantic analysis)
691    pub level_3: String,
692    /// Level 4+: Maximum LLM usage (comprehensive processing)
693    pub level_4_plus: String,
694}
695
696/// Configuration for budget-aware LLM optimization strategy.
697///
698/// This strategy dynamically adjusts LLM usage based on budget constraints,
699/// query costs, and daily spending limits to ensure cost control.
700#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
701pub struct BudgetAwareConfig {
702    /// Daily budget limit in USD for LLM operations
703    pub daily_budget_usd: f64,
704    /// Maximum number of queries allowed per day
705    pub queries_per_day: usize,
706    /// Maximum LLM cost allowed per individual query
707    pub max_llm_cost_per_query: f64,
708    /// Budget management strategy (e.g., "throttle", "degrade", "stop")
709    pub strategy: String,
710    /// Whether to fall back to pure algorithmic processing when budget is exceeded
711    pub fallback_to_algorithmic: bool,
712}
713
// Serde default helpers for the zero-cost approach section.

/// Returns `"pure_algorithmic"`, the approach name used when
/// `ZeroCostApproachConfig::approach` is absent from the input.
fn default_zero_cost_approach() -> String {
    String::from("pure_algorithmic")
}
718
719impl Default for ZeroCostApproachConfig {
720    fn default() -> Self {
721        Self {
722            approach: default_zero_cost_approach(),
723            lazy_graphrag: LazyGraphRAGConfig::default(),
724            e2_graphrag: E2GraphRAGConfig::default(),
725            pure_algorithmic: PureAlgorithmicConfig::default(),
726            hybrid_strategy: HybridStrategyConfig::default(),
727        }
728    }
729}
730
731// Default implementations for sub-configs (simplified for now)
732impl Default for ConceptExtractionConfig {
733    fn default() -> Self {
734        Self {
735            min_concept_length: 3,
736            max_concept_words: 5,
737            use_noun_phrases: true,
738            use_capitalization: true,
739            use_title_case: true,
740            use_tf_idf_scoring: true,
741            min_term_frequency: 2,
742            max_concepts_per_chunk: 10,
743            min_concept_score: 0.1,
744            exclude_stopwords: true,
745            custom_stopwords: vec!["the".to_string(), "and".to_string(), "or".to_string()],
746        }
747    }
748}
749impl Default for CoOccurrenceConfig {
750    fn default() -> Self {
751        Self {
752            window_size: 50,
753            min_co_occurrence: 2,
754            jaccard_threshold: 0.2,
755            max_edges_per_node: 25,
756        }
757    }
758}
759impl Default for LazyIndexingConfig {
760    fn default() -> Self {
761        Self {
762            use_bidirectional_index: true,
763            enable_hnsw_index: false,
764            cache_size: 10000,
765        }
766    }
767}
768impl Default for LazyQueryExpansionConfig {
769    fn default() -> Self {
770        Self {
771            enabled: true,
772            max_expansions: 3,
773            expansion_model: "llama3.1:8b".to_string(),
774            expansion_temperature: 0.1,
775            max_tokens_per_expansion: 50,
776        }
777    }
778}
779impl Default for LazyRelevanceScoringConfig {
780    fn default() -> Self {
781        Self {
782            enabled: true,
783            scoring_model: "llama3.1:8b".to_string(),
784            batch_size: 10,
785            temperature: 0.2,
786            max_tokens_per_score: 30,
787        }
788    }
789}
790impl Default for PureAlgorithmicConfig {
791    fn default() -> Self {
792        Self {
793            enabled: true,
794            pattern_extraction: Default::default(),
795            keyword_extraction: Default::default(),
796            relationship_discovery: Default::default(),
797            search_ranking: Default::default(),
798        }
799    }
800}
801impl Default for PatternExtractionConfig {
802    fn default() -> Self {
803        Self {
804            capitalized_patterns: vec![r"[A-Z][a-z]+".to_string()],
805            technical_patterns: vec![r"[a-z]+-[a-z]+".to_string()],
806            context_patterns: vec![r"\b(the|this)\s+(\w+)".to_string()],
807        }
808    }
809}
810impl Default for PureKeywordExtractionConfig {
811    fn default() -> Self {
812        Self {
813            algorithm: "tf_idf".to_string(),
814            max_keywords: 20,
815            min_word_length: 4,
816            use_positional_boost: true,
817            use_frequency_filter: true,
818            min_term_frequency: 2,
819            max_term_frequency_ratio: 0.8,
820        }
821    }
822}
823impl Default for RelationshipDiscoveryConfig {
824    fn default() -> Self {
825        Self {
826            window_size: 30,
827            min_co_occurrence: 2,
828            use_mutual_information: true,
829            relationship_types: vec!["co_occurs_with".to_string()],
830            scoring_method: "jaccard_similarity".to_string(),
831            min_similarity_score: 0.1,
832        }
833    }
834}
835impl Default for SearchRankingConfig {
836    fn default() -> Self {
837        Self {
838            vector_search: VectorSearchConfig { enabled: false },
839            keyword_search: KeywordSearchConfig {
840                enabled: true,
841                algorithm: "bm25".to_string(),
842                k1: 1.2,
843                b: 0.75,
844            },
845            graph_traversal: GraphTraversalConfig {
846                enabled: true,
847                algorithm: "pagerank".to_string(),
848                damping_factor: 0.85,
849                max_iterations: 20,
850                personalized: true,
851            },
852            hybrid_fusion: HybridFusionConfig {
853                enabled: true,
854                weights: FusionWeights {
855                    keywords: 0.4,
856                    graph: 0.4,
857                    bm25: 0.2,
858                },
859            },
860        }
861    }
862}
863impl Default for HybridStrategyConfig {
864    fn default() -> Self {
865        Self {
866            lazy_algorithmic: LazyAlgorithmicConfig {
867                indexing_approach: "e2_graphrag".to_string(),
868                query_approach: "lazy_graphrag".to_string(),
869                cost_optimization: "indexing".to_string(),
870            },
871            progressive: ProgressiveConfig {
872                level_0: "pure_algorithmic".to_string(),
873                level_1: "pure_algorithmic".to_string(),
874                level_2: "e2_graphrag".to_string(),
875                level_3: "lazy_graphrag".to_string(),
876                level_4_plus: "lazy_graphrag".to_string(),
877            },
878            budget_aware: BudgetAwareConfig {
879                daily_budget_usd: 1.0,
880                queries_per_day: 1000,
881                max_llm_cost_per_query: 0.002,
882                strategy: "lazy_graphrag".to_string(),
883                fallback_to_algorithmic: true,
884            },
885        }
886    }
887}
888impl Default for KeywordSearchConfig {
889    fn default() -> Self {
890        Self {
891            enabled: true,
892            algorithm: "bm25".to_string(),
893            k1: 1.2,
894            b: 0.75,
895        }
896    }
897}
898impl Default for GraphTraversalConfig {
899    fn default() -> Self {
900        Self {
901            enabled: true,
902            algorithm: "pagerank".to_string(),
903            damping_factor: 0.85,
904            max_iterations: 20,
905            personalized: true,
906        }
907    }
908}
909impl Default for HybridFusionConfig {
910    fn default() -> Self {
911        Self {
912            enabled: true,
913            weights: FusionWeights {
914                keywords: 0.4,
915                graph: 0.4,
916                bm25: 0.2,
917            },
918        }
919    }
920}
921impl Default for FusionWeights {
922    fn default() -> Self {
923        Self {
924            keywords: 0.4,
925            graph: 0.4,
926            bm25: 0.2,
927        }
928    }
929}
930impl Default for LazyAlgorithmicConfig {
931    fn default() -> Self {
932        Self {
933            indexing_approach: "e2_graphrag".to_string(),
934            query_approach: "lazy_graphrag".to_string(),
935            cost_optimization: "indexing".to_string(),
936        }
937    }
938}
939impl Default for ProgressiveConfig {
940    fn default() -> Self {
941        Self {
942            level_0: "pure_algorithmic".to_string(),
943            level_1: "pure_algorithmic".to_string(),
944            level_2: "e2_graphrag".to_string(),
945            level_3: "lazy_graphrag".to_string(),
946            level_4_plus: "lazy_graphrag".to_string(),
947        }
948    }
949}
950impl Default for BudgetAwareConfig {
951    fn default() -> Self {
952        Self {
953            daily_budget_usd: 1.0,
954            queries_per_day: 1000,
955            max_llm_cost_per_query: 0.002,
956            strategy: "lazy_graphrag".to_string(),
957            fallback_to_algorithmic: true,
958        }
959    }
960}
961
/// Configuration for embedding generation.
///
/// `dimension`, `backend` and `fallback_to_hash` carry no serde default and
/// must therefore be present when this struct is deserialized on its own.
// NOTE(review): `Debug` and `Serialize` are derived, so `api_key` is printed /
// written verbatim wherever this struct is logged or saved — confirm that is
// acceptable for callers.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EmbeddingConfig {
    /// Dimension of the embedding vectors
    pub dimension: usize,

    /// Embedding backend: "hash", "ollama", "huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "onnx", "candle"
    pub backend: String,

    /// Model identifier (provider-specific); `None` when omitted.
    /// - HuggingFace: "sentence-transformers/all-MiniLM-L6-v2"
    /// - OpenAI: "text-embedding-3-small"
    /// - Voyage: "voyage-3-large"
    /// - Cohere: "embed-english-v3.0"
    /// - Jina: "jina-embeddings-v3"
    /// - Mistral: "mistral-embed"
    /// - Together: "BAAI/bge-large-en-v1.5"
    /// - Ollama: "nomic-embed-text"
    #[serde(default)]
    pub model: Option<String>,

    /// Whether to fall back to hash-based embeddings if the primary backend fails
    pub fallback_to_hash: bool,

    /// API endpoint for embeddings (if using an external service)
    pub api_endpoint: Option<String>,

    /// API key for the external embedding service.
    /// Can also be set via environment variables (OPENAI_API_KEY, VOYAGE_API_KEY, etc.)
    pub api_key: Option<String>,

    /// Cache directory for downloaded models (HuggingFace); `None` when omitted.
    #[serde(default)]
    pub cache_dir: Option<String>,

    /// Batch size for processing multiple texts.
    /// Defaults to `32` (`default_batch_size`) when omitted.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
1001
/// Serde default for `EmbeddingConfig::batch_size`.
fn default_batch_size() -> usize {
    const BATCH_SIZE: usize = 32;
    BATCH_SIZE
}
1005
/// Configuration for graph construction.
///
/// `max_connections` and `similarity_threshold` have no serde default and are
/// required on deserialization; the remaining fields fall back to the
/// defaults noted below.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphConfig {
    /// Maximum number of connections per node
    pub max_connections: usize,

    /// Similarity threshold for creating edges
    pub similarity_threshold: f32,

    /// Whether to extract relationships between entities.
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    pub extract_relationships: bool,

    /// Confidence threshold for relationships.
    /// Defaults to `0.5` (`default_relationship_confidence`) when omitted.
    #[serde(default = "default_relationship_confidence")]
    pub relationship_confidence_threshold: f32,

    /// Graph traversal configuration.
    /// Defaults to `TraversalConfigParams::default()` when omitted.
    #[serde(default)]
    pub traversal: TraversalConfigParams,
}
1027
/// Configuration for graph traversal algorithms.
///
/// Every field has a serde default, so an empty table deserializes to the
/// same values as `TraversalConfigParams::default()`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TraversalConfigParams {
    /// Maximum depth for traversal algorithms (BFS, DFS).
    /// Defaults to `3` when omitted.
    #[serde(default = "default_max_traversal_depth")]
    pub max_depth: usize,

    /// Maximum number of paths to find (for pathfinding algorithms).
    /// Defaults to `10` when omitted.
    #[serde(default = "default_max_paths")]
    pub max_paths: usize,

    /// Whether to use edge weights in traversal.
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    pub use_edge_weights: bool,

    /// Minimum relationship strength to consider in traversal.
    /// Defaults to `0.3` when omitted.
    #[serde(default = "default_min_relationship_strength")]
    pub min_relationship_strength: f32,
}
1047
1048impl Default for TraversalConfigParams {
1049    fn default() -> Self {
1050        Self {
1051            max_depth: default_max_traversal_depth(),
1052            max_paths: default_max_paths(),
1053            use_edge_weights: true,
1054            min_relationship_strength: default_min_relationship_strength(),
1055        }
1056    }
1057}
1058
/// Configuration for text processing.
///
/// No field has a serde default, so all three must be present when this
/// struct is deserialized.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextConfig {
    /// Maximum chunk size for text processing
    pub chunk_size: usize,

    /// Overlap between consecutive chunks
    pub chunk_overlap: usize,

    /// Languages to support for text processing
    pub languages: Vec<String>,
}
1071
/// Configuration for entity extraction.
///
/// `min_confidence` and `entity_types` are required on deserialization; the
/// optional features below are off unless explicitly enabled.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EntityConfig {
    /// Minimum confidence score for entity extraction
    pub min_confidence: f32,

    /// Types of entities to extract
    pub entity_types: Vec<String>,

    /// Whether to use LLM-based gleaning for entity extraction.
    /// Defaults to `false` when omitted.
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum number of gleaning rounds for refinement.
    /// Defaults to `3` when omitted.
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Enable triple reflection validation (DEG-RAG methodology).
    /// Validates extracted relationships against source text using LLM.
    /// Defaults to `false` when omitted.
    #[serde(default)]
    pub enable_triple_reflection: bool,

    /// Minimum confidence score for relationship validation.
    /// Relationships below this threshold will be filtered out.
    /// Defaults to `0.7` when omitted.
    #[serde(default = "default_validation_confidence")]
    pub validation_min_confidence: f32,

    /// Enable ATOM atomic fact extraction (Phase 1.3).
    /// Extracts self-contained facts as 5-tuples for better granularity.
    /// Defaults to `false` when omitted.
    #[serde(default)]
    pub use_atomic_facts: bool,

    /// Maximum tokens per atomic fact.
    /// Facts longer than this will be rejected.
    /// Defaults to `400` when omitted.
    #[serde(default = "default_max_fact_tokens")]
    pub max_fact_tokens: usize,
}
1109
/// Configuration for advanced GraphRAG features (Phases 2-3).
///
/// Every section is optional in serialized form: a missing section is filled
/// with that section type's `Default`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct AdvancedFeaturesConfig {
    /// Phase 2.1: Symbolic Anchoring (CatRAG)
    /// Automatically applied for conceptual queries - no config needed
    #[serde(default)]
    pub symbolic_anchoring: SymbolicAnchoringConfig,

    /// Phase 2.2: Dynamic Edge Weighting
    /// Query-aware relationship weight adjustment
    #[serde(default)]
    pub dynamic_weighting: DynamicWeightingConfig,

    /// Phase 2.3: Causal Chain Analysis
    /// Multi-step causal reasoning
    #[serde(default)]
    pub causal_analysis: CausalAnalysisConfig,

    /// Phase 3.1: Hierarchical Relationship Clustering
    /// Multi-level relationship organization
    #[serde(default)]
    pub hierarchical_clustering: HierarchicalClusteringConfig,

    /// Phase 3.2: Graph Weight Optimization (DW-GRPO)
    /// Heuristic optimization of relationship weights
    #[serde(default)]
    pub weight_optimization: WeightOptimizationConfig,
}
1138
/// Configuration for Symbolic Anchoring (Phase 2.1).
///
/// All fields are optional in serialized form.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SymbolicAnchoringConfig {
    /// Minimum relevance score to keep an anchor (0.0-1.0).
    /// Defaults to `0.3` when omitted.
    #[serde(default = "default_anchor_min_relevance")]
    pub min_relevance: f32,

    /// Maximum number of anchors to extract per query.
    /// Defaults to `5` when omitted.
    #[serde(default = "default_max_anchors")]
    pub max_anchors: usize,

    /// Maximum entities per anchor.
    /// Defaults to `10` when omitted.
    #[serde(default = "default_max_entities_per_anchor")]
    pub max_entities_per_anchor: usize,
}
1154
/// Configuration for Dynamic Edge Weighting (Phase 2.2).
///
/// Every boost defaults to `true` when omitted from the serialized input.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct DynamicWeightingConfig {
    /// Enable semantic boost using embeddings
    #[serde(default = "default_true")]
    pub enable_semantic_boost: bool,

    /// Enable temporal boost for recent relationships
    #[serde(default = "default_true")]
    pub enable_temporal_boost: bool,

    /// Enable conceptual boost for matching concepts
    #[serde(default = "default_true")]
    pub enable_concept_boost: bool,

    /// Enable causal boost for strong causal relationships
    #[serde(default = "default_true")]
    pub enable_causal_boost: bool,
}
1174
/// Configuration for Causal Chain Analysis (Phase 2.3).
///
/// All fields are optional in serialized form.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CausalAnalysisConfig {
    /// Minimum confidence for causal chains (0.0-1.0).
    /// Defaults to `0.3` when omitted.
    #[serde(default = "default_causal_min_confidence")]
    pub min_confidence: f32,

    /// Minimum causal strength to consider (0.0-1.0).
    /// Defaults to `0.5` when omitted.
    #[serde(default = "default_causal_min_strength")]
    pub min_causal_strength: f32,

    /// Maximum chain depth to search.
    /// Defaults to `5` when omitted.
    #[serde(default = "default_max_chain_depth")]
    pub max_chain_depth: usize,

    /// Require temporal consistency in chains.
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    pub require_temporal_consistency: bool,
}
1194
/// Configuration for Hierarchical Relationship Clustering (Phase 3.1).
///
/// All fields are optional in serialized form.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HierarchicalClusteringConfig {
    /// Number of hierarchy levels (2-5).
    /// Defaults to `3` when omitted.
    #[serde(default = "default_num_levels")]
    pub num_levels: usize,

    /// Resolution parameters for each level (higher = more clusters).
    /// Length should match `num_levels`; nothing in this struct enforces it.
    /// Defaults to `[1.0, 0.5, 0.2]` when omitted.
    #[serde(default = "default_resolutions")]
    pub resolutions: Vec<f32>,

    /// Minimum relationships per cluster.
    /// Defaults to `2` when omitted.
    #[serde(default = "default_min_cluster_size")]
    pub min_cluster_size: usize,

    /// Generate LLM summaries for clusters (requires Ollama).
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    pub generate_summaries: bool,
}
1215
/// Configuration for Graph Weight Optimization (Phase 3.2).
///
/// All fields are optional in serialized form.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WeightOptimizationConfig {
    /// Learning rate for weight adjustments (0.01-0.5).
    /// Defaults to `0.1` when omitted.
    #[serde(default = "default_learning_rate")]
    pub learning_rate: f32,

    /// Maximum optimization iterations.
    /// Defaults to `20` when omitted.
    #[serde(default = "default_max_iterations")]
    pub max_iterations: usize,

    /// Window size for slope calculation.
    /// Defaults to `3` when omitted.
    #[serde(default = "default_slope_window")]
    pub slope_window: usize,

    /// Minimum slope to avoid stagnation.
    /// Defaults to `0.01` when omitted.
    #[serde(default = "default_stagnation_threshold")]
    pub stagnation_threshold: f32,

    /// Use LLM for quality evaluation.
    /// Defaults to `true` when omitted.
    #[serde(default = "default_true")]
    pub use_llm_eval: bool,

    /// Objective weights (relevance, faithfulness, conciseness).
    /// Defaults to `ObjectiveWeightsConfig::default()` when omitted.
    #[serde(default)]
    pub objective_weights: ObjectiveWeightsConfig,
}
1243
/// Configuration for optimization objective weights.
///
/// The three defaults (0.4, 0.4, 0.2) sum to 1.0; nothing in this struct
/// normalizes or validates user-supplied values.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ObjectiveWeightsConfig {
    /// Weight for relevance objective (0.0-1.0).
    /// Defaults to `0.4` when omitted.
    #[serde(default = "default_relevance_weight")]
    pub relevance: f32,

    /// Weight for faithfulness objective (0.0-1.0).
    /// Defaults to `0.4` when omitted.
    #[serde(default = "default_faithfulness_weight")]
    pub faithfulness: f32,

    /// Weight for conciseness objective (0.0-1.0).
    /// Defaults to `0.2` when omitted.
    #[serde(default = "default_conciseness_weight")]
    pub conciseness: f32,
}
1259
/// Configuration for retrieval operations.
///
/// Both fields are required on deserialization (no serde defaults).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RetrievalConfig {
    /// Number of top results to return
    pub top_k: usize,

    /// Search algorithm to use (`Config::default()` sets this to "cosine")
    pub search_algorithm: String,
}
1269
/// Configuration for parallel processing.
///
/// All fields are required on deserialization (no serde defaults).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ParallelConfig {
    /// Number of threads to use for parallel processing (0 = auto-detect)
    pub num_threads: usize,

    /// Enable parallel processing
    pub enabled: bool,

    /// Minimum batch size for parallel processing
    pub min_batch_size: usize,

    /// Chunk size for parallel text processing
    pub chunk_batch_size: usize,

    /// Parallel processing for embeddings
    pub parallel_embeddings: bool,

    /// Parallel graph construction
    pub parallel_graph_ops: bool,

    /// Parallel vector operations
    pub parallel_vector_ops: bool,
}
1294
1295// Default value functions
/// Default embedding vector dimension.
fn default_embedding_dim() -> usize {
    384_usize
}

/// Default embedding backend name.
fn default_embedding_backend() -> String {
    String::from("hash")
}

/// Default maximum number of connections per graph node.
fn default_max_connections() -> usize {
    10_usize
}

/// Default similarity threshold.
fn default_similarity_threshold() -> f32 {
    0.8_f32
}

/// Default chunk size for text processing.
fn default_chunk_size() -> usize {
    1000_usize
}

/// Default overlap between chunks.
fn default_chunk_overlap() -> usize {
    200_usize
}

/// Default language list: English only.
fn default_languages() -> Vec<String> {
    vec![String::from("en")]
}

/// Default minimum confidence for entity extraction.
fn default_min_confidence() -> f32 {
    0.7_f32
}

/// Default entity types to extract.
fn default_entity_types() -> Vec<String> {
    ["PERSON", "ORG", "LOCATION"]
        .iter()
        .map(|label| String::from(*label))
        .collect()
}

/// Default number of top results to return.
fn default_top_k() -> usize {
    10_usize
}

/// Default search algorithm name.
fn default_search_algorithm() -> String {
    String::from("cosine")
}

/// Default thread count; `0` means auto-detect.
fn default_num_threads() -> usize {
    0_usize
}

/// Default minimum batch size for parallel processing.
fn default_min_batch_size() -> usize {
    10_usize
}

/// Default chunk batch size for parallel text processing.
fn default_chunk_batch_size() -> usize {
    100_usize
}

/// Serde helper for boolean fields that default to `true`.
fn default_true() -> bool {
    true
}

/// Default confidence threshold for relationships.
fn default_relationship_confidence() -> f32 {
    0.5_f32
}

/// Default maximum number of gleaning rounds.
fn default_max_gleaning_rounds() -> usize {
    3_usize
}

/// Default minimum confidence for relationship validation.
fn default_validation_confidence() -> f32 {
    0.7_f32
}
1355
1356// Advanced features defaults (Phases 2-3)
1357
1358// Phase 2.1: Symbolic Anchoring
/// Default minimum relevance score for keeping an anchor.
fn default_anchor_min_relevance() -> f32 {
    0.3_f32
}

/// Default maximum number of anchors per query.
fn default_max_anchors() -> usize {
    5_usize
}

/// Default maximum number of entities per anchor.
fn default_max_entities_per_anchor() -> usize {
    10_usize
}
1370
1371// Phase 2.3: Causal Analysis
/// Default minimum confidence for causal chains.
fn default_causal_min_confidence() -> f32 {
    0.3_f32
}

/// Default minimum causal strength.
fn default_causal_min_strength() -> f32 {
    0.5_f32
}

/// Default maximum causal chain depth.
fn default_max_chain_depth() -> usize {
    5_usize
}
1383
1384// Phase 3.1: Hierarchical Clustering
/// Default number of hierarchy levels.
fn default_num_levels() -> usize {
    3_usize
}

/// Default per-level resolutions (one entry per default level).
fn default_resolutions() -> Vec<f32> {
    [1.0_f32, 0.5, 0.2].to_vec()
}

/// Default minimum number of relationships per cluster.
fn default_min_cluster_size() -> usize {
    2_usize
}
1396
1397// Phase 3.2: Weight Optimization
/// Default learning rate for weight adjustments.
fn default_learning_rate() -> f32 {
    0.1_f32
}

/// Default maximum number of optimization iterations.
fn default_max_iterations() -> usize {
    20_usize
}

/// Default window size for slope calculation.
fn default_slope_window() -> usize {
    3_usize
}

/// Default minimum slope below which optimization counts as stagnating.
fn default_stagnation_threshold() -> f32 {
    0.01_f32
}

/// Default weight of the relevance objective.
fn default_relevance_weight() -> f32 {
    0.4_f32
}

/// Default weight of the faithfulness objective.
fn default_faithfulness_weight() -> f32 {
    0.4_f32
}

/// Default weight of the conciseness objective.
fn default_conciseness_weight() -> f32 {
    0.2_f32
}
1425
/// Default maximum number of tokens per atomic fact.
fn default_max_fact_tokens() -> usize {
    400_usize
}

/// Default pipeline approach name.
fn default_approach() -> String {
    String::from("semantic")
}

/// Default maximum traversal depth.
fn default_max_traversal_depth() -> usize {
    3_usize
}

/// Default maximum number of paths to find.
fn default_max_paths() -> usize {
    10_usize
}

/// Default minimum relationship strength considered during traversal.
fn default_min_relationship_strength() -> f32 {
    0.3_f32
}

/// Default auto-save interval in seconds (5 minutes).
fn default_auto_save_interval() -> u64 {
    const FIVE_MINUTES_SECS: u64 = 5 * 60;
    FIVE_MINUTES_SECS
}

/// Default number of saved versions to keep.
fn default_max_versions() -> usize {
    5_usize
}
1448
1449impl Default for Config {
1450    fn default() -> Self {
1451        Self {
1452            output_dir: "./output".to_string(),
1453            chunk_size: default_chunk_size(),
1454            chunk_overlap: default_chunk_overlap(),
1455            max_entities_per_chunk: Some(10),
1456            top_k_results: Some(default_top_k()),
1457            similarity_threshold: Some(default_similarity_threshold()),
1458            approach: default_approach(),
1459            embeddings: EmbeddingConfig {
1460                dimension: default_embedding_dim(),
1461                backend: default_embedding_backend(),
1462                model: Some("sentence-transformers/all-MiniLM-L6-v2".to_string()),
1463                fallback_to_hash: true,
1464                api_endpoint: None,
1465                api_key: None,
1466                cache_dir: None,
1467                batch_size: default_batch_size(),
1468            },
1469            graph: GraphConfig {
1470                max_connections: default_max_connections(),
1471                similarity_threshold: default_similarity_threshold(),
1472                extract_relationships: default_true(),
1473                relationship_confidence_threshold: default_relationship_confidence(),
1474                traversal: TraversalConfigParams::default(),
1475            },
1476            text: TextConfig {
1477                chunk_size: default_chunk_size(),
1478                chunk_overlap: default_chunk_overlap(),
1479                languages: default_languages(),
1480            },
1481            entities: EntityConfig {
1482                min_confidence: default_min_confidence(),
1483                entity_types: default_entity_types(),
1484                use_gleaning: false,
1485                max_gleaning_rounds: default_max_gleaning_rounds(),
1486                enable_triple_reflection: false,
1487                validation_min_confidence: default_validation_confidence(),
1488                use_atomic_facts: false,
1489                max_fact_tokens: default_max_fact_tokens(),
1490            },
1491            retrieval: RetrievalConfig {
1492                top_k: default_top_k(),
1493                search_algorithm: default_search_algorithm(),
1494            },
1495            parallel: ParallelConfig {
1496                num_threads: default_num_threads(),
1497                enabled: true,
1498                min_batch_size: default_min_batch_size(),
1499                chunk_batch_size: default_chunk_batch_size(),
1500                parallel_embeddings: true,
1501                parallel_graph_ops: true,
1502                parallel_vector_ops: true,
1503            },
1504            ollama: crate::ollama::OllamaConfig::default(),
1505            gliner: GlinerConfig::default(),
1506            enhancements: enhancements::EnhancementsConfig::default(),
1507            auto_save: AutoSaveConfig {
1508                enabled: false,
1509                base_dir: None,
1510                interval_seconds: default_auto_save_interval(),
1511                workspace_name: None,
1512                max_versions: default_max_versions(),
1513            },
1514            summarization: crate::summarization::HierarchicalConfig::default(),
1515            zero_cost_approach: ZeroCostApproachConfig::default(),
1516            advanced_features: AdvancedFeaturesConfig::default(),
1517            suppress_progress_bars: false,
1518        }
1519    }
1520}
1521
1522impl Default for AutoSaveConfig {
1523    fn default() -> Self {
1524        Self {
1525            enabled: false,
1526            base_dir: None,
1527            interval_seconds: default_auto_save_interval(),
1528            workspace_name: None,
1529            max_versions: default_max_versions(),
1530        }
1531    }
1532}
1533
1534impl Config {
1535    /// Turnkey config for a workspace directory. Persists graph + index to
1536    /// `workspace`. Uses hash-fallback embeddings and pattern-based entity
1537    /// extraction by default — works offline, no Ollama required. Chain
1538    /// `.with_ollama(...)` etc. to enable LLM features.
1539    pub fn quick(workspace: impl AsRef<std::path::Path>) -> Self {
1540        let ws = workspace.as_ref();
1541        let ws_str = ws.to_string_lossy().into_owned();
1542        let (base, name) = match (ws.parent(), ws.file_name()) {
1543            (Some(p), Some(f)) if !p.as_os_str().is_empty() => (
1544                p.to_string_lossy().into_owned(),
1545                f.to_string_lossy().into_owned(),
1546            ),
1547            _ => (".".to_string(), ws_str.clone()),
1548        };
1549        Self {
1550            output_dir: ws_str,
1551            auto_save: AutoSaveConfig {
1552                enabled: true,
1553                base_dir: Some(base),
1554                workspace_name: Some(name),
1555                ..AutoSaveConfig::default()
1556            },
1557            ..Self::default()
1558        }
1559    }
1560
1561    /// Enable Ollama with sensible defaults (localhost:11434, llama3.2:3b).
1562    pub fn with_ollama(mut self) -> Self {
1563        self.ollama.enabled = true;
1564        self.embeddings.backend = "ollama".to_string();
1565        self
1566    }
1567
1568    /// Override Ollama host (e.g. `http://gpu-box:11434`).
1569    pub fn with_ollama_host(mut self, host: impl Into<String>) -> Self {
1570        self.ollama.host = host.into();
1571        self.ollama.enabled = true;
1572        self
1573    }
1574
1575    /// Override chunk size and overlap (overlap defaults to 20 % of size).
1576    pub fn with_chunk_size(mut self, size: usize) -> Self {
1577        self.chunk_size = size;
1578        self.chunk_overlap = size / 5;
1579        self.text.chunk_size = size;
1580        self.text.chunk_overlap = size / 5;
1581        self
1582    }
1583}
1584
1585impl Default for SymbolicAnchoringConfig {
1586    fn default() -> Self {
1587        Self {
1588            min_relevance: default_anchor_min_relevance(),
1589            max_anchors: default_max_anchors(),
1590            max_entities_per_anchor: default_max_entities_per_anchor(),
1591        }
1592    }
1593}
1594
1595impl Default for DynamicWeightingConfig {
1596    fn default() -> Self {
1597        Self {
1598            enable_semantic_boost: default_true(),
1599            enable_temporal_boost: default_true(),
1600            enable_concept_boost: default_true(),
1601            enable_causal_boost: default_true(),
1602        }
1603    }
1604}
1605
1606impl Default for CausalAnalysisConfig {
1607    fn default() -> Self {
1608        Self {
1609            min_confidence: default_causal_min_confidence(),
1610            min_causal_strength: default_causal_min_strength(),
1611            max_chain_depth: default_max_chain_depth(),
1612            require_temporal_consistency: default_true(),
1613        }
1614    }
1615}
1616
1617impl Default for HierarchicalClusteringConfig {
1618    fn default() -> Self {
1619        Self {
1620            num_levels: default_num_levels(),
1621            resolutions: default_resolutions(),
1622            min_cluster_size: default_min_cluster_size(),
1623            generate_summaries: default_true(),
1624        }
1625    }
1626}
1627
1628impl Default for WeightOptimizationConfig {
1629    fn default() -> Self {
1630        Self {
1631            learning_rate: default_learning_rate(),
1632            max_iterations: default_max_iterations(),
1633            slope_window: default_slope_window(),
1634            stagnation_threshold: default_stagnation_threshold(),
1635            use_llm_eval: default_true(),
1636            objective_weights: ObjectiveWeightsConfig::default(),
1637        }
1638    }
1639}
1640
1641impl Default for ObjectiveWeightsConfig {
1642    fn default() -> Self {
1643        Self {
1644            relevance: default_relevance_weight(),
1645            faithfulness: default_faithfulness_weight(),
1646            conciseness: default_conciseness_weight(),
1647        }
1648    }
1649}
1650
1651impl Config {
1652    /// Load configuration with hierarchical merging (requires `hierarchical-config` feature)
1653    ///
1654    /// Configuration sources are merged in order of priority (lowest to highest):
1655    /// 1. Built-in defaults
1656    /// 2. User config: `~/.graphrag/config.toml`
1657    /// 3. Project config: `./graphrag.toml`
1658    /// 4. Environment variables: `GRAPHRAG_*` (e.g., `GRAPHRAG_OLLAMA_HOST`)
1659    ///
1660    /// # Example
1661    /// ```rust,no_run
1662    /// use graphrag_core::Config;
1663    ///
1664    /// // Auto-loads from all sources
1665    /// let config = Config::load()?;
1666    /// # Ok::<(), graphrag_core::GraphRAGError>(())
1667    /// ```
1668    #[cfg(feature = "hierarchical-config")]
1669    pub fn load() -> Result<Self> {
1670        use figment::{
1671            providers::{Env, Format, Serialized, Toml},
1672            Figment,
1673        };
1674
1675        // Build the configuration chain
1676        let mut figment = Figment::new()
1677            // 1. Start with defaults
1678            .merge(Serialized::defaults(Config::default()));
1679
1680        // 2. User-level config (~/.graphrag/config.toml)
1681        if let Some(home) = dirs::home_dir() {
1682            let user_config = home.join(".graphrag").join("config.toml");
1683            if user_config.exists() {
1684                figment = figment.merge(Toml::file(user_config));
1685            }
1686        }
1687
1688        // 3. Project-level config (./graphrag.toml)
1689        let project_config = std::path::Path::new("graphrag.toml");
1690        if project_config.exists() {
1691            figment = figment.merge(Toml::file(project_config));
1692        }
1693
1694        // 4. Environment variables (GRAPHRAG_*)
1695        // Maps GRAPHRAG_OLLAMA_HOST -> ollama.host
1696        figment = figment.merge(Env::prefixed("GRAPHRAG_").split("_"));
1697
1698        figment
1699            .extract()
1700            .map_err(|e| crate::core::GraphRAGError::Config {
1701                message: format!("Failed to load hierarchical configuration: {}", e),
1702            })
1703    }
1704
1705    /// Load configuration with hierarchical merging (stub for when feature is disabled)
1706    ///
1707    /// When the `hierarchical-config` feature is not enabled, this falls back to `Config::default()`.
1708    #[cfg(not(feature = "hierarchical-config"))]
1709    pub fn load() -> Result<Self> {
1710        Ok(Config::default())
1711    }
1712
1713    /// Load configuration from a TOML file with environment variable overrides
1714    ///
1715    /// This is the preferred method for loading configuration from a specific file
1716    /// while still allowing environment variable overrides.
1717    ///
1718    /// # Example
1719    /// ```rust,no_run
1720    /// use graphrag_core::Config;
1721    ///
1722    /// let config = Config::from_toml_file("./my-config.toml")?;
1723    /// # Ok::<(), graphrag_core::GraphRAGError>(())
1724    /// ```
1725    pub fn from_toml_file<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
1726        let content = fs::read_to_string(path.as_ref())?;
1727        let config: Config =
1728            toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1729                message: format!("Failed to parse TOML config: {}", e),
1730            })?;
1731        Ok(config)
1732    }
1733}
graphrag_core/config/mod.rs

graphrag_core/config/
mod.rs