Skip to main content

graphrag_core/config/
loader.rs

1// Allow dead code for configuration structures used for TOML parsing
2#![allow(dead_code)]
3
4use crate::config::Config;
5use crate::core::{GraphRAGError, Result};
6use std::fs;
7use std::path::Path;
8
9#[cfg(feature = "toml-support")]
10use toml;
11
12#[cfg(feature = "serde_json")]
13use serde_json;
14
/// On-disk serialization format of a configuration file.
#[derive(Debug, Clone)]
pub enum ConfigFormat {
    /// TOML configuration format
    Toml,
    /// JSON configuration format
    Json,
    /// YAML configuration format
    Yaml,
}

impl ConfigFormat {
    /// Determine the configuration format from a path's file extension.
    ///
    /// The extension is compared case-insensitively (ASCII), so `config.JSON`
    /// and `config.json` are both detected as [`ConfigFormat::Json`].
    /// `yaml` and `yml` both map to [`ConfigFormat::Yaml`].
    ///
    /// Paths with no extension, a non-UTF-8 extension, or an unrecognized
    /// extension fall back to [`ConfigFormat::Toml`].
    pub fn from_extension(path: &str) -> Self {
        // Normalize once so that the match below only deals with lowercase names.
        let extension = Path::new(path)
            .extension()
            .and_then(|ext| ext.to_str())
            .map(str::to_ascii_lowercase);

        match extension.as_deref() {
            Some("toml") => ConfigFormat::Toml,
            Some("json") => ConfigFormat::Json,
            Some("yaml" | "yml") => ConfigFormat::Yaml,
            _ => ConfigFormat::Toml, // Default
        }
    }
}
38
39/// Load configuration from file
40pub fn load_config(path: &str) -> Result<Config> {
41    let format = ConfigFormat::from_extension(path);
42
43    if !Path::new(path).exists() {
44        return Err(GraphRAGError::Config {
45            message: format!("Configuration file not found: {path}"),
46        });
47    }
48
49    let content = fs::read_to_string(path)?;
50
51    match format {
52        ConfigFormat::Toml => load_toml_config(&content),
53        ConfigFormat::Json => load_json_config(&content),
54        ConfigFormat::Yaml => load_yaml_config(&content),
55    }
56}
57
/// Parse `content` as TOML into a `RawConfig` and convert it to a [`Config`].
///
/// # Errors
///
/// Returns `GraphRAGError::Config` carrying the parser's message when the
/// text is not valid TOML for `RawConfig`.
#[cfg(feature = "toml-support")]
fn load_toml_config(content: &str) -> Result<Config> {
    toml::from_str::<RawConfig>(content)
        .map(convert_raw_config)
        .map_err(|e| GraphRAGError::Config {
            message: format!("Failed to parse TOML config: {e}"),
        })
}
66
67#[cfg(not(feature = "toml-support"))]
68fn load_toml_config(_content: &str) -> Result<Config> {
69    Err(GraphRAGError::Config {
70        message: "TOML support not enabled. Enable 'toml-support' feature.".to_string(),
71    })
72}
73
/// Parse `content` as JSON into a `RawConfig` and convert it to a [`Config`].
///
/// # Errors
///
/// Returns `GraphRAGError::Config` carrying the parser's message when the
/// text is not valid JSON for `RawConfig`.
#[cfg(feature = "serde_json")]
fn load_json_config(content: &str) -> Result<Config> {
    match serde_json::from_str::<RawConfig>(content) {
        Ok(raw_config) => Ok(convert_raw_config(raw_config)),
        Err(e) => Err(GraphRAGError::Config {
            message: format!("Failed to parse JSON config: {e}"),
        }),
    }
}
83
84#[cfg(not(feature = "serde_json"))]
85fn load_json_config(_content: &str) -> Result<Config> {
86    Err(GraphRAGError::Config {
87        message: "JSON support not enabled. Enable 'serde_json' feature.".to_string(),
88    })
89}
90
/// Parse `content` as YAML into a `RawConfig` and convert it to a [`Config`].
///
/// # Errors
///
/// Returns `GraphRAGError::Config` carrying the parser's message when the
/// text is not valid YAML for `RawConfig`.
#[cfg(feature = "yaml-support")]
fn load_yaml_config(content: &str) -> Result<Config> {
    let parsed: std::result::Result<RawConfig, _> = serde_yaml::from_str(content);

    parsed
        .map(convert_raw_config)
        .map_err(|e| GraphRAGError::Config {
            message: format!("Failed to parse YAML config: {e}"),
        })
}
100
101#[cfg(not(feature = "yaml-support"))]
102fn load_yaml_config(_content: &str) -> Result<Config> {
103    Err(GraphRAGError::Config {
104        message: "YAML support not enabled. Enable 'yaml-support' feature.".to_string(),
105    })
106}
107
/// Raw, file-shaped configuration as deserialized from disk.
///
/// This is the deserialization target for all three loaders in this module
/// (TOML, JSON and YAML), not only TOML. Each field corresponds to one
/// top-level section of the file, keyed by the field name.
///
/// Every section is marked `#[serde(default)]`, so a file may omit any of
/// them; a missing section is filled with that section type's `Default`.
///
/// The raw value is turned into the runtime `Config` by `convert_raw_config`,
/// which currently applies only a small subset of these sections.
#[derive(Debug, serde::Deserialize, Default)]
#[allow(dead_code)]
struct RawConfig {
    /// `system` section.
    #[serde(default)]
    system: SystemConfig,
    /// `features` section (per-subsystem toggles).
    #[serde(default)]
    features: FeaturesConfig,
    /// `text_processing` section.
    #[serde(default)]
    text_processing: RawTextProcessingConfig,
    /// `entity_extraction` section.
    #[serde(default)]
    entity_extraction: RawEntityExtractionConfig,
    /// `graph_construction` section.
    #[serde(default)]
    graph_construction: RawGraphConstructionConfig,
    /// `vector_processing` section.
    #[serde(default)]
    vector_processing: RawVectorProcessingConfig,
    /// `query_processing` section.
    #[serde(default)]
    query_processing: RawQueryProcessingConfig,
    /// `adaptive_retrieval` section.
    #[serde(default)]
    adaptive_retrieval: RawAdaptiveRetrievalConfig,
    /// `ranking_policies` section.
    #[serde(default)]
    ranking_policies: RawRankingPoliciesConfig,
    /// `reranking` section.
    #[serde(default)]
    reranking: RawRerankingConfig,
    /// `generation` section.
    #[serde(default)]
    generation: RawGenerationConfig,
    /// `ollama` section.
    #[serde(default)]
    ollama: RawOllamaConfig,
    /// `async_processing` section.
    #[serde(default)]
    async_processing: RawAsyncProcessingConfig,
    /// `function_calling` section.
    #[serde(default)]
    function_calling: RawFunctionCallingConfig,
    /// `monitoring` section.
    #[serde(default)]
    monitoring: RawMonitoringConfig,
    /// `storage` section.
    #[serde(default)]
    storage: RawStorageConfig,
    /// `parallel_processing` section.
    #[serde(default)]
    parallel_processing: RawParallelProcessingConfig,
    /// `logging` section.
    #[serde(default)]
    logging: RawLoggingConfig,
    /// `experimental` section.
    #[serde(default)]
    experimental: RawExperimentalConfig,
}
151
/// Raw `system` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// None of these values are applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
#[allow(dead_code)]
struct SystemConfig {
    log_level: Option<String>,
    // Unit is megabytes, per the `_mb` suffix.
    max_memory_mb: Option<u64>,
    temp_dir: Option<String>,
    output_dir: Option<String>,
}
160
/// Raw `features` section: one optional boolean toggle per subsystem.
///
/// All keys are optional; an absent key deserializes to `None`.
/// `convert_raw_config` does not apply any of these toggles to `Config` yet.
#[derive(Debug, serde::Deserialize, Default)]
#[allow(dead_code)]
struct FeaturesConfig {
    text_processing: Option<bool>,
    entity_extraction: Option<bool>,
    graph_construction: Option<bool>,
    vector_processing: Option<bool>,
    async_processing: Option<bool>,
    function_calling: Option<bool>,
    monitoring: Option<bool>,
}
172
/// Raw `text_processing` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// `convert_raw_config` does not copy any of these values into `Config` yet
/// (the `chunk_size` assignment there is commented out).
#[derive(Debug, serde::Deserialize, Default)]
#[allow(dead_code)]
struct RawTextProcessingConfig {
    enabled: Option<bool>,
    // NOTE(review): unit of the chunk sizes (characters vs. tokens) is not
    // visible in this file — confirm against the chunker.
    chunk_size: Option<usize>,
    chunk_overlap: Option<usize>,
    min_chunk_size: Option<usize>,
    max_chunk_size: Option<usize>,
    normalize_whitespace: Option<bool>,
    remove_artifacts: Option<bool>,
    extract_keywords: Option<bool>,
    keyword_min_score: Option<f64>,
    /// Nested `text_processing.enrichment` sub-section.
    #[serde(default)]
    enrichment: Option<RawEnrichmentConfig>,
}
188
/// Raw `text_processing.enrichment` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
#[allow(dead_code)]
struct RawEnrichmentConfig {
    enabled: Option<bool>,
    auto_detect_format: Option<bool>,
    // NOTE(review): the accepted parser names are not defined in this file.
    parser_type: Option<String>,
    extract_keywords: Option<bool>,
    max_keywords_per_chunk: Option<usize>,
    use_tfidf: Option<bool>,
    generate_summaries: Option<bool>,
    min_chunk_length_for_summary: Option<usize>,
    max_summary_length: Option<usize>,
    extract_chapter: Option<bool>,
    extract_section: Option<bool>,
    extract_position: Option<bool>,
    calculate_confidence: Option<bool>,
    detect_headings: Option<bool>,
    detect_numbering: Option<bool>,
    detect_underlines: Option<bool>,
    detect_all_caps: Option<bool>,
    detect_roman_numerals: Option<bool>,
}
211
/// Raw `entity_extraction` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Only `min_confidence` is currently applied by `convert_raw_config`.
#[derive(Debug, serde::Deserialize, Default)]
#[allow(dead_code)]
struct RawEntityExtractionConfig {
    enabled: Option<bool>,
    /// Copied into `config.entities.min_confidence` when present.
    min_confidence: Option<f32>,
    use_gleaning: Option<bool>,
    max_gleaning_rounds: Option<usize>,
    gleaning_improvement_threshold: Option<f64>,
    semantic_merging: Option<bool>,
    merge_similarity_threshold: Option<f64>,
    automatic_linking: Option<bool>,
    linking_confidence_threshold: Option<f64>,
    /// Nested `entity_extraction.gleaning` sub-section.
    gleaning: Option<RawGleaningConfig>,
}
226
/// Raw `entity_extraction.gleaning` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawGleaningConfig {
    focus_areas: Option<Vec<String>>,
    context_window: Option<usize>,
    llm_temperature: Option<f64>,
}
233
/// Raw `graph_construction` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// `convert_raw_config` currently applies only `extract_relationships` and
/// `relationship_confidence_threshold`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawGraphConstructionConfig {
    enabled: Option<bool>,
    incremental_updates: Option<bool>,
    use_pagerank: Option<bool>,
    pagerank_damping: Option<f64>,
    pagerank_iterations: Option<usize>,
    pagerank_convergence: Option<f64>,
    /// Copied into `config.graph.extract_relationships` when present.
    extract_relationships: Option<bool>,
    /// Narrowed to `f32` and copied into
    /// `config.graph.relationship_confidence_threshold` when present.
    relationship_confidence_threshold: Option<f64>,
}
245
/// Raw `vector_processing` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawVectorProcessingConfig {
    enabled: Option<bool>,
    embedding_model: Option<String>,
    embedding_dimensions: Option<usize>,
    use_hnsw_index: Option<bool>,
    hnsw_ef_construction: Option<usize>,
    hnsw_m: Option<usize>,
    similarity_threshold: Option<f64>,
}
256
/// Raw `query_processing` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawQueryProcessingConfig {
    enabled: Option<bool>,
    use_advanced_pipeline: Option<bool>,
    use_intent_classification: Option<bool>,
    use_concept_extraction: Option<bool>,
    use_temporal_parsing: Option<bool>,
    confidence_threshold: Option<f64>,
    /// Nested `query_processing.intent_classification` sub-section.
    intent_classification: Option<RawIntentClassificationConfig>,
}
267
/// Raw `query_processing.intent_classification` sub-section: one optional
/// list of pattern strings per intent category.
///
/// NOTE(review): whether the patterns are plain keywords or regular
/// expressions is not visible in this file — confirm against the consumer.
#[derive(Debug, serde::Deserialize, Default)]
struct RawIntentClassificationConfig {
    factual_patterns: Option<Vec<String>>,
    relational_patterns: Option<Vec<String>>,
    temporal_patterns: Option<Vec<String>>,
    causal_patterns: Option<Vec<String>>,
    comparative_patterns: Option<Vec<String>>,
}
276
/// Raw `adaptive_retrieval` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawAdaptiveRetrievalConfig {
    enabled: Option<bool>,
    default_strategies: Option<Vec<String>>,
    // Map from a string key to a weight; presumably keyed by strategy name —
    // verify against the retrieval code.
    strategy_weights: Option<std::collections::HashMap<String, f64>>,
    dynamic_weighting: Option<bool>,
    diversity_factor: Option<f64>,
    max_results_per_strategy: Option<usize>,
}
286
/// Raw `ranking_policies` section of the configuration file.
///
/// Holds per-policy boolean switches plus three optional nested
/// sub-sections with the policies' parameters.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawRankingPoliciesConfig {
    enabled: Option<bool>,
    use_elbow_detection: Option<bool>,
    use_top_k_diversity: Option<bool>,
    use_threshold_filtering: Option<bool>,
    use_intent_aware_ranking: Option<bool>,
    use_confidence_filtering: Option<bool>,
    /// Nested `ranking_policies.elbow_detection` sub-section.
    elbow_detection: Option<RawElbowDetectionConfig>,
    /// Nested `ranking_policies.top_k` sub-section.
    top_k: Option<RawTopKConfig>,
    /// Nested `ranking_policies.threshold` sub-section.
    threshold: Option<RawThresholdConfig>,
}
299
/// Raw `ranking_policies.elbow_detection` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawElbowDetectionConfig {
    min_results: Option<usize>,
    max_results: Option<usize>,
    smoothing_factor: Option<f64>,
}
306
/// Raw `ranking_policies.top_k` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawTopKConfig {
    k: Option<usize>,
    diversity_threshold: Option<f64>,
    entity_type_balance: Option<bool>,
}
313
/// Raw `ranking_policies.threshold` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawThresholdConfig {
    min_score: Option<f64>,
    confidence_weight: Option<f64>,
}
319
/// Raw `reranking` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawRerankingConfig {
    enabled: Option<bool>,
    use_confidence_filtering: Option<bool>,
    use_cross_encoder: Option<bool>,
    use_diversity_selection: Option<bool>,
    final_result_limit: Option<usize>,
}
328
/// Raw `generation` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawGenerationConfig {
    enabled: Option<bool>,
    use_context_assembly: Option<bool>,
    // NOTE(review): unit (characters vs. tokens) is not visible in this file.
    max_context_length: Option<usize>,
    use_prompt_templates: Option<bool>,
    include_citations: Option<bool>,
    include_confidence_scores: Option<bool>,
    /// Nested `generation.templates` sub-section.
    templates: Option<RawTemplatesConfig>,
}
339
/// Raw `generation.templates` sub-section: one optional template string per
/// query category.
#[derive(Debug, serde::Deserialize, Default)]
struct RawTemplatesConfig {
    factual: Option<String>,
    relational: Option<String>,
    temporal: Option<String>,
}
346
/// Raw `ollama` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawOllamaConfig {
    enabled: Option<bool>,
    base_url: Option<String>,
    model_name: Option<String>,
    embedding_model: Option<String>,
    // Unit is seconds, per the `_seconds` suffix.
    timeout_seconds: Option<u64>,
    max_retries: Option<u32>,
    /// Nested `ollama.generation` sub-section.
    generation: Option<RawOllamaGenerationConfig>,
}
357
/// Raw `ollama.generation` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawOllamaGenerationConfig {
    temperature: Option<f64>,
    top_p: Option<f64>,
    max_tokens: Option<u32>,
    stream: Option<bool>,
}
365
/// Raw `async_processing` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawAsyncProcessingConfig {
    enabled: Option<bool>,
    max_concurrent_llm_calls: Option<usize>,
    max_concurrent_embeddings: Option<usize>,
    max_concurrent_documents: Option<usize>,
    // Rates are fractional values per second, per the field names.
    llm_rate_limit_per_second: Option<f64>,
    embedding_rate_limit_per_second: Option<f64>,
    /// Nested `async_processing.batching` sub-section.
    batching: Option<RawBatchingConfig>,
}
376
/// Raw `async_processing.batching` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawBatchingConfig {
    batch_size: Option<usize>,
    batch_timeout_seconds: Option<u64>,
    max_batch_memory_mb: Option<usize>,
}
383
/// Raw `function_calling` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawFunctionCallingConfig {
    enabled: Option<bool>,
    max_function_calls: Option<usize>,
    timeout_per_call_seconds: Option<u64>,
    allow_nested_calls: Option<bool>,
}
391
/// Raw `monitoring` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawMonitoringConfig {
    enabled: Option<bool>,
    collect_performance_metrics: Option<bool>,
    collect_usage_statistics: Option<bool>,
    health_check_interval_seconds: Option<u64>,
    log_slow_operations: Option<bool>,
    // Unit is milliseconds, per the `_ms` suffix.
    slow_operation_threshold_ms: Option<u64>,
    /// Nested `monitoring.benchmarking` sub-section.
    benchmarking: Option<RawBenchmarkingConfig>,
}
402
/// Raw `monitoring.benchmarking` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawBenchmarkingConfig {
    enabled: Option<bool>,
    run_periodic_benchmarks: Option<bool>,
    benchmark_interval_hours: Option<u64>,
    auto_recommendations: Option<bool>,
}
410
/// Raw `storage` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawStorageConfig {
    // `type` is a Rust keyword, hence the raw identifier; the key in the
    // file is plain `type`.
    r#type: Option<String>,
    workspace_isolation: Option<bool>,
    max_workspaces: Option<usize>,
    backup_enabled: Option<bool>,
    backup_interval_hours: Option<u64>,
    /// Nested `storage.persistent` sub-section.
    persistent: Option<RawPersistentConfig>,
}
420
/// Raw `storage.persistent` sub-section.
///
/// All keys are optional; an absent key deserializes to `None`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawPersistentConfig {
    database_path: Option<String>,
    // NOTE(review): "wal" presumably means write-ahead logging — confirm
    // against the storage backend.
    enable_wal: Option<bool>,
    cache_size_mb: Option<usize>,
}
427
/// Raw `parallel_processing` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// `convert_raw_config` currently applies only `enabled` and `max_threads`.
#[derive(Debug, serde::Deserialize, Default)]
struct RawParallelProcessingConfig {
    /// Copied into `config.parallel.enabled` when present.
    enabled: Option<bool>,
    /// Copied into `config.parallel.num_threads` when present. A value of
    /// `0` is replaced by `num_cpus::get()` when the `parallel-processing`
    /// feature is enabled, and by `1` otherwise.
    max_threads: Option<usize>,
    thread_pool_size: Option<usize>,
    load_balancing: Option<bool>,
    work_stealing: Option<bool>,
}
436
/// Raw `logging` section of the configuration file.
///
/// All keys are optional; an absent key deserializes to `None`.
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawLoggingConfig {
    // NOTE(review): accepted level and format names are not defined in this
    // file. `system.log_level` also exists; which one wins is not visible here.
    level: Option<String>,
    format: Option<String>,
    include_timestamps: Option<bool>,
    include_module_path: Option<bool>,
    log_to_file: Option<bool>,
    log_file: Option<String>,
    max_log_file_mb: Option<usize>,
    rotate_logs: Option<bool>,
}
448
/// Raw `experimental` section: optional boolean switches for experimental
/// features.
///
/// Parsed but not applied by `convert_raw_config` at present.
#[derive(Debug, serde::Deserialize, Default)]
struct RawExperimentalConfig {
    neural_reranking: Option<bool>,
    federated_learning: Option<bool>,
    real_time_updates: Option<bool>,
    distributed_processing: Option<bool>,
}
456
457/// Convert raw configuration to the main Config struct
458fn convert_raw_config(raw: RawConfig) -> Config {
459    let mut config = Config::default();
460
461    // Apply feature toggles
462    if let Some(_text_enabled) = raw.features.text_processing {
463        // Apply to appropriate config sections
464    }
465
466    // Apply text processing configuration
467    if let Some(_chunk_size) = raw.text_processing.chunk_size {
468        // config.text.chunk_size = chunk_size;
469    }
470
471    // Apply entity extraction configuration
472    if let Some(min_confidence) = raw.entity_extraction.min_confidence {
473        config.entities.min_confidence = min_confidence;
474    }
475
476    // Apply graph construction configuration
477    if let Some(extract_rels) = raw.graph_construction.extract_relationships {
478        config.graph.extract_relationships = extract_rels;
479    }
480    if let Some(threshold) = raw.graph_construction.relationship_confidence_threshold {
481        config.graph.relationship_confidence_threshold = threshold as f32;
482    }
483
484    // Apply parallel processing configuration
485    if let Some(enabled) = raw.parallel_processing.enabled {
486        config.parallel.enabled = enabled;
487    }
488    if let Some(max_threads) = raw.parallel_processing.max_threads {
489        config.parallel.num_threads = if max_threads == 0 {
490            #[cfg(feature = "parallel-processing")]
491            {
492                num_cpus::get()
493            }
494            #[cfg(not(feature = "parallel-processing"))]
495            {
496                1
497            }
498        } else {
499            max_threads
500        };
501    }
502
503    config
504}
505
506/// Save configuration to file
507pub fn save_config(config: &Config, path: &str) -> Result<()> {
508    let format = ConfigFormat::from_extension(path);
509
510    match format {
511        ConfigFormat::Toml => save_toml_config(config, path),
512        ConfigFormat::Json => save_json_config(config, path),
513        ConfigFormat::Yaml => save_yaml_config(config, path),
514    }
515}
516
/// Write the static TOML configuration template to `path`.
///
/// The `_config` argument is not consulted: the written content is a fixed
/// template, not a serialization of the given `Config`.
///
/// NOTE(review): the template's section names (`text`, `entities`, `graph`,
/// `parallel`) differ from the section names `RawConfig` deserializes
/// (`text_processing`, `entity_extraction`, ...), so a file written here is
/// presumably not picked up by `load_config` — confirm the intended schema.
///
/// # Errors
///
/// Propagates any I/O error from writing the file.
#[cfg(feature = "toml-support")]
fn save_toml_config(_config: &Config, path: &str) -> Result<()> {
    const TEMPLATE: &str = r#"[text]
chunk_size = 1000
chunk_overlap = 200

[entities]
min_confidence = 0.7
entity_types = ["PERSON", "ORG", "LOCATION"]

[graph]
max_connections = 10
similarity_threshold = 0.8

[parallel]
enabled = true
num_threads = 0
"#;

    fs::write(path, TEMPLATE).map_err(GraphRAGError::from)
}
538
539#[cfg(not(feature = "toml-support"))]
540fn save_toml_config(_config: &Config, _path: &str) -> Result<()> {
541    Err(GraphRAGError::Config {
542        message: "TOML support not enabled. Enable 'toml-support' feature.".to_string(),
543    })
544}
545
/// Write the static JSON configuration template to `path`.
///
/// The `_config` argument is not consulted: the written content is a fixed
/// template, not a serialization of the given `Config`.
///
/// NOTE(review): the template's top-level keys (`text`, `entities`, `graph`,
/// `parallel`) differ from the section names `RawConfig` deserializes, so a
/// file written here is presumably not picked up by `load_config` — confirm
/// the intended schema.
///
/// # Errors
///
/// Propagates any I/O error from writing the file.
#[cfg(feature = "serde_json")]
fn save_json_config(_config: &Config, path: &str) -> Result<()> {
    const TEMPLATE: &str = r#"{
  "text": {
    "chunk_size": 1000,
    "chunk_overlap": 200
  },
  "entities": {
    "min_confidence": 0.7,
    "entity_types": ["PERSON", "ORG", "LOCATION"]
  },
  "graph": {
    "max_connections": 10,
    "similarity_threshold": 0.8
  },
  "parallel": {
    "enabled": true,
    "num_threads": 0
  }
}"#;

    fs::write(path, TEMPLATE).map_err(GraphRAGError::from)
}
569
570#[cfg(not(feature = "serde_json"))]
571fn save_json_config(_config: &Config, _path: &str) -> Result<()> {
572    Err(GraphRAGError::Config {
573        message: "JSON support not enabled.".to_string(),
574    })
575}
576
/// Write the static YAML configuration template to `path`.
///
/// The `_config` argument is not consulted: the written content is a fixed
/// template, not a serialization of the given `Config`.
///
/// NOTE(review): the template's top-level keys (`text`, `entities`, `graph`,
/// `parallel`) differ from the section names `RawConfig` deserializes, so a
/// file written here is presumably not picked up by `load_config` — confirm
/// the intended schema.
///
/// # Errors
///
/// Propagates any I/O error from writing the file.
#[cfg(feature = "yaml-support")]
fn save_yaml_config(_config: &Config, path: &str) -> Result<()> {
    const TEMPLATE: &str = r#"text:
  chunk_size: 1000
  chunk_overlap: 200

entities:
  min_confidence: 0.7
  entity_types: ["PERSON", "ORG", "LOCATION"]

graph:
  max_connections: 10
  similarity_threshold: 0.8

parallel:
  enabled: true
  num_threads: 0
"#;

    fs::write(path, TEMPLATE).map_err(GraphRAGError::from)
}
598
599#[cfg(not(feature = "yaml-support"))]
600fn save_yaml_config(_config: &Config, _path: &str) -> Result<()> {
601    Err(GraphRAGError::Config {
602        message: "YAML support not enabled.".to_string(),
603    })
604}
605
606// Removed convert_config_to_raw function as we now use static templates
607
#[cfg(test)]
mod tests {
    use super::*;

    /// Each known extension maps to its format; a path without an extension
    /// falls back to TOML.
    #[test]
    fn test_config_format_detection() {
        let detect = ConfigFormat::from_extension;

        // Recognized extensions.
        assert!(matches!(detect("config.toml"), ConfigFormat::Toml));
        assert!(matches!(detect("config.json"), ConfigFormat::Json));
        assert!(matches!(detect("config.yaml"), ConfigFormat::Yaml));
        assert!(matches!(detect("config.yml"), ConfigFormat::Yaml));

        // No extension: default format.
        assert!(matches!(detect("config"), ConfigFormat::Toml));
    }
}
635}