graphrag_core/config/
setconfig.rs

1//! TOML Configuration System for GraphRAG
2//! Complete configuration management with extensive TOML support
3
4use crate::Result;
5use serde::{Deserialize, Serialize};
6use std::fs;
7use std::path::Path;
8
/// Complete GraphRAG configuration loaded from TOML.
///
/// This is the root of the configuration document. Every field is marked
/// `#[serde(default)]`, so a section that is missing from the input falls back to
/// that field type's `Default` value (`None` for the `Option` sections).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SetConfig {
    /// Pipeline mode/approach configuration
    #[serde(default)]
    pub mode: ModeConfig,

    /// Semantic/Neural pipeline configuration (`None` when the section is absent)
    #[serde(default)]
    pub semantic: Option<SemanticPipelineConfig>,

    /// Algorithmic/Classic NLP pipeline configuration (`None` when the section is absent)
    #[serde(default)]
    pub algorithmic: Option<AlgorithmicPipelineConfig>,

    /// Hybrid pipeline configuration (`None` when the section is absent)
    #[serde(default)]
    pub hybrid: Option<HybridPipelineConfig>,

    /// General system settings
    #[serde(default)]
    pub general: GeneralConfig,

    /// Pipeline configuration
    #[serde(default)]
    pub pipeline: PipelineConfig,

    /// Storage configuration
    #[serde(default)]
    pub storage: StorageConfig,

    /// Model configuration
    #[serde(default)]
    pub models: ModelsConfig,

    /// Performance tuning
    #[serde(default)]
    pub performance: PerformanceConfig,

    /// Ollama-specific configuration
    #[serde(default)]
    pub ollama: OllamaSetConfig,

    /// GLiNER-Relex extractor configuration
    #[serde(default)]
    pub gliner: GlinerSetConfig,

    /// Experimental features
    #[serde(default)]
    pub experimental: ExperimentalConfig,

    /// Top-level entity extraction configuration (for gleaning).
    ///
    /// Distinct from `pipeline.entity_extraction`, which has its own type.
    #[serde(default)]
    pub entity_extraction: EntityExtractionTopLevelConfig,

    /// Auto-save configuration for workspace persistence
    #[serde(default)]
    pub auto_save: AutoSaveSetConfig,
}
68
/// Auto-save / workspace persistence configuration.
///
/// Every field has a serde default, so the table may be omitted entirely or
/// specified only in part.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoSaveSetConfig {
    /// Enable persistent storage.
    /// `false` (default) → graph lives in memory only.
    /// `true` → graph is saved to disk after `build_graph()` and reloaded on the next run.
    #[serde(default)]
    pub enabled: bool,

    /// Base directory for workspace storage. Required when `enabled = true`.
    /// Example: `"./output"` or `"/data/graphrag"`.
    /// The workspace folder is created at `<base_dir>/<workspace_name>/`.
    ///
    /// Deserialization itself does not enforce the requirement: a missing key
    /// simply yields `None`.
    // NOTE(review): confirm where the "required when enabled" rule is validated.
    #[serde(default)]
    pub base_dir: Option<String>,

    /// Auto-save interval in seconds (0 = save after every graph build)
    #[serde(default = "default_auto_save_interval")]
    pub interval_seconds: u64,

    /// Workspace name — sub-folder inside `base_dir` (default: "default").
    ///
    /// A missing key deserializes to `None`; the `"default"` name is presumably
    /// substituted by the code that consumes this value.
    #[serde(default)]
    pub workspace_name: Option<String>,

    /// Maximum number of auto-save versions to keep (0 = unlimited)
    #[serde(default = "default_max_auto_save_versions")]
    pub max_versions: usize,
}
96
97impl Default for AutoSaveSetConfig {
98    fn default() -> Self {
99        Self {
100            enabled: false,
101            base_dir: None,
102            interval_seconds: default_auto_save_interval(),
103            workspace_name: None,
104            max_versions: default_max_auto_save_versions(),
105        }
106    }
107}
108
/// General system configuration settings.
///
/// All fields are optional in the input: each one has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneralConfig {
    /// Logging level (error, warn, info, debug, trace)
    ///
    /// Stored as a free-form string; this struct does not validate the value.
    #[serde(default = "default_log_level")]
    pub log_level: String,

    /// Output directory for results
    #[serde(default = "default_output_dir")]
    pub output_dir: String,

    /// Path to the input document to process (`None` when not configured)
    #[serde(default)]
    pub input_document_path: Option<String>,

    /// Maximum threads (0 = auto-detect)
    ///
    /// A missing key deserializes to `None`.
    // NOTE(review): presumably `None` is treated like 0 (auto-detect) — confirm at the call site.
    #[serde(default)]
    pub max_threads: Option<usize>,

    /// Enable performance profiling (defaults to `false`)
    #[serde(default)]
    pub enable_profiling: bool,
}
132
/// Pipeline execution configuration.
///
/// Groups the workflow list with the per-stage settings. Every field has a
/// serde default, so any stage table may be left out of the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Workflows to execute in sequence
    #[serde(default = "default_workflows")]
    pub workflows: Vec<String>,

    /// Enable parallel execution (falls back to `default_true` when absent)
    #[serde(default = "default_true")]
    pub parallel_execution: bool,

    /// Text extraction configuration
    #[serde(default)]
    pub text_extraction: TextExtractionConfig,

    /// Entity extraction configuration
    #[serde(default)]
    pub entity_extraction: EntityExtractionConfig,

    /// Graph building configuration
    #[serde(default)]
    pub graph_building: GraphBuildingConfig,

    /// Community detection configuration
    #[serde(default)]
    pub community_detection: CommunityDetectionConfig,
}
160
/// Text extraction and chunking configuration.
///
/// All fields are optional in the input; numeric values fall back to the
/// `default_*` helpers named in the attributes.
// NOTE(review): the unit of the size fields (characters vs. tokens) is not
// visible in this struct — confirm against the chunker.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextExtractionConfig {
    /// Chunk size for text splitting
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,

    /// Overlap between chunks
    #[serde(default = "default_chunk_overlap")]
    pub chunk_overlap: usize,

    /// Clean control characters
    #[serde(default = "default_true")]
    pub clean_control_chars: bool,

    /// Minimum chunk size to keep
    #[serde(default = "default_min_chunk_size")]
    pub min_chunk_size: usize,

    /// Text cleaning options (`None` when the sub-table is absent)
    #[serde(default)]
    pub cleaning: Option<CleaningConfig>,
}
184
/// Text cleaning options configuration.
///
/// Only `normalize_whitespace` uses `default_true`; the removal flags default
/// to `false` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CleaningConfig {
    /// Remove URLs from text
    #[serde(default)]
    pub remove_urls: bool,

    /// Remove email addresses
    #[serde(default)]
    pub remove_emails: bool,

    /// Normalize whitespace
    #[serde(default = "default_true")]
    pub normalize_whitespace: bool,

    /// Remove special characters
    #[serde(default)]
    pub remove_special_chars: bool,
}
204
/// Entity extraction configuration (the `entity_extraction` field of
/// `PipelineConfig`).
///
/// The `Option` fields without an explicit `#[serde(default)]` still
/// deserialize to `None` when the key is absent, so no key is mandatory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionConfig {
    /// Model name for NER
    #[serde(default = "default_ner_model")]
    pub model_name: String,

    /// Temperature for entity extraction (0.0 = fully deterministic JSON output)
    #[serde(default = "default_extraction_temperature")]
    pub temperature: f32,

    /// Maximum tokens for extraction
    #[serde(default = "default_max_tokens")]
    pub max_tokens: usize,

    /// Entity types to extract (dynamic configuration); `None` when not listed
    pub entity_types: Option<Vec<String>>,

    /// Confidence threshold for entity extraction (top-level)
    ///
    /// Shares `default_confidence_threshold` with
    /// `EntityFiltersConfig::confidence_threshold`.
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Custom extraction prompt; `None` when not provided
    pub custom_prompt: Option<String>,

    /// Entity filtering options (`None` when the sub-table is absent)
    #[serde(default)]
    pub filters: Option<EntityFiltersConfig>,
}
234
/// Entity filtering configuration.
///
/// Pattern lists are stored as plain strings; this struct does not compile or
/// validate them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityFiltersConfig {
    /// Minimum entity length
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Maximum entity length
    #[serde(default = "default_max_entity_length")]
    pub max_entity_length: usize,

    /// Allowed entity types; `None` when the key is absent
    pub allowed_entity_types: Option<Vec<String>>,

    /// Confidence threshold
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Allowed regex patterns for entity matching; `None` when the key is absent
    pub allowed_patterns: Option<Vec<String>>,

    /// Excluded regex patterns for entity filtering; `None` when the key is absent
    pub excluded_patterns: Option<Vec<String>>,

    /// Enable fuzzy matching for entity resolution (defaults to `false`)
    #[serde(default)]
    pub enable_fuzzy_matching: bool,
}
263
/// Graph building configuration.
///
/// All fields are optional in the input and fall back to the `default_*`
/// helpers named in their attributes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphBuildingConfig {
    /// Relation scoring algorithm (free-form name, not validated here)
    #[serde(default = "default_relation_scorer")]
    pub relation_scorer: String,

    /// Minimum relation score threshold
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,

    /// Maximum connections per node
    #[serde(default = "default_max_connections")]
    pub max_connections_per_node: usize,

    /// Use bidirectional relationships
    #[serde(default = "default_true")]
    pub bidirectional_relations: bool,
}
283
/// Community detection configuration.
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommunityDetectionConfig {
    /// Algorithm for community detection (free-form name, not validated here)
    #[serde(default = "default_community_algorithm")]
    pub algorithm: String,

    /// Resolution parameter
    #[serde(default = "default_resolution")]
    pub resolution: f32,

    /// Minimum community size
    #[serde(default = "default_min_community_size")]
    pub min_community_size: usize,

    /// Maximum community size (0 = unlimited)
    ///
    /// Uses the plain serde default, so an absent key yields 0, i.e. unlimited.
    #[serde(default)]
    pub max_community_size: usize,
}
303
/// Storage backend configuration.
///
/// The backend-specific sub-tables are optional and deserialize to `None`
/// when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageConfig {
    /// Database type (free-form name, not validated here)
    #[serde(default = "default_database_type")]
    pub database_type: String,

    /// Database path for SQLite
    #[serde(default = "default_database_path")]
    pub database_path: String,

    /// Enable WAL for SQLite
    #[serde(default = "default_true")]
    pub enable_wal: bool,

    /// PostgreSQL configuration
    pub postgresql: Option<PostgreSQLConfig>,

    /// Neo4j configuration
    pub neo4j: Option<Neo4jConfig>,
}
325
/// PostgreSQL database configuration.
///
/// `host`, `port`, `database`, `username` and `password` have no serde default
/// and are not `Option`, so all five must be present whenever this table is
/// given. Only `pool_size` may be omitted.
///
/// The derived `Debug` and `Serialize` implementations include `password`
/// verbatim — avoid logging or re-serializing this struct where that matters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PostgreSQLConfig {
    /// PostgreSQL server host
    pub host: String,
    /// PostgreSQL server port
    pub port: u16,
    /// Database name
    pub database: String,
    /// Username for authentication
    pub username: String,
    /// Password for authentication (stored as plain text)
    pub password: String,
    /// Connection pool size
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
}
343
/// Neo4j graph database configuration.
///
/// `uri`, `username` and `password` are mandatory whenever this table is
/// given; `encrypted` defaults to `false`.
///
/// The derived `Debug` and `Serialize` implementations include `password`
/// verbatim — avoid logging or re-serializing this struct where that matters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Neo4jConfig {
    /// Neo4j server URI
    pub uri: String,
    /// Username for authentication
    pub username: String,
    /// Password for authentication (stored as plain text)
    pub password: String,
    /// Enable encrypted connections
    #[serde(default)]
    pub encrypted: bool,
}
357
/// Model configuration for LLM and embeddings.
///
/// All fields are optional in the input; the two sub-tables deserialize to
/// `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelsConfig {
    /// Primary LLM for generation
    #[serde(default = "default_primary_llm")]
    pub primary_llm: String,

    /// Embedding model
    #[serde(default = "default_embedding_model")]
    pub embedding_model: String,

    /// Maximum context length
    #[serde(default = "default_max_context")]
    pub max_context_length: usize,

    /// LLM parameters
    #[serde(default)]
    pub llm_params: Option<LLMParamsConfig>,

    /// Local model configuration
    #[serde(default)]
    pub local: Option<LocalModelsConfig>,
}
381
/// LLM generation parameters.
///
/// The ranges quoted on the fields are documentation only; nothing in this
/// struct enforces them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMParamsConfig {
    /// Sampling temperature (0.0-2.0)
    #[serde(default = "default_temperature")]
    pub temperature: f32,

    /// Nucleus sampling parameter (0.0-1.0)
    #[serde(default = "default_top_p")]
    pub top_p: f32,

    /// Frequency penalty (-2.0-2.0); defaults to 0.0 when absent
    #[serde(default)]
    pub frequency_penalty: f32,

    /// Presence penalty (-2.0-2.0); defaults to 0.0 when absent
    #[serde(default)]
    pub presence_penalty: f32,

    /// Sequences where the model will stop generating; `None` when absent
    pub stop_sequences: Option<Vec<String>>,
}
404
/// Local model configuration (Ollama).
///
/// Lives under `ModelsConfig::local`. It is a separate type from
/// `OllamaSetConfig`, which holds the top-level Ollama connection settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalModelsConfig {
    /// Ollama API base URL
    #[serde(default = "default_ollama_url")]
    pub ollama_base_url: String,

    /// Local model name for generation
    #[serde(default = "default_ollama_model")]
    pub model_name: String,

    /// Local embedding model name
    #[serde(default = "default_ollama_embedding")]
    pub embedding_model: String,
}
420
/// Performance tuning configuration.
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
    /// Enable batch processing
    #[serde(default = "default_true")]
    pub batch_processing: bool,

    /// Batch size
    ///
    /// `default_batch_size` is also used by `SemanticEmbeddingsConfig::batch_size`.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,

    /// Worker threads
    #[serde(default = "default_worker_threads")]
    pub worker_threads: usize,

    /// Memory limit per worker (MB)
    #[serde(default = "default_memory_limit")]
    pub memory_limit_mb: usize,
}
440
/// Ollama-specific configuration for local LLM and embeddings.
///
/// No key is mandatory: connection settings fall back to `default_*` helpers
/// and the trailing `Option` tuning fields deserialize to `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaSetConfig {
    /// Enable Ollama integration (falls back to `default_true` when absent)
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Ollama host
    #[serde(default = "default_ollama_host")]
    pub host: String,

    /// Ollama port
    #[serde(default = "default_ollama_port")]
    pub port: u16,

    /// Chat model name
    #[serde(default = "default_chat_model")]
    pub chat_model: String,

    /// Embedding model name
    #[serde(default = "default_embedding_model_ollama")]
    pub embedding_model: String,

    /// Timeout in seconds
    #[serde(default = "default_timeout")]
    pub timeout_seconds: u64,

    /// Maximum retries
    #[serde(default = "default_max_retries")]
    pub max_retries: u32,

    /// Fallback to hash-based embeddings (defaults to `false`)
    #[serde(default)]
    pub fallback_to_hash: bool,

    /// Maximum tokens; `None` when not set
    pub max_tokens: Option<u32>,

    /// Temperature; `None` when not set
    pub temperature: Option<f32>,

    /// How long to keep the model loaded in memory (e.g. "1h", "30m", "0")
    ///
    /// Critical for KV Cache efficiency when processing multiple chunks from the same document.
    pub keep_alive: Option<String>,

    /// Context window size in tokens (overrides Ollama model default)
    ///
    /// Ollama silently truncates prompts exceeding the context window.
    /// Set this when processing long documents to avoid silent truncation.
    pub num_ctx: Option<u32>,
}
493
/// GLiNER-Relex extractor configuration.
///
/// Every field has a serde default, so the table may be omitted entirely.
/// The label and threshold defaults come from the `default_gliner_*` /
/// `default_*_threshold` helpers, which `Default for GlinerSetConfig` also uses.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlinerSetConfig {
    /// Enable GLiNER-Relex extraction (defaults to `false`)
    #[serde(default)]
    pub enabled: bool,
    /// Path to the ONNX model file (empty string when absent)
    #[serde(default)]
    pub model_path: String,
    /// Path to tokenizer.json (defaults to same dir as model_path)
    ///
    /// An absent key deserializes to an empty string; the "same dir" fallback
    /// is presumably resolved by the code that loads the tokenizer.
    #[serde(default)]
    pub tokenizer_path: String,
    /// "span" (default) or "token"
    #[serde(default = "default_gliner_mode")]
    pub mode: String,
    /// Entity labels to extract
    #[serde(default = "default_gliner_entity_labels")]
    pub entity_labels: Vec<String>,
    /// Relation labels to extract
    #[serde(default = "default_gliner_relation_labels")]
    pub relation_labels: Vec<String>,
    /// Minimum entity confidence threshold
    #[serde(default = "default_entity_threshold")]
    pub entity_threshold: f32,
    /// Minimum relation confidence threshold
    #[serde(default = "default_relation_threshold")]
    pub relation_threshold: f32,
    /// Use GPU (CUDA) for inference (defaults to `false`)
    #[serde(default)]
    pub use_gpu: bool,
    /// Max concurrent chunk inferences (None → 4)
    #[serde(default)]
    pub max_concurrent_chunks: Option<usize>,
}
528
/// Serde fallback for `GlinerSetConfig::mode`: the `"span"` extraction mode.
fn default_gliner_mode() -> String {
    String::from("span")
}
// Kept in sync with `GlinerConfig::default()` in config/mod.rs (canonical runtime
// defaults). The drift-guard test `gliner_setconfig_default_matches_runtime` enforces it.
/// Serde fallback for `GlinerSetConfig::entity_labels`.
fn default_gliner_entity_labels() -> Vec<String> {
    ["person", "organization", "location", "concept"]
        .iter()
        .map(|label| label.to_string())
        .collect()
}
/// Serde fallback for `GlinerSetConfig::relation_labels`.
fn default_gliner_relation_labels() -> Vec<String> {
    ["related to", "part of", "causes"]
        .iter()
        .map(|label| label.to_string())
        .collect()
}
/// Serde fallback for `GlinerSetConfig::entity_threshold`.
fn default_entity_threshold() -> f32 {
    0.4_f32
}
/// Serde fallback for `GlinerSetConfig::relation_threshold`.
fn default_relation_threshold() -> f32 {
    0.5_f32
}
551
552impl Default for GlinerSetConfig {
553    fn default() -> Self {
554        Self {
555            enabled: false,
556            model_path: String::new(),
557            tokenizer_path: String::new(),
558            mode: default_gliner_mode(),
559            entity_labels: default_gliner_entity_labels(),
560            relation_labels: default_gliner_relation_labels(),
561            entity_threshold: default_entity_threshold(),
562            relation_threshold: default_relation_threshold(),
563            use_gpu: false,
564            max_concurrent_chunks: None,
565        }
566    }
567}
568
/// Experimental features configuration.
///
/// `Default` is derived, so every flag starts as `false` and both nested
/// configurations start as `None`; the same values apply to keys missing from
/// the input.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExperimentalConfig {
    /// Enable neural reranking
    #[serde(default)]
    pub neural_reranking: bool,

    /// Enable federated learning
    #[serde(default)]
    pub federated_learning: bool,

    /// Enable real-time updates
    #[serde(default)]
    pub real_time_updates: bool,

    /// Enable distributed processing
    #[serde(default)]
    pub distributed_processing: bool,

    /// Enable LazyGraphRAG mode (no prior summarization, 0.1% indexing cost)
    #[serde(default)]
    pub lazy_graphrag: bool,

    /// Enable E2GraphRAG mode (efficient entity extraction without LLM)
    #[serde(default)]
    pub e2_graphrag: bool,

    /// LazyGraphRAG configuration (`None` when the sub-table is absent)
    #[serde(default)]
    pub lazy_graphrag_config: Option<LazyGraphRAGConfig>,

    /// E2GraphRAG configuration (`None` when the sub-table is absent)
    #[serde(default)]
    pub e2_graphrag_config: Option<E2GraphRAGConfig>,
}
604
/// LazyGraphRAG configuration
/// Concept-based retrieval without prior summarization (Microsoft Research, 2025)
/// Achieves 0.1% of full GraphRAG indexing cost and 700x cheaper query costs
///
/// All fields are optional in the input. Missing keys use the `default_*`
/// helpers named in the attributes, which are separate from the literals in
/// this type's `Default` implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LazyGraphRAGConfig {
    /// Enable concept extraction (noun phrases without LLM)
    #[serde(default = "default_true")]
    pub use_concept_extraction: bool,

    /// Minimum concept length in characters
    #[serde(default = "default_min_concept_length")]
    pub min_concept_length: usize,

    /// Maximum concept length in words
    #[serde(default = "default_max_concept_words")]
    pub max_concept_words: usize,

    /// Co-occurrence threshold (minimum shared chunks for relationship)
    #[serde(default = "default_co_occurrence_threshold")]
    pub co_occurrence_threshold: usize,

    /// Enable query refinement with iterative deepening
    #[serde(default = "default_true")]
    pub use_query_refinement: bool,

    /// Maximum refinement iterations
    #[serde(default = "default_max_refinement_iterations")]
    pub max_refinement_iterations: usize,

    /// Enable bidirectional entity-chunk indexing for fast lookups
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
638
639impl Default for LazyGraphRAGConfig {
640    fn default() -> Self {
641        Self {
642            use_concept_extraction: true,
643            min_concept_length: 3,
644            max_concept_words: 5,
645            co_occurrence_threshold: 1,
646            use_query_refinement: true,
647            max_refinement_iterations: 3,
648            use_bidirectional_index: true,
649        }
650    }
651}
652
/// E2GraphRAG configuration
/// Efficient entity extraction using SpaCy-like approach without LLM
/// Achieves 10x faster indexing and 100x faster retrieval
///
/// All fields are optional in the input. Missing keys use the `default_*`
/// helpers named in the attributes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct E2GraphRAGConfig {
    /// Enable lightweight NER (no LLM required)
    #[serde(default = "default_true")]
    pub use_lightweight_ner: bool,

    /// Entity types to extract (using pattern matching)
    #[serde(default = "default_e2_entity_types")]
    pub entity_types: Vec<String>,

    /// Minimum entity confidence for pattern-based extraction
    #[serde(default = "default_e2_min_confidence")]
    pub min_confidence: f32,

    /// Enable capitalization-based named entity detection
    #[serde(default = "default_true")]
    pub use_capitalization_detection: bool,

    /// Enable noun phrase extraction
    #[serde(default = "default_true")]
    pub use_noun_phrase_extraction: bool,

    /// Minimum entity frequency (entities must appear at least N times)
    #[serde(default = "default_min_entity_frequency")]
    pub min_entity_frequency: usize,

    /// Use fast co-occurrence for relationships (no LLM)
    #[serde(default = "default_true")]
    pub use_fast_cooccurrence: bool,

    /// Enable bidirectional entity-chunk indexing
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
690
691impl Default for E2GraphRAGConfig {
692    fn default() -> Self {
693        Self {
694            use_lightweight_ner: true,
695            entity_types: default_e2_entity_types(),
696            min_confidence: 0.6,
697            use_capitalization_detection: true,
698            use_noun_phrase_extraction: true,
699            min_entity_frequency: 1,
700            use_fast_cooccurrence: true,
701            use_bidirectional_index: true,
702        }
703    }
704}
705
706// =============================================================================
707// PIPELINE APPROACH CONFIGURATION (Semantic vs Algorithmic vs Hybrid)
708// =============================================================================
709
/// Pipeline mode/approach configuration
/// Determines which pipeline implementation to use
///
/// The approach is stored as a free-form string; this struct does not reject
/// values outside the three listed below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModeConfig {
    /// Pipeline approach: "semantic", "algorithmic", or "hybrid"
    /// - semantic: Neural embeddings + LLM extraction + vector search
    /// - algorithmic: Pattern matching + TF-IDF + BM25 keyword search
    /// - hybrid: Combines both with weighted fusion
    ///
    /// Falls back to `default_approach` when the key is absent.
    #[serde(default = "default_approach")]
    pub approach: String,
}
721
722impl Default for ModeConfig {
723    fn default() -> Self {
724        Self {
725            approach: default_approach(),
726        }
727    }
728}
729
/// Semantic/Neural pipeline configuration
/// Uses deep learning models for embeddings, entity extraction, and retrieval
///
/// Only `enabled` has a serde default. The four nested sections carry no
/// `#[serde(default)]` and are not `Option`, so each must be present in the
/// input whenever this table is given (an empty sub-table is enough, since
/// their own fields all have defaults).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticPipelineConfig {
    /// Enable semantic pipeline (defaults to `false`)
    #[serde(default)]
    pub enabled: bool,

    /// Embeddings configuration for semantic approach
    pub embeddings: SemanticEmbeddingsConfig,

    /// Entity extraction configuration for semantic approach
    pub entity_extraction: SemanticEntityConfig,

    /// Retrieval configuration for semantic approach
    pub retrieval: SemanticRetrievalConfig,

    /// Graph construction configuration for semantic approach
    pub graph_construction: SemanticGraphConfig,
}
750
/// Semantic embeddings configuration (neural models)
///
/// All fields are optional in the input. String-valued options are not
/// validated by this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEmbeddingsConfig {
    /// Backend: "huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "ollama"
    #[serde(default = "default_semantic_embedding_backend")]
    pub backend: String,

    /// Model identifier (provider-specific)
    #[serde(default = "default_semantic_embedding_model")]
    pub model: String,

    /// Embedding dimension
    #[serde(default = "default_semantic_embedding_dim")]
    pub dimension: usize,

    /// Use GPU acceleration if available
    #[serde(default = "default_true")]
    pub use_gpu: bool,

    /// Similarity metric (cosine, euclidean, dot_product)
    #[serde(default = "default_similarity_metric")]
    pub similarity_metric: String,

    /// Batch size for embeddings generation
    ///
    /// Shares `default_batch_size` with `PerformanceConfig::batch_size`.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
778
/// Semantic entity extraction configuration (LLM-based)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEntityConfig {
    /// Extraction method (always "llm" for semantic)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_semantic_entity_method")]
    pub method: String,

    /// Enable gleaning (iterative refinement)
    #[serde(default = "default_true")]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// LLM model for extraction
    ///
    /// Shares `default_chat_model` with `OllamaSetConfig::chat_model`.
    #[serde(default = "default_chat_model")]
    pub model: String,

    /// Temperature for LLM
    #[serde(default = "default_semantic_temperature")]
    pub temperature: f32,

    /// Confidence threshold
    #[serde(default = "default_semantic_confidence")]
    pub confidence_threshold: f32,
}
806
/// Semantic retrieval configuration (vector search)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticRetrievalConfig {
    /// Retrieval strategy (always "vector" for semantic)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_semantic_retrieval_strategy")]
    pub strategy: String,

    /// Use HNSW index for fast approximate search
    #[serde(default = "default_true")]
    pub use_hnsw: bool,

    /// HNSW ef_construction parameter
    #[serde(default = "default_hnsw_ef_construction")]
    pub hnsw_ef_construction: usize,

    /// HNSW M parameter (connections per node)
    #[serde(default = "default_hnsw_m")]
    pub hnsw_m: usize,

    /// Top-k results
    ///
    /// Shares `default_top_k` with `AlgorithmicRetrievalConfig::top_k`.
    #[serde(default = "default_top_k")]
    pub top_k: usize,

    /// Similarity threshold
    #[serde(default = "default_semantic_similarity_threshold")]
    pub similarity_threshold: f32,
}
834
/// Semantic graph construction configuration (embedding-based)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticGraphConfig {
    /// Relation scorer (always "embedding_similarity" for semantic)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_semantic_relation_scorer")]
    pub relation_scorer: String,

    /// Use transformer embeddings for relationships
    #[serde(default = "default_true")]
    pub use_transformer_embeddings: bool,

    /// Minimum relation score
    ///
    /// Shares `default_min_relation_score` with
    /// `GraphBuildingConfig::min_relation_score`.
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,
}
850
/// Algorithmic/Classic NLP pipeline configuration
/// Uses pattern matching, TF-IDF, and keyword-based methods
///
/// Only `enabled` has a serde default. The four nested sections carry no
/// `#[serde(default)]` and are not `Option`, so each must be present in the
/// input whenever this table is given.
///
/// The derived `Default` requires every nested section type to implement
/// `Default` as well.
// NOTE(review): those `Default` impls are not in this part of the file — confirm
// they exist and match the serde field defaults.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AlgorithmicPipelineConfig {
    /// Enable algorithmic pipeline (defaults to `false`)
    #[serde(default)]
    pub enabled: bool,

    /// Embeddings configuration for algorithmic approach
    pub embeddings: AlgorithmicEmbeddingsConfig,

    /// Entity extraction configuration for algorithmic approach
    pub entity_extraction: AlgorithmicEntityConfig,

    /// Retrieval configuration for algorithmic approach
    pub retrieval: AlgorithmicRetrievalConfig,

    /// Graph construction configuration for algorithmic approach
    pub graph_construction: AlgorithmicGraphConfig,
}
871
/// Algorithmic embeddings configuration (hash-based, TF-IDF)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEmbeddingsConfig {
    /// Backend (always "hash" for algorithmic)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub backend: String,

    /// Embedding dimension
    #[serde(default = "default_algorithmic_embedding_dim")]
    pub dimension: usize,

    /// Use TF-IDF weighting
    #[serde(default = "default_true")]
    pub use_tfidf: bool,

    /// Vocabulary size
    #[serde(default = "default_vocabulary_size")]
    pub vocabulary_size: usize,

    /// Minimum term frequency
    #[serde(default = "default_min_term_frequency")]
    pub min_term_frequency: usize,

    /// Maximum document frequency (0.0-1.0)
    ///
    /// The range is documentation only; it is not enforced here.
    #[serde(default = "default_max_document_frequency")]
    pub max_document_frequency: f32,
}
899
/// Algorithmic entity extraction configuration (pattern-based)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEntityConfig {
    /// Extraction method (always "pattern" for algorithmic)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_algorithmic_entity_method")]
    pub method: String,

    /// Use NER rules
    #[serde(default = "default_true")]
    pub use_ner_rules: bool,

    /// Use POS tagging (defaults to `false`)
    #[serde(default)]
    pub use_pos_tagging: bool,

    /// Minimum entity length
    ///
    /// Shares `default_min_entity_length` with
    /// `EntityFiltersConfig::min_entity_length`.
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Confidence threshold
    #[serde(default = "default_algorithmic_confidence")]
    pub confidence_threshold: f32,

    /// Regex patterns for entity matching; `None` when the key is absent.
    /// Stored as plain strings — not compiled or validated here.
    pub patterns: Option<Vec<String>>,
}
926
/// Algorithmic retrieval configuration (BM25 keyword search)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicRetrievalConfig {
    /// Retrieval strategy (always "bm25" for algorithmic)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_algorithmic_retrieval_strategy")]
    pub strategy: String,

    /// BM25 k1 parameter (term frequency saturation)
    #[serde(default = "default_bm25_k1")]
    pub k1: f32,

    /// BM25 b parameter (length normalization)
    #[serde(default = "default_bm25_b")]
    pub b: f32,

    /// Use stemming
    #[serde(default = "default_true")]
    pub use_stemming: bool,

    /// Language for stemming
    #[serde(default = "default_language")]
    pub language: String,

    /// Top-k results
    ///
    /// Shares `default_top_k` with `SemanticRetrievalConfig::top_k`.
    #[serde(default = "default_top_k")]
    pub top_k: usize,
}
954
/// Algorithmic graph construction configuration (token overlap)
///
/// All fields are optional in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicGraphConfig {
    /// Relation scorer (jaccard, cosine on token vectors)
    ///
    /// The value is a plain string and is not checked by this struct.
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub relation_scorer: String,

    /// Use co-occurrence for relationship detection
    #[serde(default = "default_true")]
    pub use_cooccurrence: bool,

    /// Co-occurrence window size
    // NOTE(review): the unit of the window (tokens, sentences, chunks) is not
    // visible here — confirm against the graph builder.
    #[serde(default = "default_cooccurrence_window")]
    pub window_size: usize,

    /// Minimum relation score
    ///
    /// Uses its own helper, `default_algorithmic_min_relation_score`, rather
    /// than the `default_min_relation_score` used by the other graph configs.
    #[serde(default = "default_algorithmic_min_relation_score")]
    pub min_relation_score: f32,
}
974
975/// Hybrid pipeline configuration
976/// Combines semantic and algorithmic approaches with weighted fusion
977#[derive(Debug, Clone, Serialize, Deserialize)]
978pub struct HybridPipelineConfig {
979    /// Enable hybrid pipeline
980    #[serde(default)]
981    pub enabled: bool,
982
983    /// Weight configuration for combining approaches
984    pub weights: HybridWeightsConfig,
985
986    /// Embeddings configuration for hybrid
987    pub embeddings: HybridEmbeddingsConfig,
988
989    /// Entity extraction configuration for hybrid
990    pub entity_extraction: HybridEntityConfig,
991
992    /// Retrieval configuration for hybrid
993    pub retrieval: HybridRetrievalConfig,
994
995    /// Graph construction configuration for hybrid
996    pub graph_construction: HybridGraphConfig,
997
998    /// Fallback strategy when primary fails
999    #[serde(default = "default_hybrid_fallback_strategy")]
1000    pub fallback_strategy: String,
1001
1002    /// Enable cross-validation between approaches
1003    #[serde(default = "default_true")]
1004    pub cross_validation: bool,
1005}
1006
/// Hybrid weight configuration.
///
/// Holds the relative weights of the semantic and algorithmic approaches. The
/// defaults (`0.6` and `0.4`) sum to 1.0; nothing in this struct enforces that
/// for user-supplied values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridWeightsConfig {
    /// Weight for semantic approach (0.0-1.0). Defaults to `0.6`.
    #[serde(default = "default_hybrid_semantic_weight")]
    pub semantic_weight: f32,

    /// Weight for algorithmic approach (0.0-1.0). Defaults to `0.4`.
    #[serde(default = "default_hybrid_algorithmic_weight")]
    pub algorithmic_weight: f32,
}
1018
/// Hybrid embeddings configuration.
///
/// Names a primary and a fallback embedding backend; every field has a serde
/// default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEmbeddingsConfig {
    /// Primary backend (neural). Defaults to `"huggingface"`, the same default
    /// as the semantic embedding backend.
    #[serde(default = "default_semantic_embedding_backend")]
    pub primary: String,

    /// Fallback backend (hash-based). Defaults to `"hash"`, the same default as
    /// the algorithmic embedding backend.
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub fallback: String,

    /// Combine scores from both backends. Defaults to `true`.
    #[serde(default = "default_true")]
    pub combine_scores: bool,

    /// Auto-fallback when primary unavailable. Defaults to `true`.
    #[serde(default = "default_true")]
    pub auto_fallback: bool,
}
1038
/// Hybrid entity extraction configuration.
///
/// Controls how LLM-based and pattern-based extraction are combined; every
/// field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEntityConfig {
    /// Use both LLM and pattern extraction. Defaults to `true`.
    #[serde(default = "default_true")]
    pub use_both: bool,

    /// Weight for LLM extraction (0.0-1.0). Defaults to `0.7`.
    #[serde(default = "default_hybrid_llm_weight")]
    pub llm_weight: f32,

    /// Weight for pattern extraction (0.0-1.0). Defaults to `0.3`.
    #[serde(default = "default_hybrid_pattern_weight")]
    pub pattern_weight: f32,

    /// Cross-validate LLM results with patterns. Defaults to `true`.
    #[serde(default = "default_true")]
    pub cross_validate: bool,

    /// Confidence boost when both agree. Defaults to `0.15`.
    #[serde(default = "default_hybrid_confidence_boost")]
    pub confidence_boost: f32,
}
1062
/// Hybrid retrieval configuration (RRF fusion).
///
/// Describes how vector search and BM25 results are combined; every field has
/// a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridRetrievalConfig {
    /// Retrieval strategy (always "fusion" for hybrid). Defaults to `"fusion"`.
    #[serde(default = "default_hybrid_retrieval_strategy")]
    pub strategy: String,

    /// Combine vector and BM25. Defaults to `true`.
    #[serde(default = "default_true")]
    pub combine_vector_bm25: bool,

    /// Weight for vector search. Defaults to `0.6`.
    #[serde(default = "default_hybrid_vector_weight")]
    pub vector_weight: f32,

    /// Weight for BM25 search. Defaults to `0.4`.
    #[serde(default = "default_hybrid_bm25_weight")]
    pub bm25_weight: f32,

    /// RRF constant (typically 60). Defaults to `60`.
    #[serde(default = "default_rrf_constant")]
    pub rrf_constant: usize,
}
1086
/// Hybrid graph construction configuration.
///
/// Names a primary and a fallback relation scorer; every field has a serde
/// default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridGraphConfig {
    /// Primary relation scorer (embedding-based). Defaults to
    /// `"embedding_similarity"`.
    #[serde(default = "default_semantic_relation_scorer")]
    pub primary_scorer: String,

    /// Fallback relation scorer (token-based). Defaults to `"jaccard"`.
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub fallback_scorer: String,

    /// Combine scores from both scorers. Defaults to `true`.
    #[serde(default = "default_true")]
    pub combine_scores: bool,
}
1102
/// Top-level entity extraction configuration (gleaning settings).
///
/// Every field has a serde default, so the whole section and any individual
/// key may be omitted from the configuration file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionTopLevelConfig {
    /// Enable entity extraction. Defaults to `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Minimum confidence threshold. Defaults to `0.8`.
    #[serde(default = "default_confidence_threshold")]
    pub min_confidence: f32,

    /// Use LLM-based gleaning. Defaults to `false`.
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds. Defaults to `3`.
    #[serde(default = "default_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Gleaning improvement threshold. Defaults to `0.1`.
    #[serde(default = "default_gleaning_improvement")]
    pub gleaning_improvement_threshold: f32,

    /// Enable semantic merging. Defaults to `false`.
    #[serde(default)]
    pub semantic_merging: bool,

    /// Merge similarity threshold. Defaults to `0.85`.
    #[serde(default = "default_merge_threshold")]
    pub merge_similarity_threshold: f32,

    /// Enable automatic linking. Defaults to `false`.
    #[serde(default)]
    pub automatic_linking: bool,

    /// Linking confidence threshold. Defaults to `0.8` (shares its default
    /// function with `min_confidence`).
    #[serde(default = "default_confidence_threshold")]
    pub linking_confidence_threshold: f32,
}
1142
1143impl Default for EntityExtractionTopLevelConfig {
1144    fn default() -> Self {
1145        Self {
1146            enabled: true,
1147            min_confidence: default_confidence_threshold(),
1148            use_gleaning: false,
1149            max_gleaning_rounds: default_gleaning_rounds(),
1150            gleaning_improvement_threshold: default_gleaning_improvement(),
1151            semantic_merging: false,
1152            merge_similarity_threshold: default_merge_threshold(),
1153            automatic_linking: false,
1154            linking_confidence_threshold: default_confidence_threshold(),
1155        }
1156    }
1157}
1158
// Default value functions

/// Default log level: `"info"`.
fn default_log_level() -> String {
    String::from("info")
}

/// Default output directory: `"./output"`.
fn default_output_dir() -> String {
    String::from("./output")
}

/// Shared helper for boolean options that default to `true`.
fn default_true() -> bool {
    true
}

/// Default workflow list, in order: text extraction, entity extraction,
/// graph building, community detection.
fn default_workflows() -> Vec<String> {
    [
        "extract_text",
        "extract_entities",
        "build_graph",
        "detect_communities",
    ]
    .iter()
    .map(|step| String::from(*step))
    .collect()
}

/// Default chunk size: 512.
fn default_chunk_size() -> usize {
    512
}

/// Default overlap between consecutive chunks: 64.
fn default_chunk_overlap() -> usize {
    64
}

/// Default minimum chunk size: 50.
fn default_min_chunk_size() -> usize {
    50
}
/// Default model name for entity extraction.
// NOTE(review): the name looks like a dialogue model rather than an NER
// model — confirm this default is intentional.
fn default_ner_model() -> String {
    String::from("microsoft/DialoGPT-medium")
}

/// Default sampling temperature: 0.1.
fn default_temperature() -> f32 {
    0.1
}

/// Default temperature for extraction: 0.0.
fn default_extraction_temperature() -> f32 {
    0.0
}

/// Default maximum token count: 2048.
fn default_max_tokens() -> usize {
    2048
}

/// Default minimum entity length: 3.
fn default_min_entity_length() -> usize {
    3
}

/// Default maximum entity length: 100.
fn default_max_entity_length() -> usize {
    100
}

/// Default confidence threshold: 0.8.
fn default_confidence_threshold() -> f32 {
    0.8
}
/// Default relation scorer: `"cosine_similarity"`.
fn default_relation_scorer() -> String {
    String::from("cosine_similarity")
}

/// Default minimum relation score: 0.7.
fn default_min_relation_score() -> f32 {
    0.7
}

/// Default maximum number of connections: 10.
fn default_max_connections() -> usize {
    10
}

/// Default community detection algorithm: `"leiden"`.
fn default_community_algorithm() -> String {
    String::from("leiden")
}

/// Default community detection resolution: 1.0.
fn default_resolution() -> f32 {
    1.0
}

/// Default minimum community size: 3.
fn default_min_community_size() -> usize {
    3
}
/// Default database type: `"sqlite"`.
fn default_database_type() -> String {
    String::from("sqlite")
}

/// Default database path: `"./graphrag.db"`.
fn default_database_path() -> String {
    String::from("./graphrag.db")
}

/// Default pool size: 10.
fn default_pool_size() -> usize {
    10
}

/// Default primary LLM: `"gpt-4"`.
fn default_primary_llm() -> String {
    String::from("gpt-4")
}

/// Default embedding model: `"text-embedding-ada-002"`.
fn default_embedding_model() -> String {
    String::from("text-embedding-ada-002")
}

/// Default maximum context length: 4096.
fn default_max_context() -> usize {
    4096
}

/// Default top-p value: 0.9.
fn default_top_p() -> f32 {
    0.9
}
/// Default Ollama URL (host and port combined): `"http://localhost:11434"`.
fn default_ollama_url() -> String {
    String::from("http://localhost:11434")
}

/// Default Ollama model: `"llama2:7b"`.
fn default_ollama_model() -> String {
    String::from("llama2:7b")
}

/// Default Ollama embedding model: `"nomic-embed-text"`.
fn default_ollama_embedding() -> String {
    String::from("nomic-embed-text")
}

/// Default batch size: 100.
fn default_batch_size() -> usize {
    100
}

/// Default number of worker threads: 4.
fn default_worker_threads() -> usize {
    4
}

/// Default memory limit: 1024 (used for `memory_limit_mb`).
fn default_memory_limit() -> usize {
    1024
}

/// Default Ollama host, without port: `"http://localhost"`.
fn default_ollama_host() -> String {
    String::from("http://localhost")
}

/// Default Ollama port: 11434.
fn default_ollama_port() -> u16 {
    11434
}

/// Default chat model: `"llama3.1:8b"`.
fn default_chat_model() -> String {
    String::from("llama3.1:8b")
}

/// Default embedding model for the Ollama section: `"nomic-embed-text"`.
fn default_embedding_model_ollama() -> String {
    String::from("nomic-embed-text")
}

/// Default timeout: 60 (used for `timeout_seconds`).
fn default_timeout() -> u64 {
    60
}

/// Default maximum number of retries: 3.
fn default_max_retries() -> u32 {
    3
}
/// Default maximum number of gleaning rounds: 3.
fn default_gleaning_rounds() -> usize {
    3
}

/// Default gleaning improvement threshold: 0.1.
fn default_gleaning_improvement() -> f32 {
    0.1
}

/// Default merge similarity threshold: 0.85.
fn default_merge_threshold() -> f32 {
    0.85
}
1291
1292// =============================================================================
1293// Default functions for Pipeline Approach Configuration
1294// =============================================================================
1295
// Mode defaults

/// Default pipeline approach: `"semantic"`.
fn default_approach() -> String {
    // The semantic pipeline is the one selected when no approach is given.
    String::from("semantic")
}
1300
// Semantic pipeline defaults

/// Default semantic embedding backend: `"huggingface"`.
fn default_semantic_embedding_backend() -> String {
    String::from("huggingface")
}

/// Default semantic embedding model.
fn default_semantic_embedding_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2")
}

/// Default semantic embedding dimension: 384 (the MiniLM-L6-v2 dimension).
fn default_semantic_embedding_dim() -> usize {
    384
}

/// Default similarity metric: `"cosine"`.
fn default_similarity_metric() -> String {
    String::from("cosine")
}

/// Default semantic entity extraction method: `"llm"`.
fn default_semantic_entity_method() -> String {
    String::from("llm")
}

/// Default maximum gleaning rounds for the semantic pipeline: 3.
fn default_max_gleaning_rounds() -> usize {
    3
}

/// Default temperature for the semantic pipeline: 0.1.
fn default_semantic_temperature() -> f32 {
    0.1
}

/// Default confidence threshold for the semantic pipeline: 0.7.
fn default_semantic_confidence() -> f32 {
    0.7
}

/// Default semantic retrieval strategy: `"vector"`.
fn default_semantic_retrieval_strategy() -> String {
    String::from("vector")
}

/// Default HNSW `ef_construction` value: 200.
fn default_hnsw_ef_construction() -> usize {
    200
}

/// Default HNSW `m` value: 16.
fn default_hnsw_m() -> usize {
    16
}

/// Default number of top results: 10.
fn default_top_k() -> usize {
    10
}

/// Default semantic similarity threshold: 0.7.
fn default_semantic_similarity_threshold() -> f32 {
    0.7
}

/// Default semantic relation scorer: `"embedding_similarity"`.
fn default_semantic_relation_scorer() -> String {
    String::from("embedding_similarity")
}
1344
// Algorithmic pipeline defaults

/// Default algorithmic embedding backend: `"hash"`.
fn default_algorithmic_embedding_backend() -> String {
    String::from("hash")
}

/// Default algorithmic embedding dimension: 128.
fn default_algorithmic_embedding_dim() -> usize {
    128
}

/// Default vocabulary size: 10000.
fn default_vocabulary_size() -> usize {
    10_000
}

/// Default minimum term frequency: 2.
fn default_min_term_frequency() -> usize {
    2
}

/// Default maximum document frequency: 0.8.
fn default_max_document_frequency() -> f32 {
    0.8
}

/// Default algorithmic entity extraction method: `"pattern"`.
fn default_algorithmic_entity_method() -> String {
    String::from("pattern")
}

/// Default confidence threshold for the algorithmic pipeline: 0.75.
fn default_algorithmic_confidence() -> f32 {
    0.75
}

/// Default algorithmic retrieval strategy: `"bm25"`.
fn default_algorithmic_retrieval_strategy() -> String {
    String::from("bm25")
}

/// Default BM25 `k1` parameter: 1.5.
fn default_bm25_k1() -> f32 {
    1.5
}

/// Default BM25 `b` parameter: 0.75.
fn default_bm25_b() -> f32 {
    0.75
}

/// Default language: `"english"`.
fn default_language() -> String {
    String::from("english")
}

/// Default algorithmic relation scorer: `"jaccard"`.
fn default_algorithmic_relation_scorer() -> String {
    String::from("jaccard")
}

/// Default co-occurrence window size: 10.
fn default_cooccurrence_window() -> usize {
    10
}

/// Default minimum relation score for the algorithmic pipeline: 0.6.
fn default_algorithmic_min_relation_score() -> f32 {
    0.6
}
1388
// Hybrid pipeline defaults

/// Default weight of the semantic approach: 0.6.
fn default_hybrid_semantic_weight() -> f32 {
    0.6
}

/// Default weight of the algorithmic approach: 0.4.
fn default_hybrid_algorithmic_weight() -> f32 {
    0.4
}

/// Default weight of LLM extraction: 0.7.
fn default_hybrid_llm_weight() -> f32 {
    0.7
}

/// Default weight of pattern extraction: 0.3.
fn default_hybrid_pattern_weight() -> f32 {
    0.3
}

/// Default confidence boost: 0.15.
fn default_hybrid_confidence_boost() -> f32 {
    0.15
}

/// Default hybrid retrieval strategy: `"fusion"`.
fn default_hybrid_retrieval_strategy() -> String {
    String::from("fusion")
}

/// Default weight of vector search: 0.6.
fn default_hybrid_vector_weight() -> f32 {
    0.6
}

/// Default weight of BM25 search: 0.4.
fn default_hybrid_bm25_weight() -> f32 {
    0.4
}

/// Default RRF constant: 60.
fn default_rrf_constant() -> usize {
    60
}

/// Default hybrid fallback strategy: `"semantic_first"`.
fn default_hybrid_fallback_strategy() -> String {
    String::from("semantic_first")
}

/// Default auto-save interval: 300 (5 minutes).
fn default_auto_save_interval() -> u64 {
    300
}

/// Default number of auto-save versions kept: 5.
fn default_max_auto_save_versions() -> usize {
    5
}
1426
// LazyGraphRAG default functions

/// Default minimum concept length: 3 characters.
fn default_min_concept_length() -> usize {
    3
}

/// Default maximum number of words per concept: 5.
fn default_max_concept_words() -> usize {
    5
}

/// Default co-occurrence threshold: at least 1 shared chunk for a relationship.
fn default_co_occurrence_threshold() -> usize {
    1
}

/// Default maximum number of query refinement iterations: 3.
fn default_max_refinement_iterations() -> usize {
    3
}
1440
// E2GraphRAG default functions

/// Default entity types: PERSON, ORGANIZATION, LOCATION and CONCEPT.
fn default_e2_entity_types() -> Vec<String> {
    ["PERSON", "ORGANIZATION", "LOCATION", "CONCEPT"]
        .iter()
        .map(|label| String::from(*label))
        .collect()
}

/// Default minimum confidence for pattern-based extraction: 0.6 (60%).
fn default_e2_min_confidence() -> f32 {
    0.6
}

/// Default minimum entity frequency: 1 (an entity must appear at least once).
fn default_min_entity_frequency() -> usize {
    1
}
1456
1457impl Default for GeneralConfig {
1458    fn default() -> Self {
1459        Self {
1460            log_level: default_log_level(),
1461            output_dir: default_output_dir(),
1462            input_document_path: None,
1463            max_threads: None,
1464            enable_profiling: false,
1465        }
1466    }
1467}
1468
1469impl Default for PipelineConfig {
1470    fn default() -> Self {
1471        Self {
1472            workflows: default_workflows(),
1473            parallel_execution: default_true(),
1474            text_extraction: TextExtractionConfig::default(),
1475            entity_extraction: EntityExtractionConfig::default(),
1476            graph_building: GraphBuildingConfig::default(),
1477            community_detection: CommunityDetectionConfig::default(),
1478        }
1479    }
1480}
1481
1482impl Default for TextExtractionConfig {
1483    fn default() -> Self {
1484        Self {
1485            chunk_size: default_chunk_size(),
1486            chunk_overlap: default_chunk_overlap(),
1487            clean_control_chars: default_true(),
1488            min_chunk_size: default_min_chunk_size(),
1489            cleaning: None,
1490        }
1491    }
1492}
1493
1494impl Default for EntityExtractionConfig {
1495    fn default() -> Self {
1496        Self {
1497            model_name: default_ner_model(),
1498            temperature: default_temperature(),
1499            max_tokens: default_max_tokens(),
1500            entity_types: None,
1501            confidence_threshold: default_confidence_threshold(),
1502            custom_prompt: None,
1503            filters: None,
1504        }
1505    }
1506}
1507
1508impl Default for GraphBuildingConfig {
1509    fn default() -> Self {
1510        Self {
1511            relation_scorer: default_relation_scorer(),
1512            min_relation_score: default_min_relation_score(),
1513            max_connections_per_node: default_max_connections(),
1514            bidirectional_relations: default_true(),
1515        }
1516    }
1517}
1518
1519impl Default for CommunityDetectionConfig {
1520    fn default() -> Self {
1521        Self {
1522            algorithm: default_community_algorithm(),
1523            resolution: default_resolution(),
1524            min_community_size: default_min_community_size(),
1525            max_community_size: 0,
1526        }
1527    }
1528}
1529
1530impl Default for StorageConfig {
1531    fn default() -> Self {
1532        Self {
1533            database_type: default_database_type(),
1534            database_path: default_database_path(),
1535            enable_wal: default_true(),
1536            postgresql: None,
1537            neo4j: None,
1538        }
1539    }
1540}
1541
1542impl Default for ModelsConfig {
1543    fn default() -> Self {
1544        Self {
1545            primary_llm: default_primary_llm(),
1546            embedding_model: default_embedding_model(),
1547            max_context_length: default_max_context(),
1548            llm_params: None,
1549            local: None,
1550        }
1551    }
1552}
1553
1554impl Default for PerformanceConfig {
1555    fn default() -> Self {
1556        Self {
1557            batch_processing: default_true(),
1558            batch_size: default_batch_size(),
1559            worker_threads: default_worker_threads(),
1560            memory_limit_mb: default_memory_limit(),
1561        }
1562    }
1563}
1564
1565impl Default for OllamaSetConfig {
1566    fn default() -> Self {
1567        Self {
1568            enabled: default_true(),
1569            host: default_ollama_host(),
1570            port: default_ollama_port(),
1571            chat_model: default_chat_model(),
1572            embedding_model: default_embedding_model_ollama(),
1573            timeout_seconds: default_timeout(),
1574            max_retries: default_max_retries(),
1575            fallback_to_hash: false,
1576            max_tokens: Some(800),
1577            temperature: Some(0.3),
1578            keep_alive: None,
1579            num_ctx: None,
1580        }
1581    }
1582}
1583
1584// =============================================================================
1585// Default implementations for Pipeline Approach Configuration
1586// =============================================================================
1587
1588impl Default for SemanticPipelineConfig {
1589    fn default() -> Self {
1590        Self {
1591            enabled: true,
1592            embeddings: SemanticEmbeddingsConfig::default(),
1593            entity_extraction: SemanticEntityConfig::default(),
1594            retrieval: SemanticRetrievalConfig::default(),
1595            graph_construction: SemanticGraphConfig::default(),
1596        }
1597    }
1598}
1599
1600impl Default for SemanticEmbeddingsConfig {
1601    fn default() -> Self {
1602        Self {
1603            backend: default_semantic_embedding_backend(),
1604            model: default_semantic_embedding_model(),
1605            dimension: default_semantic_embedding_dim(),
1606            use_gpu: default_true(),
1607            similarity_metric: default_similarity_metric(),
1608            batch_size: default_batch_size(),
1609        }
1610    }
1611}
1612
1613impl Default for SemanticEntityConfig {
1614    fn default() -> Self {
1615        Self {
1616            method: default_semantic_entity_method(),
1617            use_gleaning: default_true(),
1618            max_gleaning_rounds: default_max_gleaning_rounds(),
1619            model: default_chat_model(),
1620            temperature: default_semantic_temperature(),
1621            confidence_threshold: default_semantic_confidence(),
1622        }
1623    }
1624}
1625
1626impl Default for SemanticRetrievalConfig {
1627    fn default() -> Self {
1628        Self {
1629            strategy: default_semantic_retrieval_strategy(),
1630            use_hnsw: default_true(),
1631            hnsw_ef_construction: default_hnsw_ef_construction(),
1632            hnsw_m: default_hnsw_m(),
1633            top_k: default_top_k(),
1634            similarity_threshold: default_semantic_similarity_threshold(),
1635        }
1636    }
1637}
1638
1639impl Default for SemanticGraphConfig {
1640    fn default() -> Self {
1641        Self {
1642            relation_scorer: default_semantic_relation_scorer(),
1643            use_transformer_embeddings: default_true(),
1644            min_relation_score: default_min_relation_score(),
1645        }
1646    }
1647}
1648
1649impl Default for AlgorithmicEmbeddingsConfig {
1650    fn default() -> Self {
1651        Self {
1652            backend: default_algorithmic_embedding_backend(),
1653            dimension: default_algorithmic_embedding_dim(),
1654            use_tfidf: default_true(),
1655            vocabulary_size: default_vocabulary_size(),
1656            min_term_frequency: default_min_term_frequency(),
1657            max_document_frequency: default_max_document_frequency(),
1658        }
1659    }
1660}
1661
1662impl Default for AlgorithmicEntityConfig {
1663    fn default() -> Self {
1664        Self {
1665            method: default_algorithmic_entity_method(),
1666            use_ner_rules: default_true(),
1667            use_pos_tagging: false,
1668            min_entity_length: default_min_entity_length(),
1669            confidence_threshold: default_algorithmic_confidence(),
1670            patterns: None,
1671        }
1672    }
1673}
1674
1675impl Default for AlgorithmicRetrievalConfig {
1676    fn default() -> Self {
1677        Self {
1678            strategy: default_algorithmic_retrieval_strategy(),
1679            k1: default_bm25_k1(),
1680            b: default_bm25_b(),
1681            use_stemming: default_true(),
1682            language: default_language(),
1683            top_k: default_top_k(),
1684        }
1685    }
1686}
1687
1688impl Default for AlgorithmicGraphConfig {
1689    fn default() -> Self {
1690        Self {
1691            relation_scorer: default_algorithmic_relation_scorer(),
1692            use_cooccurrence: default_true(),
1693            window_size: default_cooccurrence_window(),
1694            min_relation_score: default_algorithmic_min_relation_score(),
1695        }
1696    }
1697}
1698
1699impl Default for HybridPipelineConfig {
1700    fn default() -> Self {
1701        Self {
1702            enabled: false,
1703            weights: HybridWeightsConfig::default(),
1704            embeddings: HybridEmbeddingsConfig::default(),
1705            entity_extraction: HybridEntityConfig::default(),
1706            retrieval: HybridRetrievalConfig::default(),
1707            graph_construction: HybridGraphConfig::default(),
1708            fallback_strategy: default_hybrid_fallback_strategy(),
1709            cross_validation: default_true(),
1710        }
1711    }
1712}
1713
1714impl Default for HybridWeightsConfig {
1715    fn default() -> Self {
1716        Self {
1717            semantic_weight: default_hybrid_semantic_weight(),
1718            algorithmic_weight: default_hybrid_algorithmic_weight(),
1719        }
1720    }
1721}
1722
1723impl Default for HybridEmbeddingsConfig {
1724    fn default() -> Self {
1725        Self {
1726            primary: default_semantic_embedding_backend(),
1727            fallback: default_algorithmic_embedding_backend(),
1728            combine_scores: default_true(),
1729            auto_fallback: default_true(),
1730        }
1731    }
1732}
1733
1734impl Default for HybridEntityConfig {
1735    fn default() -> Self {
1736        Self {
1737            use_both: default_true(),
1738            llm_weight: default_hybrid_llm_weight(),
1739            pattern_weight: default_hybrid_pattern_weight(),
1740            cross_validate: default_true(),
1741            confidence_boost: default_hybrid_confidence_boost(),
1742        }
1743    }
1744}
1745
1746impl Default for HybridRetrievalConfig {
1747    fn default() -> Self {
1748        Self {
1749            strategy: default_hybrid_retrieval_strategy(),
1750            combine_vector_bm25: default_true(),
1751            vector_weight: default_hybrid_vector_weight(),
1752            bm25_weight: default_hybrid_bm25_weight(),
1753            rrf_constant: default_rrf_constant(),
1754        }
1755    }
1756}
1757
1758impl Default for HybridGraphConfig {
1759    fn default() -> Self {
1760        Self {
1761            primary_scorer: default_semantic_relation_scorer(),
1762            fallback_scorer: default_algorithmic_relation_scorer(),
1763            combine_scores: default_true(),
1764        }
1765    }
1766}
1767
1768impl SetConfig {
1769    /// Load configuration from TOML or JSON5 file (auto-detects format by extension)
1770    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
1771        let path_ref = path.as_ref();
1772        let content = fs::read_to_string(path_ref)?;
1773
1774        // Detect format by file extension
1775        let extension = path_ref.extension().and_then(|e| e.to_str()).unwrap_or("");
1776
1777        let config: SetConfig = match extension {
1778            #[cfg(feature = "json5-support")]
1779            "json5" | "json" => {
1780                json5::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1781                    message: format!("JSON5 parse error: {e}"),
1782                })?
1783            },
1784            #[cfg(not(feature = "json5-support"))]
1785            "json5" | "json" => {
1786                return Err(crate::core::GraphRAGError::Config {
1787                    message: "JSON5 support not enabled. Rebuild with --features json5-support"
1788                        .to_string(),
1789                });
1790            },
1791            _ => toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1792                message: format!("TOML parse error: {e}"),
1793            })?,
1794        };
1795
1796        Ok(config)
1797    }
1798
1799    /// Save configuration to TOML file with comments
1800    pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
1801        let toml_string =
1802            toml::to_string_pretty(&self).map_err(|e| crate::core::GraphRAGError::Config {
1803                message: format!("TOML serialize error: {e}"),
1804            })?;
1805
1806        // Add header comment
1807        let commented_toml = format!(
1808            "# =============================================================================\n\
1809             # GraphRAG Configuration File\n\
1810             # Complete configuration with extensive parameters for easy customization\n\
1811             # =============================================================================\n\n{toml_string}"
1812        );
1813
1814        fs::write(path, commented_toml)?;
1815        Ok(())
1816    }
1817
1818    /// Convert to the existing Config format for compatibility
1819    pub fn to_graphrag_config(&self) -> crate::Config {
1820        let mut config = crate::Config {
1821            approach: self.mode.approach.clone(),
1822            ..Default::default()
1823        };
1824
1825        // Map text processing
1826        config.text.chunk_size = self.pipeline.text_extraction.chunk_size;
1827        config.text.chunk_overlap = self.pipeline.text_extraction.chunk_overlap;
1828
1829        // Map entity extraction based on approach
1830        config.entities.min_confidence = self.entity_extraction.min_confidence;
1831
1832        // Map entity types from pipeline.entity_extraction
1833        if let Some(ref types) = self.pipeline.entity_extraction.entity_types {
1834            config.entities.entity_types = types.clone();
1835        }
1836
1837        // Configure gleaning based on approach:
1838        // - semantic: use LLM-based gleaning
1839        // - algorithmic: use pattern-based extraction
1840        // - hybrid: use both (for compatibility, map to gleaning)
1841        match self.mode.approach.as_str() {
1842            "semantic" => {
1843                if let Some(ref semantic) = self.semantic {
1844                    config.entities.use_gleaning = semantic.entity_extraction.use_gleaning;
1845                    config.entities.max_gleaning_rounds =
1846                        semantic.entity_extraction.max_gleaning_rounds;
1847                    config.entities.min_confidence =
1848                        semantic.entity_extraction.confidence_threshold;
1849                } else {
1850                    // No semantic sub-section: use top-level entity_extraction settings directly
1851                    config.entities.use_gleaning = self.entity_extraction.use_gleaning;
1852                    config.entities.max_gleaning_rounds =
1853                        self.entity_extraction.max_gleaning_rounds;
1854                    config.entities.min_confidence = self.entity_extraction.min_confidence;
1855                }
1856            },
1857            "algorithmic" => {
1858                // Algorithmic uses pattern-based extraction, no gleaning
1859                config.entities.use_gleaning = false;
1860                if let Some(ref algorithmic) = self.algorithmic {
1861                    config.entities.min_confidence =
1862                        algorithmic.entity_extraction.confidence_threshold;
1863                }
1864            },
1865            "hybrid" => {
1866                // Hybrid can use both, enable gleaning for LLM component
1867                config.entities.use_gleaning = true;
1868                if self.hybrid.is_some() {
1869                    // Use hybrid configuration if available
1870                    config.entities.max_gleaning_rounds = 2; // Reduced for hybrid efficiency
1871                }
1872            },
1873            _ => {
1874                // Unknown approach, use top-level config as fallback
1875                config.entities.use_gleaning = self.entity_extraction.use_gleaning;
1876                config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
1877            },
1878        }
1879
1880        // Map graph building
1881        config.graph.similarity_threshold = self.pipeline.graph_building.min_relation_score;
1882        config.graph.max_connections = self.pipeline.graph_building.max_connections_per_node;
1883        config.graph.extract_relationships = true; // Enable by default for TOML configs
1884        config.graph.relationship_confidence_threshold = 0.5; // Default threshold
1885
1886        // Map retrieval
1887        config.retrieval.top_k = 10; // Default
1888
1889        // Map embeddings
1890        config.embeddings.dimension = 768; // Default for nomic-embed-text
1891        config.embeddings.backend = "ollama".to_string();
1892        config.embeddings.fallback_to_hash = self.ollama.fallback_to_hash;
1893
1894        // Map parallel processing
1895        config.parallel.enabled = self.pipeline.parallel_execution;
1896        config.parallel.num_threads = self.performance.worker_threads;
1897
1898        // Map Ollama configuration
1899        config.ollama = crate::ollama::OllamaConfig {
1900            enabled: self.ollama.enabled,
1901            host: self.ollama.host.clone(),
1902            port: self.ollama.port,
1903            chat_model: self.ollama.chat_model.clone(),
1904            embedding_model: self.ollama.embedding_model.clone(),
1905            timeout_seconds: self.ollama.timeout_seconds,
1906            max_retries: self.ollama.max_retries,
1907            fallback_to_hash: self.ollama.fallback_to_hash,
1908            max_tokens: self.ollama.max_tokens,
1909            temperature: self.ollama.temperature,
1910            enable_caching: true,
1911            keep_alive: self.ollama.keep_alive.clone(),
1912            num_ctx: self.ollama.num_ctx,
1913        };
1914
1915        // Map GLiNER configuration
1916        config.gliner = crate::config::GlinerConfig {
1917            enabled: self.gliner.enabled,
1918            model_path: self.gliner.model_path.clone(),
1919            tokenizer_path: self.gliner.tokenizer_path.clone(),
1920            mode: self.gliner.mode.clone(),
1921            entity_labels: self.gliner.entity_labels.clone(),
1922            relation_labels: self.gliner.relation_labels.clone(),
1923            entity_threshold: self.gliner.entity_threshold,
1924            relation_threshold: self.gliner.relation_threshold,
1925            use_gpu: self.gliner.use_gpu,
1926            max_concurrent_chunks: self.gliner.max_concurrent_chunks,
1927        };
1928
1929        // Map auto-save configuration
1930        config.auto_save = crate::config::AutoSaveConfig {
1931            enabled: self.auto_save.enabled,
1932            base_dir: self.auto_save.base_dir.clone(),
1933            interval_seconds: self.auto_save.interval_seconds,
1934            workspace_name: self.auto_save.workspace_name.clone(),
1935            max_versions: self.auto_save.max_versions,
1936        };
1937
1938        config
1939    }
1940}
1941
#[cfg(test)]
mod drift_guard_tests {
    //! Default-value parity checks between the serde-facing `*SetConfig` leaf
    //! structs declared in this file and the runtime structs they are copied
    //! into (`GlinerConfig`, `AutoSaveConfig` from `crate::config`).
    //!
    //! The two sides are mirrored by hand (the "5-point-sync" documented in
    //! CLAUDE.md). Each test builds both `Default` values and compares them
    //! field by field, so a default changed on only one side fails the test
    //! run with a `<section>.<field> drifted` message.
    //!
    //! NOTE: `OllamaConfig` is *intentionally* not covered. Its runtime default
    //! is offline-first (`enabled = false`, `fallback_to_hash = true`), whereas
    //! `OllamaSetConfig` is the user-facing "I want Ollama" schema
    //! (`enabled = true`). That difference is by design, not drift.
    //!
    //! NOTE(review): only the fields listed in each test are compared. Other
    //! fields copied by the conversion above (`enabled`, `model_path`,
    //! `tokenizer_path`, `max_concurrent_chunks` for GLiNER; `base_dir`,
    //! `workspace_name` for auto-save) are not checked here -- confirm whether
    //! that omission is intentional before extending the lists.

    use super::*;
    use crate::config::{AutoSaveConfig, GlinerConfig};

    // Compares the named fields of two values in the order given. On mismatch
    // the panic message is "<section>.<field> drifted", built at compile time
    // from the section literal and the field identifier.
    macro_rules! assert_no_drift {
        ($section:literal, $toml_side:ident, $canonical:ident, $($field:ident),+ $(,)?) => {
            $(
                assert_eq!(
                    $toml_side.$field,
                    $canonical.$field,
                    concat!($section, ".", stringify!($field), " drifted")
                );
            )+
        };
    }

    /// The GLiNER TOML-schema defaults must equal the runtime defaults.
    #[test]
    fn gliner_setconfig_default_matches_runtime() {
        let toml_side = GlinerSetConfig::default();
        let canonical = GlinerConfig::default();

        assert_no_drift!(
            "gliner",
            toml_side,
            canonical,
            mode,
            entity_labels,
            relation_labels,
            entity_threshold,
            relation_threshold,
            use_gpu,
        );
    }

    /// The auto-save TOML-schema defaults must equal the runtime defaults.
    #[test]
    fn autosave_setconfig_default_matches_runtime() {
        let toml_side = AutoSaveSetConfig::default();
        let canonical = AutoSaveConfig::default();

        assert_no_drift!(
            "auto_save",
            toml_side,
            canonical,
            enabled,
            interval_seconds,
            max_versions,
        );
    }
}
graphrag_core/config/setconfig.rs

graphrag_core/config/
setconfig.rs