//! TOML Configuration System for GraphRAG
//! Complete configuration management with extensive TOML support
//!
//! File: graphrag_core/config/setconfig.rs
use std::fs;
use std::path::Path;

use serde::{Deserialize, Serialize};

use crate::Result;
8
/// Complete GraphRAG configuration loaded from TOML.
///
/// This is the root of the configuration tree. Every section carries
/// `#[serde(default)]`, so any table may be omitted from the TOML file and
/// the section's `Default` value is used instead; `Option` sections become
/// `None` when their table is absent.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SetConfig {
    /// Pipeline mode/approach configuration (selects semantic/algorithmic/hybrid)
    #[serde(default)]
    pub mode: ModeConfig,

    /// Semantic/Neural pipeline configuration (used when approach = "semantic")
    #[serde(default)]
    pub semantic: Option<SemanticPipelineConfig>,

    /// Algorithmic/Classic NLP pipeline configuration (used when approach = "algorithmic")
    #[serde(default)]
    pub algorithmic: Option<AlgorithmicPipelineConfig>,

    /// Hybrid pipeline configuration (used when approach = "hybrid")
    #[serde(default)]
    pub hybrid: Option<HybridPipelineConfig>,

    /// General system settings (logging, output paths, threading)
    #[serde(default)]
    pub general: GeneralConfig,

    /// Pipeline configuration (workflow order plus per-stage settings)
    #[serde(default)]
    pub pipeline: PipelineConfig,

    /// Storage configuration (database backend selection)
    #[serde(default)]
    pub storage: StorageConfig,

    /// Model configuration (LLM and embedding model selection)
    #[serde(default)]
    pub models: ModelsConfig,

    /// Performance tuning (batching, workers, memory limits)
    #[serde(default)]
    pub performance: PerformanceConfig,

    /// Ollama-specific configuration
    #[serde(default)]
    pub ollama: OllamaSetConfig,

    /// GLiNER-Relex extractor configuration
    #[serde(default)]
    pub gliner: GlinerSetConfig,

    /// Experimental features
    #[serde(default)]
    pub experimental: ExperimentalConfig,

    /// Top-level entity extraction configuration (for gleaning)
    #[serde(default)]
    pub entity_extraction: EntityExtractionTopLevelConfig,

    /// Auto-save configuration for workspace persistence
    #[serde(default)]
    pub auto_save: AutoSaveSetConfig,
}
68
/// Auto-save / storage configuration.
///
/// Controls whether the built graph is persisted to disk and, if so, where
/// and how often.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoSaveSetConfig {
    /// Enable persistent storage.
    /// `false` (default) → graph lives in memory only.
    /// `true` → graph is saved to disk after `build_graph()` and reloaded on the next run.
    #[serde(default)]
    pub enabled: bool,

    /// Base directory for workspace storage. Required when `enabled = true`.
    /// Example: `"./output"` or `"/data/graphrag"`.
    /// The workspace folder is created at `<base_dir>/<workspace_name>/`.
    #[serde(default)]
    pub base_dir: Option<String>,

    /// Auto-save interval in seconds (0 = save after every graph build)
    #[serde(default = "default_auto_save_interval")]
    pub interval_seconds: u64,

    /// Workspace name — sub-folder inside `base_dir` (default: "default").
    #[serde(default)]
    pub workspace_name: Option<String>,

    /// Maximum number of auto-save versions to keep (0 = unlimited)
    #[serde(default = "default_max_auto_save_versions")]
    pub max_versions: usize,
}
96
97impl Default for AutoSaveSetConfig {
98    fn default() -> Self {
99        Self {
100            enabled: false,
101            base_dir: None,
102            interval_seconds: default_auto_save_interval(),
103            workspace_name: None,
104            max_versions: default_max_auto_save_versions(),
105        }
106    }
107}
108
/// General system configuration settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneralConfig {
    /// Logging level (error, warn, info, debug, trace)
    #[serde(default = "default_log_level")]
    pub log_level: String,

    /// Output directory for results
    #[serde(default = "default_output_dir")]
    pub output_dir: String,

    /// Path to the input document to process (`None` when not set in TOML)
    #[serde(default)]
    pub input_document_path: Option<String>,

    /// Maximum threads (0 = auto-detect); `None` when not set in TOML
    #[serde(default)]
    pub max_threads: Option<usize>,

    /// Enable performance profiling (defaults to `false`)
    #[serde(default)]
    pub enable_profiling: bool,
}
132
/// Pipeline execution configuration.
///
/// Holds the ordered workflow list plus per-stage settings; each stage
/// sub-table may be omitted from the TOML and falls back to its `Default`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Workflows to execute in sequence
    #[serde(default = "default_workflows")]
    pub workflows: Vec<String>,

    /// Enable parallel execution (defaults to `true`)
    #[serde(default = "default_true")]
    pub parallel_execution: bool,

    /// Text extraction configuration
    #[serde(default)]
    pub text_extraction: TextExtractionConfig,

    /// Entity extraction configuration
    #[serde(default)]
    pub entity_extraction: EntityExtractionConfig,

    /// Graph building configuration
    #[serde(default)]
    pub graph_building: GraphBuildingConfig,

    /// Community detection configuration
    #[serde(default)]
    pub community_detection: CommunityDetectionConfig,
}
160
/// Text extraction and chunking configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextExtractionConfig {
    /// Chunk size for text splitting
    /// (unit — characters vs tokens — is decided by the chunker; confirm at call site)
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,

    /// Overlap between consecutive chunks (same unit as `chunk_size`)
    #[serde(default = "default_chunk_overlap")]
    pub chunk_overlap: usize,

    /// Clean control characters (defaults to `true`)
    #[serde(default = "default_true")]
    pub clean_control_chars: bool,

    /// Minimum chunk size to keep; smaller chunks are discarded
    #[serde(default = "default_min_chunk_size")]
    pub min_chunk_size: usize,

    /// Text cleaning options (`None` → no extra cleaning configured)
    #[serde(default)]
    pub cleaning: Option<CleaningConfig>,
}
184
/// Text cleaning options configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CleaningConfig {
    /// Remove URLs from text (defaults to `false`)
    #[serde(default)]
    pub remove_urls: bool,

    /// Remove email addresses (defaults to `false`)
    #[serde(default)]
    pub remove_emails: bool,

    /// Normalize whitespace (defaults to `true`)
    #[serde(default = "default_true")]
    pub normalize_whitespace: bool,

    /// Remove special characters (defaults to `false`)
    #[serde(default)]
    pub remove_special_chars: bool,
}
204
/// Entity extraction configuration (pipeline-stage level).
///
/// `Option` fields without `#[serde(default)]` still deserialize to `None`
/// when absent — serde treats missing `Option` fields as `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionConfig {
    /// Model name for NER
    #[serde(default = "default_ner_model")]
    pub model_name: String,

    /// Temperature for entity extraction (0.0 = fully deterministic JSON output)
    #[serde(default = "default_extraction_temperature")]
    pub temperature: f32,

    /// Maximum tokens for extraction
    #[serde(default = "default_max_tokens")]
    pub max_tokens: usize,

    /// Entity types to extract (dynamic configuration; `None` → extractor's own set)
    pub entity_types: Option<Vec<String>>,

    /// Confidence threshold for entity extraction (top-level)
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Custom extraction prompt (`None` → built-in prompt)
    pub custom_prompt: Option<String>,

    /// Entity filtering options
    #[serde(default)]
    pub filters: Option<EntityFiltersConfig>,
}
234
/// Entity filtering configuration.
///
/// Post-extraction filters applied to candidate entities; `None` pattern/type
/// lists mean the corresponding filter is not applied.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityFiltersConfig {
    /// Minimum entity length
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Maximum entity length
    #[serde(default = "default_max_entity_length")]
    pub max_entity_length: usize,

    /// Allowed entity types (`None` → all types allowed)
    pub allowed_entity_types: Option<Vec<String>>,

    /// Confidence threshold
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Allowed regex patterns for entity matching
    pub allowed_patterns: Option<Vec<String>>,

    /// Excluded regex patterns for entity filtering
    pub excluded_patterns: Option<Vec<String>>,

    /// Enable fuzzy matching for entity resolution (defaults to `false`)
    #[serde(default)]
    pub enable_fuzzy_matching: bool,
}
263
/// Graph building configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphBuildingConfig {
    /// Relation scoring algorithm (string key resolved by the graph builder)
    #[serde(default = "default_relation_scorer")]
    pub relation_scorer: String,

    /// Minimum relation score threshold; relations below it are dropped
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,

    /// Maximum connections per node
    #[serde(default = "default_max_connections")]
    pub max_connections_per_node: usize,

    /// Use bidirectional relationships (defaults to `true`)
    #[serde(default = "default_true")]
    pub bidirectional_relations: bool,
}
283
/// Community detection configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommunityDetectionConfig {
    /// Algorithm for community detection (string key resolved at runtime)
    #[serde(default = "default_community_algorithm")]
    pub algorithm: String,

    /// Resolution parameter (higher values typically yield smaller communities)
    #[serde(default = "default_resolution")]
    pub resolution: f32,

    /// Minimum community size
    #[serde(default = "default_min_community_size")]
    pub min_community_size: usize,

    /// Maximum community size (0 = unlimited; defaults to 0)
    #[serde(default)]
    pub max_community_size: usize,
}
303
/// Storage backend configuration.
///
/// `database_type` selects the backend; the matching sub-table
/// (`postgresql` / `neo4j`) supplies its connection details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageConfig {
    /// Database type
    #[serde(default = "default_database_type")]
    pub database_type: String,

    /// Database path for SQLite
    #[serde(default = "default_database_path")]
    pub database_path: String,

    /// Enable WAL (write-ahead logging) for SQLite (defaults to `true`)
    #[serde(default = "default_true")]
    pub enable_wal: bool,

    /// PostgreSQL configuration (`None` when the table is absent)
    pub postgresql: Option<PostgreSQLConfig>,

    /// Neo4j configuration (`None` when the table is absent)
    pub neo4j: Option<Neo4jConfig>,
}
325
/// PostgreSQL database configuration.
///
/// All connection fields are mandatory in the TOML (no serde defaults);
/// only `pool_size` may be omitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PostgreSQLConfig {
    /// PostgreSQL server host
    pub host: String,
    /// PostgreSQL server port
    pub port: u16,
    /// Database name
    pub database: String,
    /// Username for authentication
    pub username: String,
    /// Password for authentication
    pub password: String,
    /// Connection pool size
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
}
343
/// Neo4j graph database configuration.
///
/// `uri`, `username`, and `password` are mandatory in the TOML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Neo4jConfig {
    /// Neo4j server URI
    pub uri: String,
    /// Username for authentication
    pub username: String,
    /// Password for authentication
    pub password: String,
    /// Enable encrypted connections (defaults to `false`)
    #[serde(default)]
    pub encrypted: bool,
}
357
/// Model configuration for LLM and embeddings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelsConfig {
    /// Primary LLM for generation
    #[serde(default = "default_primary_llm")]
    pub primary_llm: String,

    /// Embedding model
    #[serde(default = "default_embedding_model")]
    pub embedding_model: String,

    /// Maximum context length (tokens)
    #[serde(default = "default_max_context")]
    pub max_context_length: usize,

    /// LLM sampling parameters (`None` → provider defaults)
    #[serde(default)]
    pub llm_params: Option<LLMParamsConfig>,

    /// Local model configuration (Ollama)
    #[serde(default)]
    pub local: Option<LocalModelsConfig>,
}
381
/// LLM generation parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMParamsConfig {
    /// Sampling temperature (0.0-2.0)
    #[serde(default = "default_temperature")]
    pub temperature: f32,

    /// Nucleus sampling parameter (0.0-1.0)
    #[serde(default = "default_top_p")]
    pub top_p: f32,

    /// Frequency penalty (-2.0-2.0; defaults to 0.0)
    #[serde(default)]
    pub frequency_penalty: f32,

    /// Presence penalty (-2.0-2.0; defaults to 0.0)
    #[serde(default)]
    pub presence_penalty: f32,

    /// Sequences where the model will stop generating (`None` → none configured)
    pub stop_sequences: Option<Vec<String>>,
}
404
/// Local model configuration (Ollama).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalModelsConfig {
    /// Ollama API base URL
    #[serde(default = "default_ollama_url")]
    pub ollama_base_url: String,

    /// Local model name for generation
    #[serde(default = "default_ollama_model")]
    pub model_name: String,

    /// Local embedding model name
    #[serde(default = "default_ollama_embedding")]
    pub embedding_model: String,
}
420
/// Performance tuning configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
    /// Enable batch processing (defaults to `true`)
    #[serde(default = "default_true")]
    pub batch_processing: bool,

    /// Batch size (items per batch)
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,

    /// Worker threads
    #[serde(default = "default_worker_threads")]
    pub worker_threads: usize,

    /// Memory limit per worker (MB)
    #[serde(default = "default_memory_limit")]
    pub memory_limit_mb: usize,
}
440
/// Ollama-specific configuration for local LLM and embeddings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaSetConfig {
    /// Enable Ollama integration (defaults to `true`)
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Ollama host
    #[serde(default = "default_ollama_host")]
    pub host: String,

    /// Ollama port
    #[serde(default = "default_ollama_port")]
    pub port: u16,

    /// Chat model name
    #[serde(default = "default_chat_model")]
    pub chat_model: String,

    /// Embedding model name
    #[serde(default = "default_embedding_model_ollama")]
    pub embedding_model: String,

    /// Request timeout in seconds
    #[serde(default = "default_timeout")]
    pub timeout_seconds: u64,

    /// Maximum retries per request
    #[serde(default = "default_max_retries")]
    pub max_retries: u32,

    /// Fallback to hash-based embeddings (defaults to `false`)
    #[serde(default)]
    pub fallback_to_hash: bool,

    /// Maximum tokens (`None` → model default)
    pub max_tokens: Option<u32>,

    /// Temperature (`None` → model default)
    pub temperature: Option<f32>,

    /// How long to keep the model loaded in memory (e.g. "1h", "30m", "0")
    ///
    /// Critical for KV Cache efficiency when processing multiple chunks from the same document.
    pub keep_alive: Option<String>,

    /// Context window size in tokens (overrides Ollama model default)
    ///
    /// Ollama silently truncates prompts exceeding the context window.
    /// Set this when processing long documents to avoid silent truncation.
    pub num_ctx: Option<u32>,
}
493
/// GLiNER-Relex extractor configuration.
///
/// Drives the ONNX-based GLiNER entity/relation extractor; disabled by
/// default since it requires a local model file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlinerSetConfig {
    /// Enable GLiNER-Relex extraction (defaults to `false`)
    #[serde(default)]
    pub enabled: bool,
    /// Path to the ONNX model file (empty string when unset)
    #[serde(default)]
    pub model_path: String,
    /// Path to tokenizer.json (defaults to same dir as model_path)
    #[serde(default)]
    pub tokenizer_path: String,
    /// "span" (default) or "token"
    #[serde(default = "default_gliner_mode")]
    pub mode: String,
    /// Entity labels to extract
    #[serde(default = "default_gliner_entity_labels")]
    pub entity_labels: Vec<String>,
    /// Relation labels to extract
    #[serde(default = "default_gliner_relation_labels")]
    pub relation_labels: Vec<String>,
    /// Minimum entity confidence threshold
    #[serde(default = "default_entity_threshold")]
    pub entity_threshold: f32,
    /// Minimum relation confidence threshold
    #[serde(default = "default_relation_threshold")]
    pub relation_threshold: f32,
    /// Use GPU (CUDA) for inference (defaults to `false`)
    #[serde(default)]
    pub use_gpu: bool,
}
525
/// Default GLiNER inference mode.
fn default_gliner_mode() -> String {
    "span".to_string()
}

/// Default entity labels extracted by GLiNER.
fn default_gliner_entity_labels() -> Vec<String> {
    vec![
        "person".to_string(),
        "organization".to_string(),
        "location".to_string(),
    ]
}

/// Default relation labels extracted by GLiNER.
fn default_gliner_relation_labels() -> Vec<String> {
    vec!["related to".to_string(), "part of".to_string()]
}

/// Default minimum entity confidence.
fn default_entity_threshold() -> f32 {
    0.4
}

/// Default minimum relation confidence.
fn default_relation_threshold() -> f32 {
    0.5
}
531
532impl Default for GlinerSetConfig {
533    fn default() -> Self {
534        Self {
535            enabled: false,
536            model_path: String::new(),
537            tokenizer_path: String::new(),
538            mode: default_gliner_mode(),
539            entity_labels: default_gliner_entity_labels(),
540            relation_labels: default_gliner_relation_labels(),
541            entity_threshold: default_entity_threshold(),
542            relation_threshold: default_relation_threshold(),
543            use_gpu: false,
544        }
545    }
546}
547
/// Experimental features configuration.
///
/// All flags default to `false`; the optional sub-tables hold mode-specific
/// settings for LazyGraphRAG and E2GraphRAG.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExperimentalConfig {
    /// Enable neural reranking
    #[serde(default)]
    pub neural_reranking: bool,

    /// Enable federated learning
    #[serde(default)]
    pub federated_learning: bool,

    /// Enable real-time updates
    #[serde(default)]
    pub real_time_updates: bool,

    /// Enable distributed processing
    #[serde(default)]
    pub distributed_processing: bool,

    /// Enable LazyGraphRAG mode (no prior summarization, 0.1% indexing cost)
    #[serde(default)]
    pub lazy_graphrag: bool,

    /// Enable E2GraphRAG mode (efficient entity extraction without LLM)
    #[serde(default)]
    pub e2_graphrag: bool,

    /// LazyGraphRAG configuration
    #[serde(default)]
    pub lazy_graphrag_config: Option<LazyGraphRAGConfig>,

    /// E2GraphRAG configuration
    #[serde(default)]
    pub e2_graphrag_config: Option<E2GraphRAGConfig>,
}
583
/// LazyGraphRAG configuration
/// Concept-based retrieval without prior summarization (Microsoft Research, 2025)
/// Achieves 0.1% of full GraphRAG indexing cost and 700x cheaper query costs
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LazyGraphRAGConfig {
    /// Enable concept extraction (noun phrases without LLM)
    #[serde(default = "default_true")]
    pub use_concept_extraction: bool,

    /// Minimum concept length in characters
    #[serde(default = "default_min_concept_length")]
    pub min_concept_length: usize,

    /// Maximum concept length in words
    #[serde(default = "default_max_concept_words")]
    pub max_concept_words: usize,

    /// Co-occurrence threshold (minimum shared chunks for relationship)
    #[serde(default = "default_co_occurrence_threshold")]
    pub co_occurrence_threshold: usize,

    /// Enable query refinement with iterative deepening
    #[serde(default = "default_true")]
    pub use_query_refinement: bool,

    /// Maximum refinement iterations
    #[serde(default = "default_max_refinement_iterations")]
    pub max_refinement_iterations: usize,

    /// Enable bidirectional entity-chunk indexing for fast lookups
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
617
618impl Default for LazyGraphRAGConfig {
619    fn default() -> Self {
620        Self {
621            use_concept_extraction: true,
622            min_concept_length: 3,
623            max_concept_words: 5,
624            co_occurrence_threshold: 1,
625            use_query_refinement: true,
626            max_refinement_iterations: 3,
627            use_bidirectional_index: true,
628        }
629    }
630}
631
/// E2GraphRAG configuration
/// Efficient entity extraction using SpaCy-like approach without LLM
/// Achieves 10x faster indexing and 100x faster retrieval
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct E2GraphRAGConfig {
    /// Enable lightweight NER (no LLM required)
    #[serde(default = "default_true")]
    pub use_lightweight_ner: bool,

    /// Entity types to extract (using pattern matching)
    #[serde(default = "default_e2_entity_types")]
    pub entity_types: Vec<String>,

    /// Minimum entity confidence for pattern-based extraction
    #[serde(default = "default_e2_min_confidence")]
    pub min_confidence: f32,

    /// Enable capitalization-based named entity detection
    #[serde(default = "default_true")]
    pub use_capitalization_detection: bool,

    /// Enable noun phrase extraction
    #[serde(default = "default_true")]
    pub use_noun_phrase_extraction: bool,

    /// Minimum entity frequency (entities must appear at least N times)
    #[serde(default = "default_min_entity_frequency")]
    pub min_entity_frequency: usize,

    /// Use fast co-occurrence for relationships (no LLM)
    #[serde(default = "default_true")]
    pub use_fast_cooccurrence: bool,

    /// Enable bidirectional entity-chunk indexing
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
669
670impl Default for E2GraphRAGConfig {
671    fn default() -> Self {
672        Self {
673            use_lightweight_ner: true,
674            entity_types: default_e2_entity_types(),
675            min_confidence: 0.6,
676            use_capitalization_detection: true,
677            use_noun_phrase_extraction: true,
678            min_entity_frequency: 1,
679            use_fast_cooccurrence: true,
680            use_bidirectional_index: true,
681        }
682    }
683}
684
// =============================================================================
// PIPELINE APPROACH CONFIGURATION (Semantic vs Algorithmic vs Hybrid)
// =============================================================================
688
/// Pipeline mode/approach configuration
/// Determines which pipeline implementation to use
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModeConfig {
    /// Pipeline approach: "semantic", "algorithmic", or "hybrid"
    /// - semantic: Neural embeddings + LLM extraction + vector search
    /// - algorithmic: Pattern matching + TF-IDF + BM25 keyword search
    /// - hybrid: Combines both with weighted fusion
    #[serde(default = "default_approach")]
    pub approach: String,
}
700
701impl Default for ModeConfig {
702    fn default() -> Self {
703        Self {
704            approach: default_approach(),
705        }
706    }
707}
708
/// Semantic/Neural pipeline configuration
/// Uses deep learning models for embeddings, entity extraction, and retrieval
///
/// Note: the four sub-config fields have no serde defaults, so a `[semantic]`
/// table must provide all four sub-tables to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticPipelineConfig {
    /// Enable semantic pipeline (defaults to `false`)
    #[serde(default)]
    pub enabled: bool,

    /// Embeddings configuration for semantic approach
    pub embeddings: SemanticEmbeddingsConfig,

    /// Entity extraction configuration for semantic approach
    pub entity_extraction: SemanticEntityConfig,

    /// Retrieval configuration for semantic approach
    pub retrieval: SemanticRetrievalConfig,

    /// Graph construction configuration for semantic approach
    pub graph_construction: SemanticGraphConfig,
}
729
/// Semantic embeddings configuration (neural models).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEmbeddingsConfig {
    /// Backend: "huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "ollama"
    #[serde(default = "default_semantic_embedding_backend")]
    pub backend: String,

    /// Model identifier (provider-specific)
    #[serde(default = "default_semantic_embedding_model")]
    pub model: String,

    /// Embedding dimension
    #[serde(default = "default_semantic_embedding_dim")]
    pub dimension: usize,

    /// Use GPU acceleration if available (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_gpu: bool,

    /// Similarity metric (cosine, euclidean, dot_product)
    #[serde(default = "default_similarity_metric")]
    pub similarity_metric: String,

    /// Batch size for embeddings generation
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
757
/// Semantic entity extraction configuration (LLM-based).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEntityConfig {
    /// Extraction method (always "llm" for semantic)
    #[serde(default = "default_semantic_entity_method")]
    pub method: String,

    /// Enable gleaning (iterative refinement; defaults to `true`)
    #[serde(default = "default_true")]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// LLM model for extraction
    #[serde(default = "default_chat_model")]
    pub model: String,

    /// Temperature for LLM
    #[serde(default = "default_semantic_temperature")]
    pub temperature: f32,

    /// Confidence threshold
    #[serde(default = "default_semantic_confidence")]
    pub confidence_threshold: f32,
}
785
/// Semantic retrieval configuration (vector search).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticRetrievalConfig {
    /// Retrieval strategy (always "vector" for semantic)
    #[serde(default = "default_semantic_retrieval_strategy")]
    pub strategy: String,

    /// Use HNSW index for fast approximate search (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_hnsw: bool,

    /// HNSW ef_construction parameter (build-time search width)
    #[serde(default = "default_hnsw_ef_construction")]
    pub hnsw_ef_construction: usize,

    /// HNSW M parameter (connections per node)
    #[serde(default = "default_hnsw_m")]
    pub hnsw_m: usize,

    /// Top-k results
    #[serde(default = "default_top_k")]
    pub top_k: usize,

    /// Similarity threshold; results below it are discarded
    #[serde(default = "default_semantic_similarity_threshold")]
    pub similarity_threshold: f32,
}
813
/// Semantic graph construction configuration (embedding-based).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticGraphConfig {
    /// Relation scorer (always "embedding_similarity" for semantic)
    #[serde(default = "default_semantic_relation_scorer")]
    pub relation_scorer: String,

    /// Use transformer embeddings for relationships (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_transformer_embeddings: bool,

    /// Minimum relation score
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,
}
829
830/// Algorithmic/Classic NLP pipeline configuration
831/// Uses pattern matching, TF-IDF, and keyword-based methods
832#[derive(Debug, Clone, Serialize, Deserialize, Default)]
833pub struct AlgorithmicPipelineConfig {
834    /// Enable algorithmic pipeline
835    #[serde(default)]
836    pub enabled: bool,
837
838    /// Embeddings configuration for algorithmic approach
839    pub embeddings: AlgorithmicEmbeddingsConfig,
840
841    /// Entity extraction configuration for algorithmic approach
842    pub entity_extraction: AlgorithmicEntityConfig,
843
844    /// Retrieval configuration for algorithmic approach
845    pub retrieval: AlgorithmicRetrievalConfig,
846
847    /// Graph construction configuration for algorithmic approach
848    pub graph_construction: AlgorithmicGraphConfig,
849}
850
/// Algorithmic embeddings configuration (hash-based, TF-IDF).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEmbeddingsConfig {
    /// Backend (always "hash" for algorithmic)
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub backend: String,

    /// Embedding dimension
    #[serde(default = "default_algorithmic_embedding_dim")]
    pub dimension: usize,

    /// Use TF-IDF weighting (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_tfidf: bool,

    /// Vocabulary size
    #[serde(default = "default_vocabulary_size")]
    pub vocabulary_size: usize,

    /// Minimum term frequency; rarer terms are ignored
    #[serde(default = "default_min_term_frequency")]
    pub min_term_frequency: usize,

    /// Maximum document frequency (0.0-1.0); more common terms are ignored
    #[serde(default = "default_max_document_frequency")]
    pub max_document_frequency: f32,
}
878
/// Algorithmic entity extraction configuration (pattern-based).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEntityConfig {
    /// Extraction method (always "pattern" for algorithmic)
    #[serde(default = "default_algorithmic_entity_method")]
    pub method: String,

    /// Use NER rules (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_ner_rules: bool,

    /// Use POS tagging (defaults to `false`)
    #[serde(default)]
    pub use_pos_tagging: bool,

    /// Minimum entity length
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Confidence threshold
    #[serde(default = "default_algorithmic_confidence")]
    pub confidence_threshold: f32,

    /// Regex patterns for entity matching (`None` → built-in patterns only)
    pub patterns: Option<Vec<String>>,
}
905
/// Algorithmic retrieval configuration (BM25 keyword search).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicRetrievalConfig {
    /// Retrieval strategy (always "bm25" for algorithmic)
    #[serde(default = "default_algorithmic_retrieval_strategy")]
    pub strategy: String,

    /// BM25 k1 parameter (term frequency saturation)
    #[serde(default = "default_bm25_k1")]
    pub k1: f32,

    /// BM25 b parameter (length normalization)
    #[serde(default = "default_bm25_b")]
    pub b: f32,

    /// Use stemming (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_stemming: bool,

    /// Language for stemming
    #[serde(default = "default_language")]
    pub language: String,

    /// Top-k results
    #[serde(default = "default_top_k")]
    pub top_k: usize,
}
933
/// Algorithmic graph construction configuration (token overlap).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicGraphConfig {
    /// Relation scorer (jaccard, cosine on token vectors)
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub relation_scorer: String,

    /// Use co-occurrence for relationship detection (defaults to `true`)
    #[serde(default = "default_true")]
    pub use_cooccurrence: bool,

    /// Co-occurrence window size
    /// (unit — tokens vs sentences — decided by the consumer; confirm at call site)
    #[serde(default = "default_cooccurrence_window")]
    pub window_size: usize,

    /// Minimum relation score
    #[serde(default = "default_algorithmic_min_relation_score")]
    pub min_relation_score: f32,
}
953
/// Hybrid pipeline configuration
/// Combines semantic and algorithmic approaches with weighted fusion
///
/// Note: the weights/embeddings/entity_extraction/retrieval/graph_construction
/// fields have no serde defaults, so a `[hybrid]` table must provide all of
/// those sub-tables to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridPipelineConfig {
    /// Enable hybrid pipeline (defaults to `false`)
    #[serde(default)]
    pub enabled: bool,

    /// Weight configuration for combining approaches
    pub weights: HybridWeightsConfig,

    /// Embeddings configuration for hybrid
    pub embeddings: HybridEmbeddingsConfig,

    /// Entity extraction configuration for hybrid
    pub entity_extraction: HybridEntityConfig,

    /// Retrieval configuration for hybrid
    pub retrieval: HybridRetrievalConfig,

    /// Graph construction configuration for hybrid
    pub graph_construction: HybridGraphConfig,

    /// Fallback strategy when primary fails
    #[serde(default = "default_hybrid_fallback_strategy")]
    pub fallback_strategy: String,

    /// Enable cross-validation between approaches (defaults to `true`)
    #[serde(default = "default_true")]
    pub cross_validation: bool,
}
985
/// Hybrid weight configuration
///
/// NOTE(review): nothing in this type enforces that the two weights sum to
/// 1.0 — confirm whether consumers normalize them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridWeightsConfig {
    /// Weight for semantic approach (0.0-1.0; default: 0.6)
    #[serde(default = "default_hybrid_semantic_weight")]
    pub semantic_weight: f32,

    /// Weight for algorithmic approach (0.0-1.0; default: 0.4)
    #[serde(default = "default_hybrid_algorithmic_weight")]
    pub algorithmic_weight: f32,
}
997
/// Hybrid embeddings configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEmbeddingsConfig {
    /// Primary backend (neural; default: "huggingface")
    #[serde(default = "default_semantic_embedding_backend")]
    pub primary: String,

    /// Fallback backend (hash-based; default: "hash")
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub fallback: String,

    /// Combine scores from both (default: true)
    #[serde(default = "default_true")]
    pub combine_scores: bool,

    /// Auto-fallback when primary unavailable (default: true)
    #[serde(default = "default_true")]
    pub auto_fallback: bool,
}
1017
/// Hybrid entity extraction configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEntityConfig {
    /// Use both LLM and pattern extraction (default: true)
    #[serde(default = "default_true")]
    pub use_both: bool,

    /// Weight for LLM extraction (0.0-1.0; default: 0.7)
    #[serde(default = "default_hybrid_llm_weight")]
    pub llm_weight: f32,

    /// Weight for pattern extraction (0.0-1.0; default: 0.3)
    #[serde(default = "default_hybrid_pattern_weight")]
    pub pattern_weight: f32,

    /// Cross-validate LLM results with patterns (default: true)
    #[serde(default = "default_true")]
    pub cross_validate: bool,

    /// Confidence boost when both agree (default: 0.15)
    #[serde(default = "default_hybrid_confidence_boost")]
    pub confidence_boost: f32,
}
1041
/// Hybrid retrieval configuration (RRF fusion)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridRetrievalConfig {
    /// Retrieval strategy (always "fusion" for hybrid; default: "fusion")
    #[serde(default = "default_hybrid_retrieval_strategy")]
    pub strategy: String,

    /// Combine vector and BM25 (default: true)
    #[serde(default = "default_true")]
    pub combine_vector_bm25: bool,

    /// Weight for vector search (default: 0.6)
    #[serde(default = "default_hybrid_vector_weight")]
    pub vector_weight: f32,

    /// Weight for BM25 search (default: 0.4)
    #[serde(default = "default_hybrid_bm25_weight")]
    pub bm25_weight: f32,

    /// Reciprocal-rank-fusion constant (typically 60; default: 60)
    #[serde(default = "default_rrf_constant")]
    pub rrf_constant: usize,
}
1065
/// Hybrid graph construction configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridGraphConfig {
    /// Primary relation scorer (embedding-based; default: "embedding_similarity")
    #[serde(default = "default_semantic_relation_scorer")]
    pub primary_scorer: String,

    /// Fallback relation scorer (token-based; default: "jaccard")
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub fallback_scorer: String,

    /// Combine scores from both scorers (default: true)
    #[serde(default = "default_true")]
    pub combine_scores: bool,
}
1081
/// Top-level entity extraction configuration (gleaning settings)
///
/// This section also carries a manual `Default` impl so that a completely
/// absent `[entity_extraction]` table yields the same values as an empty one.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionTopLevelConfig {
    /// Enable entity extraction (default: true)
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Minimum confidence threshold (default: 0.8)
    #[serde(default = "default_confidence_threshold")]
    pub min_confidence: f32,

    /// Use LLM-based gleaning (default: false)
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds (default: 3)
    #[serde(default = "default_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Gleaning improvement threshold (default: 0.1)
    #[serde(default = "default_gleaning_improvement")]
    pub gleaning_improvement_threshold: f32,

    /// Enable semantic merging (default: false)
    #[serde(default)]
    pub semantic_merging: bool,

    /// Merge similarity threshold (default: 0.85)
    #[serde(default = "default_merge_threshold")]
    pub merge_similarity_threshold: f32,

    /// Enable automatic linking (default: false)
    #[serde(default)]
    pub automatic_linking: bool,

    /// Linking confidence threshold (shares the 0.8 default with
    /// `min_confidence`)
    #[serde(default = "default_confidence_threshold")]
    pub linking_confidence_threshold: f32,
}
1121
1122impl Default for EntityExtractionTopLevelConfig {
1123    fn default() -> Self {
1124        Self {
1125            enabled: true,
1126            min_confidence: default_confidence_threshold(),
1127            use_gleaning: false,
1128            max_gleaning_rounds: default_gleaning_rounds(),
1129            gleaning_improvement_threshold: default_gleaning_improvement(),
1130            semantic_merging: false,
1131            merge_similarity_threshold: default_merge_threshold(),
1132            automatic_linking: false,
1133            linking_confidence_threshold: default_confidence_threshold(),
1134        }
1135    }
1136}
1137
// Default value functions: single source of truth for the
// `#[serde(default = "...")]` attributes and the manual `Default` impls below.

/// Default log verbosity.
fn default_log_level() -> String { String::from("info") }
/// Default output directory.
fn default_output_dir() -> String { String::from("./output") }
/// Shared default for flags that are on unless explicitly disabled.
fn default_true() -> bool { true }
/// Default workflow sequence executed by the pipeline.
fn default_workflows() -> Vec<String> {
    ["extract_text", "extract_entities", "build_graph", "detect_communities"]
        .into_iter()
        .map(String::from)
        .collect()
}

// --- Text chunking ---
fn default_chunk_size() -> usize { 512 }
fn default_chunk_overlap() -> usize { 64 }
fn default_min_chunk_size() -> usize { 50 }

// --- Entity extraction ---
// NOTE(review): DialoGPT is a conversational model, not a NER model — confirm
// this default is intentional.
fn default_ner_model() -> String { String::from("microsoft/DialoGPT-medium") }
fn default_temperature() -> f32 { 0.1 }
fn default_extraction_temperature() -> f32 { 0.0 }
fn default_max_tokens() -> usize { 2048 }
fn default_min_entity_length() -> usize { 3 }
fn default_max_entity_length() -> usize { 100 }
fn default_confidence_threshold() -> f32 { 0.8 }

// --- Graph building ---
fn default_relation_scorer() -> String { String::from("cosine_similarity") }
fn default_min_relation_score() -> f32 { 0.7 }
fn default_max_connections() -> usize { 10 }

// --- Community detection ---
fn default_community_algorithm() -> String { String::from("leiden") }
fn default_resolution() -> f32 { 1.0 }
fn default_min_community_size() -> usize { 3 }

// --- Storage ---
fn default_database_type() -> String { String::from("sqlite") }
fn default_database_path() -> String { String::from("./graphrag.db") }
fn default_pool_size() -> usize { 10 }

// --- Models ---
fn default_primary_llm() -> String { String::from("gpt-4") }
fn default_embedding_model() -> String { String::from("text-embedding-ada-002") }
fn default_max_context() -> usize { 4096 }
fn default_top_p() -> f32 { 0.9 }

// --- Ollama endpoints and models ---
fn default_ollama_url() -> String { String::from("http://localhost:11434") }
fn default_ollama_model() -> String { String::from("llama2:7b") }
fn default_ollama_embedding() -> String { String::from("nomic-embed-text") }

// --- Performance ---
fn default_batch_size() -> usize { 100 }
fn default_worker_threads() -> usize { 4 }
fn default_memory_limit() -> usize { 1024 }

// --- Ollama connection details ---
fn default_ollama_host() -> String { String::from("http://localhost") }
fn default_ollama_port() -> u16 { 11434 }
fn default_chat_model() -> String { String::from("llama3.1:8b") }
fn default_embedding_model_ollama() -> String { String::from("nomic-embed-text") }
fn default_timeout() -> u64 { 60 }
fn default_max_retries() -> u32 { 3 }

// --- Gleaning ---
fn default_gleaning_rounds() -> usize { 3 }
fn default_gleaning_improvement() -> f32 { 0.1 }
fn default_merge_threshold() -> f32 { 0.85 }
1270
// =============================================================================
// Default functions for Pipeline Approach Configuration
// =============================================================================

// --- Mode ---
/// Pipeline approach used when `[mode]` does not specify one.
fn default_approach() -> String { String::from("semantic") }

// --- Semantic pipeline ---
fn default_semantic_embedding_backend() -> String { String::from("huggingface") }
fn default_semantic_embedding_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2")
}
/// 384 is the output dimension of all-MiniLM-L6-v2.
fn default_semantic_embedding_dim() -> usize { 384 }
fn default_similarity_metric() -> String { String::from("cosine") }
fn default_semantic_entity_method() -> String { String::from("llm") }
fn default_max_gleaning_rounds() -> usize { 3 }
fn default_semantic_temperature() -> f32 { 0.1 }
fn default_semantic_confidence() -> f32 { 0.7 }
fn default_semantic_retrieval_strategy() -> String { String::from("vector") }
fn default_hnsw_ef_construction() -> usize { 200 }
fn default_hnsw_m() -> usize { 16 }
fn default_top_k() -> usize { 10 }
fn default_semantic_similarity_threshold() -> f32 { 0.7 }
fn default_semantic_relation_scorer() -> String { String::from("embedding_similarity") }

// --- Algorithmic pipeline ---
fn default_algorithmic_embedding_backend() -> String { String::from("hash") }
fn default_algorithmic_embedding_dim() -> usize { 128 }
fn default_vocabulary_size() -> usize { 10000 }
fn default_min_term_frequency() -> usize { 2 }
fn default_max_document_frequency() -> f32 { 0.8 }
fn default_algorithmic_entity_method() -> String { String::from("pattern") }
fn default_algorithmic_confidence() -> f32 { 0.75 }
fn default_algorithmic_retrieval_strategy() -> String { String::from("bm25") }
fn default_bm25_k1() -> f32 { 1.5 }
fn default_bm25_b() -> f32 { 0.75 }
fn default_language() -> String { String::from("english") }
fn default_algorithmic_relation_scorer() -> String { String::from("jaccard") }
fn default_cooccurrence_window() -> usize { 10 }
fn default_algorithmic_min_relation_score() -> f32 { 0.6 }

// --- Hybrid pipeline ---
fn default_hybrid_semantic_weight() -> f32 { 0.6 }
fn default_hybrid_algorithmic_weight() -> f32 { 0.4 }
fn default_hybrid_llm_weight() -> f32 { 0.7 }
fn default_hybrid_pattern_weight() -> f32 { 0.3 }
fn default_hybrid_confidence_boost() -> f32 { 0.15 }
fn default_hybrid_retrieval_strategy() -> String { String::from("fusion") }
fn default_hybrid_vector_weight() -> f32 { 0.6 }
fn default_hybrid_bm25_weight() -> f32 { 0.4 }
/// Conventional reciprocal-rank-fusion constant (typically 60).
fn default_rrf_constant() -> usize { 60 }
fn default_hybrid_fallback_strategy() -> String { String::from("semantic_first") }

// --- Auto-save ---
/// Auto-save every 5 minutes.
fn default_auto_save_interval() -> u64 { 300 }
/// Keep 5 auto-save versions by default.
fn default_max_auto_save_versions() -> usize { 5 }

// --- LazyGraphRAG ---
/// Minimum 3 characters for concepts.
fn default_min_concept_length() -> usize { 3 }
/// Maximum 5 words per concept.
fn default_max_concept_words() -> usize { 5 }
/// Minimum 1 shared chunk for a relationship.
fn default_co_occurrence_threshold() -> usize { 1 }
/// Up to 3 query refinement iterations.
fn default_max_refinement_iterations() -> usize { 3 }

// --- E2GraphRAG ---
fn default_e2_entity_types() -> Vec<String> {
    ["PERSON", "ORGANIZATION", "LOCATION", "CONCEPT"]
        .into_iter()
        .map(String::from)
        .collect()
}
/// 60% minimum confidence for pattern-based extraction.
fn default_e2_min_confidence() -> f32 { 0.6 }
/// Entities must appear at least once.
fn default_min_entity_frequency() -> usize { 1 }
1435
impl Default for GeneralConfig {
    /// Defaults for `[general]`: "info" logging into "./output", no input
    /// document, no thread override, profiling off.
    fn default() -> Self {
        Self {
            log_level: default_log_level(),
            output_dir: default_output_dir(),
            input_document_path: None,
            max_threads: None,
            enable_profiling: false,
        }
    }
}

impl Default for PipelineConfig {
    /// Defaults for `[pipeline]`: the standard four-step workflow, run in
    /// parallel, with each sub-section using its own defaults.
    fn default() -> Self {
        Self {
            workflows: default_workflows(),
            parallel_execution: default_true(),
            text_extraction: TextExtractionConfig::default(),
            entity_extraction: EntityExtractionConfig::default(),
            graph_building: GraphBuildingConfig::default(),
            community_detection: CommunityDetectionConfig::default(),
        }
    }
}

impl Default for TextExtractionConfig {
    /// Defaults: 512-unit chunks with 64 overlap, minimum chunk size 50,
    /// control-character cleaning on, no extra cleaning rules.
    fn default() -> Self {
        Self {
            chunk_size: default_chunk_size(),
            chunk_overlap: default_chunk_overlap(),
            clean_control_chars: default_true(),
            min_chunk_size: default_min_chunk_size(),
            cleaning: None,
        }
    }
}

impl Default for EntityExtractionConfig {
    /// Defaults: NER model name from `default_ner_model`, temperature 0.1,
    /// 2048 max tokens, 0.8 confidence threshold, no type/prompt/filter
    /// overrides.
    fn default() -> Self {
        Self {
            model_name: default_ner_model(),
            temperature: default_temperature(),
            max_tokens: default_max_tokens(),
            entity_types: None,
            confidence_threshold: default_confidence_threshold(),
            custom_prompt: None,
            filters: None,
        }
    }
}

impl Default for GraphBuildingConfig {
    /// Defaults: cosine-similarity scorer, 0.7 minimum score, at most 10
    /// connections per node, bidirectional relations on.
    fn default() -> Self {
        Self {
            relation_scorer: default_relation_scorer(),
            min_relation_score: default_min_relation_score(),
            max_connections_per_node: default_max_connections(),
            bidirectional_relations: default_true(),
        }
    }
}

impl Default for CommunityDetectionConfig {
    /// Defaults: Leiden at resolution 1.0, minimum community size 3.
    fn default() -> Self {
        Self {
            algorithm: default_community_algorithm(),
            resolution: default_resolution(),
            min_community_size: default_min_community_size(),
            // NOTE(review): 0 presumably means "no upper size limit" —
            // confirm against whatever consumes this field.
            max_community_size: 0,
        }
    }
}

impl Default for StorageConfig {
    /// Defaults: SQLite at ./graphrag.db with WAL enabled; no
    /// PostgreSQL/Neo4j sections.
    fn default() -> Self {
        Self {
            database_type: default_database_type(),
            database_path: default_database_path(),
            enable_wal: default_true(),
            postgresql: None,
            neo4j: None,
        }
    }
}

impl Default for ModelsConfig {
    /// Defaults: gpt-4 as primary LLM, ada-002 embeddings, 4096 context.
    fn default() -> Self {
        Self {
            primary_llm: default_primary_llm(),
            embedding_model: default_embedding_model(),
            max_context_length: default_max_context(),
            llm_params: None,
            local: None,
        }
    }
}

impl Default for PerformanceConfig {
    /// Defaults: batch processing on (batches of 100), 4 worker threads,
    /// 1024 MB memory limit.
    fn default() -> Self {
        Self {
            batch_processing: default_true(),
            batch_size: default_batch_size(),
            worker_threads: default_worker_threads(),
            memory_limit_mb: default_memory_limit(),
        }
    }
}

impl Default for OllamaSetConfig {
    /// Defaults for `[ollama]`: enabled against localhost:11434, llama3.1:8b
    /// chat + nomic-embed-text embeddings, 60 s timeout, 3 retries.
    fn default() -> Self {
        Self {
            enabled: default_true(),
            host: default_ollama_host(),
            port: default_ollama_port(),
            chat_model: default_chat_model(),
            embedding_model: default_embedding_model_ollama(),
            timeout_seconds: default_timeout(),
            max_retries: default_max_retries(),
            fallback_to_hash: false,
            // Concrete values are supplied even though the fields are
            // Option-typed.
            max_tokens: Some(800),
            temperature: Some(0.3),
            keep_alive: None,
            num_ctx: None,
        }
    }
}
1562
// =============================================================================
// Default implementations for Pipeline Approach Configuration
// =============================================================================

impl Default for SemanticPipelineConfig {
    /// Semantic pipeline is on by default; every sub-section falls back to
    /// its own `Default`.
    fn default() -> Self {
        Self {
            enabled: true,
            embeddings: SemanticEmbeddingsConfig::default(),
            entity_extraction: SemanticEntityConfig::default(),
            retrieval: SemanticRetrievalConfig::default(),
            graph_construction: SemanticGraphConfig::default(),
        }
    }
}

impl Default for SemanticEmbeddingsConfig {
    /// Defaults: huggingface backend, all-MiniLM-L6-v2 (384 dims), GPU on,
    /// cosine similarity, batch size 100.
    fn default() -> Self {
        Self {
            backend: default_semantic_embedding_backend(),
            model: default_semantic_embedding_model(),
            dimension: default_semantic_embedding_dim(),
            use_gpu: default_true(),
            similarity_metric: default_similarity_metric(),
            batch_size: default_batch_size(),
        }
    }
}

impl Default for SemanticEntityConfig {
    /// Defaults: LLM extraction with gleaning (3 rounds) via the default chat
    /// model, temperature 0.1, confidence threshold 0.7.
    fn default() -> Self {
        Self {
            method: default_semantic_entity_method(),
            use_gleaning: default_true(),
            max_gleaning_rounds: default_max_gleaning_rounds(),
            model: default_chat_model(),
            temperature: default_semantic_temperature(),
            confidence_threshold: default_semantic_confidence(),
        }
    }
}

impl Default for SemanticRetrievalConfig {
    /// Defaults: vector retrieval over HNSW (ef_construction 200, M 16),
    /// top-10, similarity threshold 0.7.
    fn default() -> Self {
        Self {
            strategy: default_semantic_retrieval_strategy(),
            use_hnsw: default_true(),
            hnsw_ef_construction: default_hnsw_ef_construction(),
            hnsw_m: default_hnsw_m(),
            top_k: default_top_k(),
            similarity_threshold: default_semantic_similarity_threshold(),
        }
    }
}

impl Default for SemanticGraphConfig {
    /// Defaults: embedding-similarity scorer with transformer embeddings,
    /// minimum relation score 0.7.
    fn default() -> Self {
        Self {
            relation_scorer: default_semantic_relation_scorer(),
            use_transformer_embeddings: default_true(),
            min_relation_score: default_min_relation_score(),
        }
    }
}

impl Default for AlgorithmicEmbeddingsConfig {
    /// Defaults: 128-dim hash embeddings with TF-IDF, 10k vocabulary,
    /// min term frequency 2, max document frequency 0.8.
    fn default() -> Self {
        Self {
            backend: default_algorithmic_embedding_backend(),
            dimension: default_algorithmic_embedding_dim(),
            use_tfidf: default_true(),
            vocabulary_size: default_vocabulary_size(),
            min_term_frequency: default_min_term_frequency(),
            max_document_frequency: default_max_document_frequency(),
        }
    }
}

impl Default for AlgorithmicEntityConfig {
    /// Defaults: pattern extraction with NER rules (no POS tagging),
    /// minimum entity length 3, confidence threshold 0.75, no custom patterns.
    fn default() -> Self {
        Self {
            method: default_algorithmic_entity_method(),
            use_ner_rules: default_true(),
            use_pos_tagging: false,
            min_entity_length: default_min_entity_length(),
            confidence_threshold: default_algorithmic_confidence(),
            patterns: None,
        }
    }
}

impl Default for AlgorithmicRetrievalConfig {
    /// Defaults: BM25 with k1=1.5, b=0.75, English stemming, top-10.
    fn default() -> Self {
        Self {
            strategy: default_algorithmic_retrieval_strategy(),
            k1: default_bm25_k1(),
            b: default_bm25_b(),
            use_stemming: default_true(),
            language: default_language(),
            top_k: default_top_k(),
        }
    }
}

impl Default for AlgorithmicGraphConfig {
    /// Defaults: Jaccard scorer over a co-occurrence window of 10,
    /// minimum relation score 0.6.
    fn default() -> Self {
        Self {
            relation_scorer: default_algorithmic_relation_scorer(),
            use_cooccurrence: default_true(),
            window_size: default_cooccurrence_window(),
            min_relation_score: default_algorithmic_min_relation_score(),
        }
    }
}

impl Default for HybridPipelineConfig {
    /// Hybrid is opt-in (disabled by default), falling back semantic-first
    /// with cross-validation enabled.
    fn default() -> Self {
        Self {
            enabled: false,
            weights: HybridWeightsConfig::default(),
            embeddings: HybridEmbeddingsConfig::default(),
            entity_extraction: HybridEntityConfig::default(),
            retrieval: HybridRetrievalConfig::default(),
            graph_construction: HybridGraphConfig::default(),
            fallback_strategy: default_hybrid_fallback_strategy(),
            cross_validation: default_true(),
        }
    }
}

impl Default for HybridWeightsConfig {
    /// Defaults: 0.6 semantic / 0.4 algorithmic.
    fn default() -> Self {
        Self {
            semantic_weight: default_hybrid_semantic_weight(),
            algorithmic_weight: default_hybrid_algorithmic_weight(),
        }
    }
}

impl Default for HybridEmbeddingsConfig {
    /// Defaults: huggingface primary, hash fallback, score combination and
    /// auto-fallback both on.
    fn default() -> Self {
        Self {
            primary: default_semantic_embedding_backend(),
            fallback: default_algorithmic_embedding_backend(),
            combine_scores: default_true(),
            auto_fallback: default_true(),
        }
    }
}

impl Default for HybridEntityConfig {
    /// Defaults: run LLM (0.7) and pattern (0.3) extraction together with
    /// cross-validation and a 0.15 agreement boost.
    fn default() -> Self {
        Self {
            use_both: default_true(),
            llm_weight: default_hybrid_llm_weight(),
            pattern_weight: default_hybrid_pattern_weight(),
            cross_validate: default_true(),
            confidence_boost: default_hybrid_confidence_boost(),
        }
    }
}

impl Default for HybridRetrievalConfig {
    /// Defaults: RRF fusion (constant 60) of vector (0.6) and BM25 (0.4).
    fn default() -> Self {
        Self {
            strategy: default_hybrid_retrieval_strategy(),
            combine_vector_bm25: default_true(),
            vector_weight: default_hybrid_vector_weight(),
            bm25_weight: default_hybrid_bm25_weight(),
            rrf_constant: default_rrf_constant(),
        }
    }
}

impl Default for HybridGraphConfig {
    /// Defaults: embedding-similarity primary scorer, Jaccard fallback,
    /// score combination on.
    fn default() -> Self {
        Self {
            primary_scorer: default_semantic_relation_scorer(),
            fallback_scorer: default_algorithmic_relation_scorer(),
            combine_scores: default_true(),
        }
    }
}
1746
impl SetConfig {
    /// Load configuration from TOML or JSON5 file (auto-detects format by
    /// extension: `.json`/`.json5` → JSON5 parser, anything else → TOML).
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from reading the file; returns
    /// `GraphRAGError::Config` when parsing fails or when a JSON/JSON5 file is
    /// requested but the `json5-support` feature is not compiled in.
    //
    // NOTE(review): the extension match is case-sensitive — "config.JSON"
    // falls through to the TOML parser. Confirm whether that is intended.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path_ref = path.as_ref();
        let content = fs::read_to_string(path_ref)?;

        // Detect format by file extension
        let extension = path_ref.extension().and_then(|e| e.to_str()).unwrap_or("");

        let config: SetConfig = match extension {
            #[cfg(feature = "json5-support")]
            "json5" | "json" => {
                json5::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
                    message: format!("JSON5 parse error: {e}"),
                })?
            },
            #[cfg(not(feature = "json5-support"))]
            "json5" | "json" => {
                return Err(crate::core::GraphRAGError::Config {
                    message: "JSON5 support not enabled. Rebuild with --features json5-support"
                        .to_string(),
                });
            },
            // Unknown or missing extension: treat as TOML.
            _ => toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
                message: format!("TOML parse error: {e}"),
            })?,
        };

        Ok(config)
    }

    /// Save configuration to TOML file with comments
    ///
    /// Serializes `self` with `toml::to_string_pretty` and prepends a fixed
    /// banner comment before writing.
    ///
    /// # Errors
    ///
    /// Returns `GraphRAGError::Config` when serialization fails; propagates
    /// I/O errors from writing the file.
    pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let toml_string =
            toml::to_string_pretty(&self).map_err(|e| crate::core::GraphRAGError::Config {
                message: format!("TOML serialize error: {e}"),
            })?;

        // Add header comment
        let commented_toml = format!(
            "# =============================================================================\n\
             # GraphRAG Configuration File\n\
             # Complete configuration with extensive parameters for easy customization\n\
             # =============================================================================\n\n{toml_string}"
        );

        fs::write(path, commented_toml)?;
        Ok(())
    }

    /// Convert to the existing Config format for compatibility
    ///
    /// Flattens this TOML-oriented structure into `crate::Config`. Fields the
    /// legacy format does not model are filled with hard-coded values, noted
    /// inline below.
    pub fn to_graphrag_config(&self) -> crate::Config {
        let mut config = crate::Config {
            approach: self.mode.approach.clone(),
            ..Default::default()
        };

        // Map text processing
        config.text.chunk_size = self.pipeline.text_extraction.chunk_size;
        config.text.chunk_overlap = self.pipeline.text_extraction.chunk_overlap;

        // Map entity extraction based on approach
        // (this top-level value may be overridden per-approach below)
        config.entities.min_confidence = self.entity_extraction.min_confidence;

        // Map entity types from pipeline.entity_extraction
        if let Some(ref types) = self.pipeline.entity_extraction.entity_types {
            config.entities.entity_types = types.clone();
        }

        // Configure gleaning based on approach:
        // - semantic: use LLM-based gleaning
        // - algorithmic: use pattern-based extraction
        // - hybrid: use both (for compatibility, map to gleaning)
        match self.mode.approach.as_str() {
            "semantic" => {
                if let Some(ref semantic) = self.semantic {
                    config.entities.use_gleaning = semantic.entity_extraction.use_gleaning;
                    config.entities.max_gleaning_rounds =
                        semantic.entity_extraction.max_gleaning_rounds;
                    config.entities.min_confidence =
                        semantic.entity_extraction.confidence_threshold;
                } else {
                    // No semantic sub-section: use top-level entity_extraction settings directly
                    config.entities.use_gleaning = self.entity_extraction.use_gleaning;
                    config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
                    config.entities.min_confidence = self.entity_extraction.min_confidence;
                }
            },
            "algorithmic" => {
                // Algorithmic uses pattern-based extraction, no gleaning
                config.entities.use_gleaning = false;
                if let Some(ref algorithmic) = self.algorithmic {
                    config.entities.min_confidence =
                        algorithmic.entity_extraction.confidence_threshold;
                }
            },
            "hybrid" => {
                // Hybrid can use both, enable gleaning for LLM component
                config.entities.use_gleaning = true;
                if self.hybrid.is_some() {
                    // Use hybrid configuration if available
                    config.entities.max_gleaning_rounds = 2; // Reduced for hybrid efficiency
                }
            },
            _ => {
                // Unknown approach, use top-level config as fallback
                config.entities.use_gleaning = self.entity_extraction.use_gleaning;
                config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
            },
        }

        // Map graph building
        config.graph.similarity_threshold = self.pipeline.graph_building.min_relation_score;
        config.graph.max_connections = self.pipeline.graph_building.max_connections_per_node;
        config.graph.extract_relationships = true; // Enable by default for TOML configs
        config.graph.relationship_confidence_threshold = 0.5; // Default threshold
        // NOTE(review): the two graph values above are hard-coded rather than
        // sourced from this struct — confirm they should not be configurable.

        // Map retrieval
        config.retrieval.top_k = 10; // Default (not read from [pipeline]/[hybrid])

        // Map embeddings
        config.embeddings.dimension = 768; // Default for nomic-embed-text
        config.embeddings.backend = "ollama".to_string();
        config.embeddings.fallback_to_hash = self.ollama.fallback_to_hash;

        // Map parallel processing
        config.parallel.enabled = self.pipeline.parallel_execution;
        config.parallel.num_threads = self.performance.worker_threads;

        // Map Ollama configuration
        config.ollama = crate::ollama::OllamaConfig {
            enabled: self.ollama.enabled,
            host: self.ollama.host.clone(),
            port: self.ollama.port,
            chat_model: self.ollama.chat_model.clone(),
            embedding_model: self.ollama.embedding_model.clone(),
            timeout_seconds: self.ollama.timeout_seconds,
            max_retries: self.ollama.max_retries,
            fallback_to_hash: self.ollama.fallback_to_hash,
            max_tokens: self.ollama.max_tokens,
            temperature: self.ollama.temperature,
            enable_caching: true, // always on when converting from SetConfig
            keep_alive: self.ollama.keep_alive.clone(),
            num_ctx: self.ollama.num_ctx,
        };

        // Map GLiNER configuration
        config.gliner = crate::config::GlinerConfig {
            enabled:            self.gliner.enabled,
            model_path:         self.gliner.model_path.clone(),
            tokenizer_path:     self.gliner.tokenizer_path.clone(),
            mode:               self.gliner.mode.clone(),
            entity_labels:      self.gliner.entity_labels.clone(),
            relation_labels:    self.gliner.relation_labels.clone(),
            entity_threshold:   self.gliner.entity_threshold,
            relation_threshold: self.gliner.relation_threshold,
            use_gpu:            self.gliner.use_gpu,
        };

        // Map auto-save configuration
        config.auto_save = crate::config::AutoSaveConfig {
            enabled: self.auto_save.enabled,
            base_dir: self.auto_save.base_dir.clone(),
            interval_seconds: self.auto_save.interval_seconds,
            workspace_name: self.auto_save.workspace_name.clone(),
            max_versions: self.auto_save.max_versions,
        };

        config
    }
}
1917}