Skip to main content

graphrag_core/config/
setconfig.rs

1//! TOML Configuration System for GraphRAG
2//! Complete configuration management with extensive TOML support
3
4use crate::Result;
5use serde::{Deserialize, Serialize};
6use std::fs;
7use std::path::Path;
8
9/// Complete GraphRAG configuration loaded from TOML
10#[derive(Debug, Clone, Serialize, Deserialize, Default)]
11pub struct SetConfig {
12    /// Pipeline mode/approach configuration
13    #[serde(default)]
14    pub mode: ModeConfig,
15
16    /// Semantic/Neural pipeline configuration
17    #[serde(default)]
18    pub semantic: Option<SemanticPipelineConfig>,
19
20    /// Algorithmic/Classic NLP pipeline configuration
21    #[serde(default)]
22    pub algorithmic: Option<AlgorithmicPipelineConfig>,
23
24    /// Hybrid pipeline configuration
25    #[serde(default)]
26    pub hybrid: Option<HybridPipelineConfig>,
27
28    /// General system settings
29    #[serde(default)]
30    pub general: GeneralConfig,
31
32    /// Pipeline configuration
33    #[serde(default)]
34    pub pipeline: PipelineConfig,
35
36    /// Storage configuration
37    #[serde(default)]
38    pub storage: StorageConfig,
39
40    /// Model configuration
41    #[serde(default)]
42    pub models: ModelsConfig,
43
44    /// Performance tuning
45    #[serde(default)]
46    pub performance: PerformanceConfig,
47
48    /// Ollama-specific configuration
49    #[serde(default)]
50    pub ollama: OllamaSetConfig,
51
52    /// Experimental features
53    #[serde(default)]
54    pub experimental: ExperimentalConfig,
55
56    /// Top-level entity extraction configuration (for gleaning)
57    #[serde(default)]
58    pub entity_extraction: EntityExtractionTopLevelConfig,
59
60    /// Auto-save configuration for workspace persistence
61    #[serde(default)]
62    pub auto_save: AutoSaveSetConfig,
63}
64
65/// Auto-save configuration
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct AutoSaveSetConfig {
68    /// Enable auto-save functionality
69    #[serde(default)]
70    pub enabled: bool,
71
72    /// Auto-save interval in seconds (0 = save after every graph build)
73    #[serde(default = "default_auto_save_interval")]
74    pub interval_seconds: u64,
75
76    /// Workspace name for auto-saves (if None, uses "autosave")
77    #[serde(default)]
78    pub workspace_name: Option<String>,
79
80    /// Maximum number of auto-save versions to keep (0 = unlimited)
81    #[serde(default = "default_max_auto_save_versions")]
82    pub max_versions: usize,
83}
84
85impl Default for AutoSaveSetConfig {
86    fn default() -> Self {
87        Self {
88            enabled: false,
89            interval_seconds: default_auto_save_interval(),
90            workspace_name: None,
91            max_versions: default_max_auto_save_versions(),
92        }
93    }
94}
95
96/// General system configuration settings
97#[derive(Debug, Clone, Serialize, Deserialize)]
98pub struct GeneralConfig {
99    /// Logging level (error, warn, info, debug, trace)
100    #[serde(default = "default_log_level")]
101    pub log_level: String,
102
103    /// Output directory for results
104    #[serde(default = "default_output_dir")]
105    pub output_dir: String,
106
107    /// Path to the input document to process
108    #[serde(default)]
109    pub input_document_path: Option<String>,
110
111    /// Maximum threads (0 = auto-detect)
112    #[serde(default)]
113    pub max_threads: Option<usize>,
114
115    /// Enable performance profiling
116    #[serde(default)]
117    pub enable_profiling: bool,
118}
119
120/// Pipeline execution configuration
121#[derive(Debug, Clone, Serialize, Deserialize)]
122pub struct PipelineConfig {
123    /// Workflows to execute in sequence
124    #[serde(default = "default_workflows")]
125    pub workflows: Vec<String>,
126
127    /// Enable parallel execution
128    #[serde(default = "default_true")]
129    pub parallel_execution: bool,
130
131    /// Text extraction configuration
132    #[serde(default)]
133    pub text_extraction: TextExtractionConfig,
134
135    /// Entity extraction configuration
136    #[serde(default)]
137    pub entity_extraction: EntityExtractionConfig,
138
139    /// Graph building configuration
140    #[serde(default)]
141    pub graph_building: GraphBuildingConfig,
142
143    /// Community detection configuration
144    #[serde(default)]
145    pub community_detection: CommunityDetectionConfig,
146}
147
148/// Text extraction and chunking configuration
149#[derive(Debug, Clone, Serialize, Deserialize)]
150pub struct TextExtractionConfig {
151    /// Chunk size for text splitting
152    #[serde(default = "default_chunk_size")]
153    pub chunk_size: usize,
154
155    /// Overlap between chunks
156    #[serde(default = "default_chunk_overlap")]
157    pub chunk_overlap: usize,
158
159    /// Clean control characters
160    #[serde(default = "default_true")]
161    pub clean_control_chars: bool,
162
163    /// Minimum chunk size to keep
164    #[serde(default = "default_min_chunk_size")]
165    pub min_chunk_size: usize,
166
167    /// Text cleaning options
168    #[serde(default)]
169    pub cleaning: Option<CleaningConfig>,
170}
171
172/// Text cleaning options configuration
173#[derive(Debug, Clone, Serialize, Deserialize)]
174pub struct CleaningConfig {
175    /// Remove URLs from text
176    #[serde(default)]
177    pub remove_urls: bool,
178
179    /// Remove email addresses
180    #[serde(default)]
181    pub remove_emails: bool,
182
183    /// Normalize whitespace
184    #[serde(default = "default_true")]
185    pub normalize_whitespace: bool,
186
187    /// Remove special characters
188    #[serde(default)]
189    pub remove_special_chars: bool,
190}
191
192/// Entity extraction configuration
193#[derive(Debug, Clone, Serialize, Deserialize)]
194pub struct EntityExtractionConfig {
195    /// Model name for NER
196    #[serde(default = "default_ner_model")]
197    pub model_name: String,
198
199    /// Temperature for LLM generation
200    #[serde(default = "default_temperature")]
201    pub temperature: f32,
202
203    /// Maximum tokens for extraction
204    #[serde(default = "default_max_tokens")]
205    pub max_tokens: usize,
206
207    /// Entity types to extract (dynamic configuration)
208    pub entity_types: Option<Vec<String>>,
209
210    /// Confidence threshold for entity extraction (top-level)
211    #[serde(default = "default_confidence_threshold")]
212    pub confidence_threshold: f32,
213
214    /// Custom extraction prompt
215    pub custom_prompt: Option<String>,
216
217    /// Entity filtering options
218    #[serde(default)]
219    pub filters: Option<EntityFiltersConfig>,
220}
221
222/// Entity filtering configuration
223#[derive(Debug, Clone, Serialize, Deserialize)]
224pub struct EntityFiltersConfig {
225    /// Minimum entity length
226    #[serde(default = "default_min_entity_length")]
227    pub min_entity_length: usize,
228
229    /// Maximum entity length
230    #[serde(default = "default_max_entity_length")]
231    pub max_entity_length: usize,
232
233    /// Allowed entity types
234    pub allowed_entity_types: Option<Vec<String>>,
235
236    /// Confidence threshold
237    #[serde(default = "default_confidence_threshold")]
238    pub confidence_threshold: f32,
239
240    /// Allowed regex patterns for entity matching
241    pub allowed_patterns: Option<Vec<String>>,
242
243    /// Excluded regex patterns for entity filtering
244    pub excluded_patterns: Option<Vec<String>>,
245
246    /// Enable fuzzy matching for entity resolution
247    #[serde(default)]
248    pub enable_fuzzy_matching: bool,
249}
250
251/// Graph building configuration
252#[derive(Debug, Clone, Serialize, Deserialize)]
253pub struct GraphBuildingConfig {
254    /// Relation scoring algorithm
255    #[serde(default = "default_relation_scorer")]
256    pub relation_scorer: String,
257
258    /// Minimum relation score threshold
259    #[serde(default = "default_min_relation_score")]
260    pub min_relation_score: f32,
261
262    /// Maximum connections per node
263    #[serde(default = "default_max_connections")]
264    pub max_connections_per_node: usize,
265
266    /// Use bidirectional relationships
267    #[serde(default = "default_true")]
268    pub bidirectional_relations: bool,
269}
270
271/// Community detection configuration
272#[derive(Debug, Clone, Serialize, Deserialize)]
273pub struct CommunityDetectionConfig {
274    /// Algorithm for community detection
275    #[serde(default = "default_community_algorithm")]
276    pub algorithm: String,
277
278    /// Resolution parameter
279    #[serde(default = "default_resolution")]
280    pub resolution: f32,
281
282    /// Minimum community size
283    #[serde(default = "default_min_community_size")]
284    pub min_community_size: usize,
285
286    /// Maximum community size (0 = unlimited)
287    #[serde(default)]
288    pub max_community_size: usize,
289}
290
291/// Storage backend configuration
292#[derive(Debug, Clone, Serialize, Deserialize)]
293pub struct StorageConfig {
294    /// Database type
295    #[serde(default = "default_database_type")]
296    pub database_type: String,
297
298    /// Database path for SQLite
299    #[serde(default = "default_database_path")]
300    pub database_path: String,
301
302    /// Enable WAL for SQLite
303    #[serde(default = "default_true")]
304    pub enable_wal: bool,
305
306    /// PostgreSQL configuration
307    pub postgresql: Option<PostgreSQLConfig>,
308
309    /// Neo4j configuration
310    pub neo4j: Option<Neo4jConfig>,
311}
312
313/// PostgreSQL database configuration
314#[derive(Debug, Clone, Serialize, Deserialize)]
315pub struct PostgreSQLConfig {
316    /// PostgreSQL server host
317    pub host: String,
318    /// PostgreSQL server port
319    pub port: u16,
320    /// Database name
321    pub database: String,
322    /// Username for authentication
323    pub username: String,
324    /// Password for authentication
325    pub password: String,
326    /// Connection pool size
327    #[serde(default = "default_pool_size")]
328    pub pool_size: usize,
329}
330
331/// Neo4j graph database configuration
332#[derive(Debug, Clone, Serialize, Deserialize)]
333pub struct Neo4jConfig {
334    /// Neo4j server URI
335    pub uri: String,
336    /// Username for authentication
337    pub username: String,
338    /// Password for authentication
339    pub password: String,
340    /// Enable encrypted connections
341    #[serde(default)]
342    pub encrypted: bool,
343}
344
345/// Model configuration for LLM and embeddings
346#[derive(Debug, Clone, Serialize, Deserialize)]
347pub struct ModelsConfig {
348    /// Primary LLM for generation
349    #[serde(default = "default_primary_llm")]
350    pub primary_llm: String,
351
352    /// Embedding model
353    #[serde(default = "default_embedding_model")]
354    pub embedding_model: String,
355
356    /// Maximum context length
357    #[serde(default = "default_max_context")]
358    pub max_context_length: usize,
359
360    /// LLM parameters
361    #[serde(default)]
362    pub llm_params: Option<LLMParamsConfig>,
363
364    /// Local model configuration
365    #[serde(default)]
366    pub local: Option<LocalModelsConfig>,
367}
368
369/// LLM generation parameters
370#[derive(Debug, Clone, Serialize, Deserialize)]
371pub struct LLMParamsConfig {
372    /// Sampling temperature (0.0-2.0)
373    #[serde(default = "default_temperature")]
374    pub temperature: f32,
375
376    /// Nucleus sampling parameter (0.0-1.0)
377    #[serde(default = "default_top_p")]
378    pub top_p: f32,
379
380    /// Frequency penalty (-2.0-2.0)
381    #[serde(default)]
382    pub frequency_penalty: f32,
383
384    /// Presence penalty (-2.0-2.0)
385    #[serde(default)]
386    pub presence_penalty: f32,
387
388    /// Sequences where the model will stop generating
389    pub stop_sequences: Option<Vec<String>>,
390}
391
392/// Local model configuration (Ollama)
393#[derive(Debug, Clone, Serialize, Deserialize)]
394pub struct LocalModelsConfig {
395    /// Ollama API base URL
396    #[serde(default = "default_ollama_url")]
397    pub ollama_base_url: String,
398
399    /// Local model name for generation
400    #[serde(default = "default_ollama_model")]
401    pub model_name: String,
402
403    /// Local embedding model name
404    #[serde(default = "default_ollama_embedding")]
405    pub embedding_model: String,
406}
407
408/// Performance tuning configuration
409#[derive(Debug, Clone, Serialize, Deserialize)]
410pub struct PerformanceConfig {
411    /// Enable batch processing
412    #[serde(default = "default_true")]
413    pub batch_processing: bool,
414
415    /// Batch size
416    #[serde(default = "default_batch_size")]
417    pub batch_size: usize,
418
419    /// Worker threads
420    #[serde(default = "default_worker_threads")]
421    pub worker_threads: usize,
422
423    /// Memory limit per worker (MB)
424    #[serde(default = "default_memory_limit")]
425    pub memory_limit_mb: usize,
426}
427
428/// Ollama-specific configuration for local LLM and embeddings
429#[derive(Debug, Clone, Serialize, Deserialize)]
430pub struct OllamaSetConfig {
431    /// Enable Ollama integration
432    #[serde(default = "default_true")]
433    pub enabled: bool,
434
435    /// Ollama host
436    #[serde(default = "default_ollama_host")]
437    pub host: String,
438
439    /// Ollama port
440    #[serde(default = "default_ollama_port")]
441    pub port: u16,
442
443    /// Chat model name
444    #[serde(default = "default_chat_model")]
445    pub chat_model: String,
446
447    /// Embedding model name
448    #[serde(default = "default_embedding_model_ollama")]
449    pub embedding_model: String,
450
451    /// Timeout in seconds
452    #[serde(default = "default_timeout")]
453    pub timeout_seconds: u64,
454
455    /// Maximum retries
456    #[serde(default = "default_max_retries")]
457    pub max_retries: u32,
458
459    /// Fallback to hash-based embeddings
460    #[serde(default)]
461    pub fallback_to_hash: bool,
462
463    /// Maximum tokens
464    pub max_tokens: Option<u32>,
465
466    /// Temperature
467    pub temperature: Option<f32>,
468}
469
470/// Experimental features configuration
471#[derive(Debug, Clone, Serialize, Deserialize, Default)]
472pub struct ExperimentalConfig {
473    /// Enable neural reranking
474    #[serde(default)]
475    pub neural_reranking: bool,
476
477    /// Enable federated learning
478    #[serde(default)]
479    pub federated_learning: bool,
480
481    /// Enable real-time updates
482    #[serde(default)]
483    pub real_time_updates: bool,
484
485    /// Enable distributed processing
486    #[serde(default)]
487    pub distributed_processing: bool,
488
489    /// Enable LazyGraphRAG mode (no prior summarization, 0.1% indexing cost)
490    #[serde(default)]
491    pub lazy_graphrag: bool,
492
493    /// Enable E2GraphRAG mode (efficient entity extraction without LLM)
494    #[serde(default)]
495    pub e2_graphrag: bool,
496
497    /// LazyGraphRAG configuration
498    #[serde(default)]
499    pub lazy_graphrag_config: Option<LazyGraphRAGConfig>,
500
501    /// E2GraphRAG configuration
502    #[serde(default)]
503    pub e2_graphrag_config: Option<E2GraphRAGConfig>,
504}
505
506/// LazyGraphRAG configuration
507/// Concept-based retrieval without prior summarization (Microsoft Research, 2025)
508/// Achieves 0.1% of full GraphRAG indexing cost and 700x cheaper query costs
509#[derive(Debug, Clone, Serialize, Deserialize)]
510pub struct LazyGraphRAGConfig {
511    /// Enable concept extraction (noun phrases without LLM)
512    #[serde(default = "default_true")]
513    pub use_concept_extraction: bool,
514
515    /// Minimum concept length in characters
516    #[serde(default = "default_min_concept_length")]
517    pub min_concept_length: usize,
518
519    /// Maximum concept length in words
520    #[serde(default = "default_max_concept_words")]
521    pub max_concept_words: usize,
522
523    /// Co-occurrence threshold (minimum shared chunks for relationship)
524    #[serde(default = "default_co_occurrence_threshold")]
525    pub co_occurrence_threshold: usize,
526
527    /// Enable query refinement with iterative deepening
528    #[serde(default = "default_true")]
529    pub use_query_refinement: bool,
530
531    /// Maximum refinement iterations
532    #[serde(default = "default_max_refinement_iterations")]
533    pub max_refinement_iterations: usize,
534
535    /// Enable bidirectional entity-chunk indexing for fast lookups
536    #[serde(default = "default_true")]
537    pub use_bidirectional_index: bool,
538}
539
540impl Default for LazyGraphRAGConfig {
541    fn default() -> Self {
542        Self {
543            use_concept_extraction: true,
544            min_concept_length: 3,
545            max_concept_words: 5,
546            co_occurrence_threshold: 1,
547            use_query_refinement: true,
548            max_refinement_iterations: 3,
549            use_bidirectional_index: true,
550        }
551    }
552}
553
554/// E2GraphRAG configuration
555/// Efficient entity extraction using SpaCy-like approach without LLM
556/// Achieves 10x faster indexing and 100x faster retrieval
557#[derive(Debug, Clone, Serialize, Deserialize)]
558pub struct E2GraphRAGConfig {
559    /// Enable lightweight NER (no LLM required)
560    #[serde(default = "default_true")]
561    pub use_lightweight_ner: bool,
562
563    /// Entity types to extract (using pattern matching)
564    #[serde(default = "default_e2_entity_types")]
565    pub entity_types: Vec<String>,
566
567    /// Minimum entity confidence for pattern-based extraction
568    #[serde(default = "default_e2_min_confidence")]
569    pub min_confidence: f32,
570
571    /// Enable capitalization-based named entity detection
572    #[serde(default = "default_true")]
573    pub use_capitalization_detection: bool,
574
575    /// Enable noun phrase extraction
576    #[serde(default = "default_true")]
577    pub use_noun_phrase_extraction: bool,
578
579    /// Minimum entity frequency (entities must appear at least N times)
580    #[serde(default = "default_min_entity_frequency")]
581    pub min_entity_frequency: usize,
582
583    /// Use fast co-occurrence for relationships (no LLM)
584    #[serde(default = "default_true")]
585    pub use_fast_cooccurrence: bool,
586
587    /// Enable bidirectional entity-chunk indexing
588    #[serde(default = "default_true")]
589    pub use_bidirectional_index: bool,
590}
591
592impl Default for E2GraphRAGConfig {
593    fn default() -> Self {
594        Self {
595            use_lightweight_ner: true,
596            entity_types: default_e2_entity_types(),
597            min_confidence: 0.6,
598            use_capitalization_detection: true,
599            use_noun_phrase_extraction: true,
600            min_entity_frequency: 1,
601            use_fast_cooccurrence: true,
602            use_bidirectional_index: true,
603        }
604    }
605}
606
607// =============================================================================
608// PIPELINE APPROACH CONFIGURATION (Semantic vs Algorithmic vs Hybrid)
609// =============================================================================
610
611/// Pipeline mode/approach configuration
612/// Determines which pipeline implementation to use
613#[derive(Debug, Clone, Serialize, Deserialize)]
614pub struct ModeConfig {
615    /// Pipeline approach: "semantic", "algorithmic", or "hybrid"
616    /// - semantic: Neural embeddings + LLM extraction + vector search
617    /// - algorithmic: Pattern matching + TF-IDF + BM25 keyword search
618    /// - hybrid: Combines both with weighted fusion
619    #[serde(default = "default_approach")]
620    pub approach: String,
621}
622
623impl Default for ModeConfig {
624    fn default() -> Self {
625        Self {
626            approach: default_approach(),
627        }
628    }
629}
630
631/// Semantic/Neural pipeline configuration
632/// Uses deep learning models for embeddings, entity extraction, and retrieval
633#[derive(Debug, Clone, Serialize, Deserialize)]
634pub struct SemanticPipelineConfig {
635    /// Enable semantic pipeline
636    #[serde(default)]
637    pub enabled: bool,
638
639    /// Embeddings configuration for semantic approach
640    pub embeddings: SemanticEmbeddingsConfig,
641
642    /// Entity extraction configuration for semantic approach
643    pub entity_extraction: SemanticEntityConfig,
644
645    /// Retrieval configuration for semantic approach
646    pub retrieval: SemanticRetrievalConfig,
647
648    /// Graph construction configuration for semantic approach
649    pub graph_construction: SemanticGraphConfig,
650}
651
652/// Semantic embeddings configuration (neural models)
653#[derive(Debug, Clone, Serialize, Deserialize)]
654pub struct SemanticEmbeddingsConfig {
655    /// Backend: "huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "ollama"
656    #[serde(default = "default_semantic_embedding_backend")]
657    pub backend: String,
658
659    /// Model identifier (provider-specific)
660    #[serde(default = "default_semantic_embedding_model")]
661    pub model: String,
662
663    /// Embedding dimension
664    #[serde(default = "default_semantic_embedding_dim")]
665    pub dimension: usize,
666
667    /// Use GPU acceleration if available
668    #[serde(default = "default_true")]
669    pub use_gpu: bool,
670
671    /// Similarity metric (cosine, euclidean, dot_product)
672    #[serde(default = "default_similarity_metric")]
673    pub similarity_metric: String,
674
675    /// Batch size for embeddings generation
676    #[serde(default = "default_batch_size")]
677    pub batch_size: usize,
678}
679
680/// Semantic entity extraction configuration (LLM-based)
681#[derive(Debug, Clone, Serialize, Deserialize)]
682pub struct SemanticEntityConfig {
683    /// Extraction method (always "llm" for semantic)
684    #[serde(default = "default_semantic_entity_method")]
685    pub method: String,
686
687    /// Enable gleaning (iterative refinement)
688    #[serde(default = "default_true")]
689    pub use_gleaning: bool,
690
691    /// Maximum gleaning rounds
692    #[serde(default = "default_max_gleaning_rounds")]
693    pub max_gleaning_rounds: usize,
694
695    /// LLM model for extraction
696    #[serde(default = "default_chat_model")]
697    pub model: String,
698
699    /// Temperature for LLM
700    #[serde(default = "default_semantic_temperature")]
701    pub temperature: f32,
702
703    /// Confidence threshold
704    #[serde(default = "default_semantic_confidence")]
705    pub confidence_threshold: f32,
706}
707
708/// Semantic retrieval configuration (vector search)
709#[derive(Debug, Clone, Serialize, Deserialize)]
710pub struct SemanticRetrievalConfig {
711    /// Retrieval strategy (always "vector" for semantic)
712    #[serde(default = "default_semantic_retrieval_strategy")]
713    pub strategy: String,
714
715    /// Use HNSW index for fast approximate search
716    #[serde(default = "default_true")]
717    pub use_hnsw: bool,
718
719    /// HNSW ef_construction parameter
720    #[serde(default = "default_hnsw_ef_construction")]
721    pub hnsw_ef_construction: usize,
722
723    /// HNSW M parameter (connections per node)
724    #[serde(default = "default_hnsw_m")]
725    pub hnsw_m: usize,
726
727    /// Top-k results
728    #[serde(default = "default_top_k")]
729    pub top_k: usize,
730
731    /// Similarity threshold
732    #[serde(default = "default_semantic_similarity_threshold")]
733    pub similarity_threshold: f32,
734}
735
736/// Semantic graph construction configuration (embedding-based)
737#[derive(Debug, Clone, Serialize, Deserialize)]
738pub struct SemanticGraphConfig {
739    /// Relation scorer (always "embedding_similarity" for semantic)
740    #[serde(default = "default_semantic_relation_scorer")]
741    pub relation_scorer: String,
742
743    /// Use transformer embeddings for relationships
744    #[serde(default = "default_true")]
745    pub use_transformer_embeddings: bool,
746
747    /// Minimum relation score
748    #[serde(default = "default_min_relation_score")]
749    pub min_relation_score: f32,
750}
751
752/// Algorithmic/Classic NLP pipeline configuration
753/// Uses pattern matching, TF-IDF, and keyword-based methods
754#[derive(Debug, Clone, Serialize, Deserialize)]
755pub struct AlgorithmicPipelineConfig {
756    /// Enable algorithmic pipeline
757    #[serde(default)]
758    pub enabled: bool,
759
760    /// Embeddings configuration for algorithmic approach
761    pub embeddings: AlgorithmicEmbeddingsConfig,
762
763    /// Entity extraction configuration for algorithmic approach
764    pub entity_extraction: AlgorithmicEntityConfig,
765
766    /// Retrieval configuration for algorithmic approach
767    pub retrieval: AlgorithmicRetrievalConfig,
768
769    /// Graph construction configuration for algorithmic approach
770    pub graph_construction: AlgorithmicGraphConfig,
771}
772
773/// Algorithmic embeddings configuration (hash-based, TF-IDF)
774#[derive(Debug, Clone, Serialize, Deserialize)]
775pub struct AlgorithmicEmbeddingsConfig {
776    /// Backend (always "hash" for algorithmic)
777    #[serde(default = "default_algorithmic_embedding_backend")]
778    pub backend: String,
779
780    /// Embedding dimension
781    #[serde(default = "default_algorithmic_embedding_dim")]
782    pub dimension: usize,
783
784    /// Use TF-IDF weighting
785    #[serde(default = "default_true")]
786    pub use_tfidf: bool,
787
788    /// Vocabulary size
789    #[serde(default = "default_vocabulary_size")]
790    pub vocabulary_size: usize,
791
792    /// Minimum term frequency
793    #[serde(default = "default_min_term_frequency")]
794    pub min_term_frequency: usize,
795
796    /// Maximum document frequency (0.0-1.0)
797    #[serde(default = "default_max_document_frequency")]
798    pub max_document_frequency: f32,
799}
800
801/// Algorithmic entity extraction configuration (pattern-based)
802#[derive(Debug, Clone, Serialize, Deserialize)]
803pub struct AlgorithmicEntityConfig {
804    /// Extraction method (always "pattern" for algorithmic)
805    #[serde(default = "default_algorithmic_entity_method")]
806    pub method: String,
807
808    /// Use NER rules
809    #[serde(default = "default_true")]
810    pub use_ner_rules: bool,
811
812    /// Use POS tagging
813    #[serde(default)]
814    pub use_pos_tagging: bool,
815
816    /// Minimum entity length
817    #[serde(default = "default_min_entity_length")]
818    pub min_entity_length: usize,
819
820    /// Confidence threshold
821    #[serde(default = "default_algorithmic_confidence")]
822    pub confidence_threshold: f32,
823
824    /// Regex patterns for entity matching
825    pub patterns: Option<Vec<String>>,
826}
827
828/// Algorithmic retrieval configuration (BM25 keyword search)
829#[derive(Debug, Clone, Serialize, Deserialize)]
830pub struct AlgorithmicRetrievalConfig {
831    /// Retrieval strategy (always "bm25" for algorithmic)
832    #[serde(default = "default_algorithmic_retrieval_strategy")]
833    pub strategy: String,
834
835    /// BM25 k1 parameter (term frequency saturation)
836    #[serde(default = "default_bm25_k1")]
837    pub k1: f32,
838
839    /// BM25 b parameter (length normalization)
840    #[serde(default = "default_bm25_b")]
841    pub b: f32,
842
843    /// Use stemming
844    #[serde(default = "default_true")]
845    pub use_stemming: bool,
846
847    /// Language for stemming
848    #[serde(default = "default_language")]
849    pub language: String,
850
851    /// Top-k results
852    #[serde(default = "default_top_k")]
853    pub top_k: usize,
854}
855
856/// Algorithmic graph construction configuration (token overlap)
857#[derive(Debug, Clone, Serialize, Deserialize)]
858pub struct AlgorithmicGraphConfig {
859    /// Relation scorer (jaccard, cosine on token vectors)
860    #[serde(default = "default_algorithmic_relation_scorer")]
861    pub relation_scorer: String,
862
863    /// Use co-occurrence for relationship detection
864    #[serde(default = "default_true")]
865    pub use_cooccurrence: bool,
866
867    /// Co-occurrence window size
868    #[serde(default = "default_cooccurrence_window")]
869    pub window_size: usize,
870
871    /// Minimum relation score
872    #[serde(default = "default_algorithmic_min_relation_score")]
873    pub min_relation_score: f32,
874}
875
876/// Hybrid pipeline configuration
877/// Combines semantic and algorithmic approaches with weighted fusion
878#[derive(Debug, Clone, Serialize, Deserialize)]
879pub struct HybridPipelineConfig {
880    /// Enable hybrid pipeline
881    #[serde(default)]
882    pub enabled: bool,
883
884    /// Weight configuration for combining approaches
885    pub weights: HybridWeightsConfig,
886
887    /// Embeddings configuration for hybrid
888    pub embeddings: HybridEmbeddingsConfig,
889
890    /// Entity extraction configuration for hybrid
891    pub entity_extraction: HybridEntityConfig,
892
893    /// Retrieval configuration for hybrid
894    pub retrieval: HybridRetrievalConfig,
895
896    /// Graph construction configuration for hybrid
897    pub graph_construction: HybridGraphConfig,
898
899    /// Fallback strategy when primary fails
900    #[serde(default = "default_hybrid_fallback_strategy")]
901    pub fallback_strategy: String,
902
903    /// Enable cross-validation between approaches
904    #[serde(default = "default_true")]
905    pub cross_validation: bool,
906}
907
908/// Hybrid weight configuration
909#[derive(Debug, Clone, Serialize, Deserialize)]
910pub struct HybridWeightsConfig {
911    /// Weight for semantic approach (0.0-1.0)
912    #[serde(default = "default_hybrid_semantic_weight")]
913    pub semantic_weight: f32,
914
915    /// Weight for algorithmic approach (0.0-1.0)
916    #[serde(default = "default_hybrid_algorithmic_weight")]
917    pub algorithmic_weight: f32,
918}
919
920/// Hybrid embeddings configuration
921#[derive(Debug, Clone, Serialize, Deserialize)]
922pub struct HybridEmbeddingsConfig {
923    /// Primary backend (neural)
924    #[serde(default = "default_semantic_embedding_backend")]
925    pub primary: String,
926
927    /// Fallback backend (hash-based)
928    #[serde(default = "default_algorithmic_embedding_backend")]
929    pub fallback: String,
930
931    /// Combine scores from both
932    #[serde(default = "default_true")]
933    pub combine_scores: bool,
934
935    /// Auto-fallback when primary unavailable
936    #[serde(default = "default_true")]
937    pub auto_fallback: bool,
938}
939
940/// Hybrid entity extraction configuration
941#[derive(Debug, Clone, Serialize, Deserialize)]
942pub struct HybridEntityConfig {
943    /// Use both LLM and pattern extraction
944    #[serde(default = "default_true")]
945    pub use_both: bool,
946
947    /// Weight for LLM extraction (0.0-1.0)
948    #[serde(default = "default_hybrid_llm_weight")]
949    pub llm_weight: f32,
950
951    /// Weight for pattern extraction (0.0-1.0)
952    #[serde(default = "default_hybrid_pattern_weight")]
953    pub pattern_weight: f32,
954
955    /// Cross-validate LLM results with patterns
956    #[serde(default = "default_true")]
957    pub cross_validate: bool,
958
959    /// Confidence boost when both agree
960    #[serde(default = "default_hybrid_confidence_boost")]
961    pub confidence_boost: f32,
962}
963
964/// Hybrid retrieval configuration (RRF fusion)
965#[derive(Debug, Clone, Serialize, Deserialize)]
966pub struct HybridRetrievalConfig {
967    /// Retrieval strategy (always "fusion" for hybrid)
968    #[serde(default = "default_hybrid_retrieval_strategy")]
969    pub strategy: String,
970
971    /// Combine vector and BM25
972    #[serde(default = "default_true")]
973    pub combine_vector_bm25: bool,
974
975    /// Weight for vector search
976    #[serde(default = "default_hybrid_vector_weight")]
977    pub vector_weight: f32,
978
979    /// Weight for BM25 search
980    #[serde(default = "default_hybrid_bm25_weight")]
981    pub bm25_weight: f32,
982
983    /// RRF constant (typically 60)
984    #[serde(default = "default_rrf_constant")]
985    pub rrf_constant: usize,
986}
987
988/// Hybrid graph construction configuration
989#[derive(Debug, Clone, Serialize, Deserialize)]
990pub struct HybridGraphConfig {
991    /// Primary relation scorer (embedding-based)
992    #[serde(default = "default_semantic_relation_scorer")]
993    pub primary_scorer: String,
994
995    /// Fallback relation scorer (token-based)
996    #[serde(default = "default_algorithmic_relation_scorer")]
997    pub fallback_scorer: String,
998
999    /// Combine scores from both scorers
1000    #[serde(default = "default_true")]
1001    pub combine_scores: bool,
1002}
1003
1004/// Top-level entity extraction configuration (gleaning settings)
1005#[derive(Debug, Clone, Serialize, Deserialize)]
1006pub struct EntityExtractionTopLevelConfig {
1007    /// Enable entity extraction
1008    #[serde(default = "default_true")]
1009    pub enabled: bool,
1010
1011    /// Minimum confidence threshold
1012    #[serde(default = "default_confidence_threshold")]
1013    pub min_confidence: f32,
1014
1015    /// Use LLM-based gleaning
1016    #[serde(default)]
1017    pub use_gleaning: bool,
1018
1019    /// Maximum gleaning rounds
1020    #[serde(default = "default_gleaning_rounds")]
1021    pub max_gleaning_rounds: usize,
1022
1023    /// Gleaning improvement threshold
1024    #[serde(default = "default_gleaning_improvement")]
1025    pub gleaning_improvement_threshold: f32,
1026
1027    /// Enable semantic merging
1028    #[serde(default)]
1029    pub semantic_merging: bool,
1030
1031    /// Merge similarity threshold
1032    #[serde(default = "default_merge_threshold")]
1033    pub merge_similarity_threshold: f32,
1034
1035    /// Enable automatic linking
1036    #[serde(default)]
1037    pub automatic_linking: bool,
1038
1039    /// Linking confidence threshold
1040    #[serde(default = "default_confidence_threshold")]
1041    pub linking_confidence_threshold: f32,
1042}
1043
1044impl Default for EntityExtractionTopLevelConfig {
1045    fn default() -> Self {
1046        Self {
1047            enabled: true,
1048            min_confidence: default_confidence_threshold(),
1049            use_gleaning: false,
1050            max_gleaning_rounds: default_gleaning_rounds(),
1051            gleaning_improvement_threshold: default_gleaning_improvement(),
1052            semantic_merging: false,
1053            merge_similarity_threshold: default_merge_threshold(),
1054            automatic_linking: false,
1055            linking_confidence_threshold: default_confidence_threshold(),
1056        }
1057    }
1058}
1059
// Default value functions
/// Default log level: `"info"`.
fn default_log_level() -> String {
    String::from("info")
}
/// Default output directory: `"./output"`.
fn default_output_dir() -> String {
    String::from("./output")
}
/// Serde default helper for boolean fields that should be `true` when omitted.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Default workflow list, in order: text extraction, entity extraction, graph
/// building, community detection.
fn default_workflows() -> Vec<String> {
    const STAGES: [&str; 4] = [
        "extract_text",
        "extract_entities",
        "build_graph",
        "detect_communities",
    ];
    STAGES.iter().map(|stage| stage.to_string()).collect()
}
/// Default text-extraction chunk size: 512.
fn default_chunk_size() -> usize {
    512
}
/// Default overlap between consecutive chunks: 64.
fn default_chunk_overlap() -> usize {
    64
}
/// Default minimum chunk size: 50.
fn default_min_chunk_size() -> usize {
    50
}
/// Default model name for pipeline entity extraction.
fn default_ner_model() -> String {
    String::from("microsoft/DialoGPT-medium")
}
/// Default temperature for pipeline entity extraction: 0.1.
fn default_temperature() -> f32 {
    0.1_f32
}
/// Default `max_tokens` for pipeline entity extraction: 2048.
fn default_max_tokens() -> usize {
    2048
}
/// Default minimum entity length: 3.
fn default_min_entity_length() -> usize {
    3
}
/// Default maximum entity length: 100.
fn default_max_entity_length() -> usize {
    100
}
/// Default confidence threshold shared by several entity-extraction fields: 0.8.
fn default_confidence_threshold() -> f32 {
    0.8_f32
}
/// Default relation scorer for graph building: `"cosine_similarity"`.
fn default_relation_scorer() -> String {
    String::from("cosine_similarity")
}
/// Default minimum relation score: 0.7.
fn default_min_relation_score() -> f32 {
    0.7_f32
}
/// Default maximum number of connections per graph node: 10.
fn default_max_connections() -> usize {
    10
}
/// Default community-detection algorithm: `"leiden"`.
fn default_community_algorithm() -> String {
    String::from("leiden")
}
/// Default community-detection resolution: 1.0.
fn default_resolution() -> f32 {
    1.0_f32
}
/// Default minimum community size: 3.
fn default_min_community_size() -> usize {
    3
}
/// Default storage database type: `"sqlite"`.
fn default_database_type() -> String {
    String::from("sqlite")
}
/// Default storage database path: `"./graphrag.db"`.
fn default_database_path() -> String {
    String::from("./graphrag.db")
}
/// Default pool size: 10.
fn default_pool_size() -> usize {
    10
}
/// Default primary LLM name: `"gpt-4"`.
fn default_primary_llm() -> String {
    String::from("gpt-4")
}
/// Default embedding model name for the `models` section.
fn default_embedding_model() -> String {
    String::from("text-embedding-ada-002")
}
/// Default maximum context length: 4096.
fn default_max_context() -> usize {
    4096
}
/// Default `top_p` value: 0.9.
fn default_top_p() -> f32 {
    0.9_f32
}
/// Default Ollama URL: `"http://localhost:11434"`.
fn default_ollama_url() -> String {
    String::from("http://localhost:11434")
}
/// Default Ollama model name: `"llama2:7b"`.
fn default_ollama_model() -> String {
    String::from("llama2:7b")
}
/// Default Ollama embedding name: `"nomic-embed-text"`.
fn default_ollama_embedding() -> String {
    String::from("nomic-embed-text")
}
/// Default batch size: 100.
fn default_batch_size() -> usize {
    100
}
/// Default number of worker threads: 4.
fn default_worker_threads() -> usize {
    4
}
/// Default memory limit in megabytes (feeds `memory_limit_mb`): 1024.
fn default_memory_limit() -> usize {
    1024
}
/// Default Ollama host (scheme included, port configured separately).
fn default_ollama_host() -> String {
    String::from("http://localhost")
}
/// Default Ollama port: 11434.
fn default_ollama_port() -> u16 {
    11434
}
/// Default chat model name: `"llama3.1:8b"`.
fn default_chat_model() -> String {
    String::from("llama3.1:8b")
}
/// Default embedding model for the Ollama section: `"nomic-embed-text"`.
fn default_embedding_model_ollama() -> String {
    String::from("nomic-embed-text")
}
/// Default request timeout in seconds (feeds `timeout_seconds`): 60.
fn default_timeout() -> u64 {
    60
}
/// Default maximum number of retries: 3.
fn default_max_retries() -> u32 {
    3
}
/// Default maximum gleaning rounds for the top-level entity-extraction section: 3.
fn default_gleaning_rounds() -> usize {
    3
}
/// Default gleaning improvement threshold: 0.1.
fn default_gleaning_improvement() -> f32 {
    0.1_f32
}
/// Default merge similarity threshold: 0.85.
fn default_merge_threshold() -> f32 {
    0.85_f32
}
1189
// =============================================================================
// Default functions for Pipeline Approach Configuration
// =============================================================================
1193
// Mode defaults
/// Default pipeline approach: the semantic pipeline.
fn default_approach() -> String {
    String::from("semantic")
}
1198
// Semantic pipeline defaults
/// Default semantic embedding backend: `"huggingface"`.
fn default_semantic_embedding_backend() -> String {
    String::from("huggingface")
}
/// Default semantic embedding model identifier.
fn default_semantic_embedding_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2")
}
/// Default semantic embedding dimension: 384 (the MiniLM-L6-v2 dimension).
fn default_semantic_embedding_dim() -> usize {
    384
}
/// Default similarity metric: `"cosine"`.
fn default_similarity_metric() -> String {
    String::from("cosine")
}
/// Default semantic entity-extraction method: `"llm"`.
fn default_semantic_entity_method() -> String {
    String::from("llm")
}
/// Default maximum gleaning rounds for the semantic pipeline: 3.
fn default_max_gleaning_rounds() -> usize {
    3
}
/// Default temperature for semantic entity extraction: 0.1.
fn default_semantic_temperature() -> f32 {
    0.1_f32
}
/// Default confidence threshold for semantic entity extraction: 0.7.
fn default_semantic_confidence() -> f32 {
    0.7_f32
}
/// Default semantic retrieval strategy: `"vector"`.
fn default_semantic_retrieval_strategy() -> String {
    String::from("vector")
}
/// Default HNSW `ef_construction` value: 200.
fn default_hnsw_ef_construction() -> usize {
    200
}
/// Default HNSW `m` value: 16.
fn default_hnsw_m() -> usize {
    16
}
/// Default retrieval `top_k`: 10.
fn default_top_k() -> usize {
    10
}
/// Default similarity threshold for semantic retrieval: 0.7.
fn default_semantic_similarity_threshold() -> f32 {
    0.7_f32
}
/// Default semantic relation scorer: `"embedding_similarity"`.
fn default_semantic_relation_scorer() -> String {
    String::from("embedding_similarity")
}
1242
// Algorithmic pipeline defaults
/// Default algorithmic embedding backend: `"hash"`.
fn default_algorithmic_embedding_backend() -> String {
    String::from("hash")
}
/// Default algorithmic embedding dimension: 128.
fn default_algorithmic_embedding_dim() -> usize {
    128
}
/// Default vocabulary size: 10,000.
fn default_vocabulary_size() -> usize {
    10_000
}
/// Default minimum term frequency: 2.
fn default_min_term_frequency() -> usize {
    2
}
/// Default maximum document frequency: 0.8.
fn default_max_document_frequency() -> f32 {
    0.8_f32
}
/// Default algorithmic entity-extraction method: `"pattern"`.
fn default_algorithmic_entity_method() -> String {
    String::from("pattern")
}
/// Default confidence threshold for algorithmic entity extraction: 0.75.
fn default_algorithmic_confidence() -> f32 {
    0.75_f32
}
/// Default algorithmic retrieval strategy: `"bm25"`.
fn default_algorithmic_retrieval_strategy() -> String {
    String::from("bm25")
}
/// Default BM25 `k1` parameter: 1.5.
fn default_bm25_k1() -> f32 {
    1.5_f32
}
/// Default BM25 `b` parameter: 0.75.
fn default_bm25_b() -> f32 {
    0.75_f32
}
/// Default language for algorithmic retrieval: `"english"`.
fn default_language() -> String {
    String::from("english")
}
/// Default algorithmic relation scorer: `"jaccard"`.
fn default_algorithmic_relation_scorer() -> String {
    String::from("jaccard")
}
/// Default co-occurrence window size: 10.
fn default_cooccurrence_window() -> usize {
    10
}
/// Default minimum relation score for the algorithmic graph: 0.6.
fn default_algorithmic_min_relation_score() -> f32 {
    0.6_f32
}
1286
// Hybrid pipeline defaults (the auto-save defaults follow at the end of this group)
/// Default weight of the semantic side in the hybrid pipeline: 0.6.
fn default_hybrid_semantic_weight() -> f32 {
    0.6_f32
}
/// Default weight of the algorithmic side in the hybrid pipeline: 0.4.
fn default_hybrid_algorithmic_weight() -> f32 {
    0.4_f32
}
/// Default weight for LLM extraction in hybrid entity extraction: 0.7.
fn default_hybrid_llm_weight() -> f32 {
    0.7_f32
}
/// Default weight for pattern extraction in hybrid entity extraction: 0.3.
fn default_hybrid_pattern_weight() -> f32 {
    0.3_f32
}
/// Default confidence boost when both hybrid extractors agree: 0.15.
fn default_hybrid_confidence_boost() -> f32 {
    0.15_f32
}
/// Default hybrid retrieval strategy: `"fusion"`.
fn default_hybrid_retrieval_strategy() -> String {
    String::from("fusion")
}
/// Default weight for vector search in hybrid retrieval: 0.6.
fn default_hybrid_vector_weight() -> f32 {
    0.6_f32
}
/// Default weight for BM25 search in hybrid retrieval: 0.4.
fn default_hybrid_bm25_weight() -> f32 {
    0.4_f32
}
/// Default RRF constant for hybrid retrieval: 60.
fn default_rrf_constant() -> usize {
    60
}
/// Default hybrid fallback strategy: `"semantic_first"`.
fn default_hybrid_fallback_strategy() -> String {
    String::from("semantic_first")
}
/// Default auto-save interval in seconds: five minutes.
fn default_auto_save_interval() -> u64 {
    const MINUTES: u64 = 5;
    MINUTES * 60
}
/// Default number of auto-save versions to keep: 5.
fn default_max_auto_save_versions() -> usize {
    5
}
1324
// LazyGraphRAG default functions
/// Default minimum concept length: 3 characters.
fn default_min_concept_length() -> usize {
    3
}
/// Default maximum number of words per concept: 5.
fn default_max_concept_words() -> usize {
    5
}
/// Default co-occurrence threshold: at least 1 shared chunk for a relationship.
fn default_co_occurrence_threshold() -> usize {
    1
}
/// Default cap on query refinement iterations: 3.
fn default_max_refinement_iterations() -> usize {
    3
}
1338
// E2GraphRAG default functions
/// Default E2GraphRAG entity types: person, organization, location, concept.
fn default_e2_entity_types() -> Vec<String> {
    const ENTITY_TYPES: [&str; 4] = ["PERSON", "ORGANIZATION", "LOCATION", "CONCEPT"];
    ENTITY_TYPES.iter().map(|kind| kind.to_string()).collect()
}
/// Default minimum confidence for pattern-based extraction: 0.6 (60%).
fn default_e2_min_confidence() -> f32 {
    0.6_f32
}
/// Default minimum entity frequency: an entity must appear at least once.
fn default_min_entity_frequency() -> usize {
    1
}
1354
1355impl Default for GeneralConfig {
1356    fn default() -> Self {
1357        Self {
1358            log_level: default_log_level(),
1359            output_dir: default_output_dir(),
1360            input_document_path: None,
1361            max_threads: None,
1362            enable_profiling: false,
1363        }
1364    }
1365}
1366
1367impl Default for PipelineConfig {
1368    fn default() -> Self {
1369        Self {
1370            workflows: default_workflows(),
1371            parallel_execution: default_true(),
1372            text_extraction: TextExtractionConfig::default(),
1373            entity_extraction: EntityExtractionConfig::default(),
1374            graph_building: GraphBuildingConfig::default(),
1375            community_detection: CommunityDetectionConfig::default(),
1376        }
1377    }
1378}
1379
1380impl Default for TextExtractionConfig {
1381    fn default() -> Self {
1382        Self {
1383            chunk_size: default_chunk_size(),
1384            chunk_overlap: default_chunk_overlap(),
1385            clean_control_chars: default_true(),
1386            min_chunk_size: default_min_chunk_size(),
1387            cleaning: None,
1388        }
1389    }
1390}
1391
1392impl Default for EntityExtractionConfig {
1393    fn default() -> Self {
1394        Self {
1395            model_name: default_ner_model(),
1396            temperature: default_temperature(),
1397            max_tokens: default_max_tokens(),
1398            entity_types: None,
1399            confidence_threshold: default_confidence_threshold(),
1400            custom_prompt: None,
1401            filters: None,
1402        }
1403    }
1404}
1405
1406impl Default for GraphBuildingConfig {
1407    fn default() -> Self {
1408        Self {
1409            relation_scorer: default_relation_scorer(),
1410            min_relation_score: default_min_relation_score(),
1411            max_connections_per_node: default_max_connections(),
1412            bidirectional_relations: default_true(),
1413        }
1414    }
1415}
1416
1417impl Default for CommunityDetectionConfig {
1418    fn default() -> Self {
1419        Self {
1420            algorithm: default_community_algorithm(),
1421            resolution: default_resolution(),
1422            min_community_size: default_min_community_size(),
1423            max_community_size: 0,
1424        }
1425    }
1426}
1427
1428impl Default for StorageConfig {
1429    fn default() -> Self {
1430        Self {
1431            database_type: default_database_type(),
1432            database_path: default_database_path(),
1433            enable_wal: default_true(),
1434            postgresql: None,
1435            neo4j: None,
1436        }
1437    }
1438}
1439
1440impl Default for ModelsConfig {
1441    fn default() -> Self {
1442        Self {
1443            primary_llm: default_primary_llm(),
1444            embedding_model: default_embedding_model(),
1445            max_context_length: default_max_context(),
1446            llm_params: None,
1447            local: None,
1448        }
1449    }
1450}
1451
1452impl Default for PerformanceConfig {
1453    fn default() -> Self {
1454        Self {
1455            batch_processing: default_true(),
1456            batch_size: default_batch_size(),
1457            worker_threads: default_worker_threads(),
1458            memory_limit_mb: default_memory_limit(),
1459        }
1460    }
1461}
1462
1463impl Default for OllamaSetConfig {
1464    fn default() -> Self {
1465        Self {
1466            enabled: default_true(),
1467            host: default_ollama_host(),
1468            port: default_ollama_port(),
1469            chat_model: default_chat_model(),
1470            embedding_model: default_embedding_model_ollama(),
1471            timeout_seconds: default_timeout(),
1472            max_retries: default_max_retries(),
1473            fallback_to_hash: false,
1474            max_tokens: Some(800),
1475            temperature: Some(0.3),
1476        }
1477    }
1478}
1479
// =============================================================================
// Default implementations for Pipeline Approach Configuration
// =============================================================================
1483
1484impl Default for SemanticPipelineConfig {
1485    fn default() -> Self {
1486        Self {
1487            enabled: true,
1488            embeddings: SemanticEmbeddingsConfig::default(),
1489            entity_extraction: SemanticEntityConfig::default(),
1490            retrieval: SemanticRetrievalConfig::default(),
1491            graph_construction: SemanticGraphConfig::default(),
1492        }
1493    }
1494}
1495
1496impl Default for SemanticEmbeddingsConfig {
1497    fn default() -> Self {
1498        Self {
1499            backend: default_semantic_embedding_backend(),
1500            model: default_semantic_embedding_model(),
1501            dimension: default_semantic_embedding_dim(),
1502            use_gpu: default_true(),
1503            similarity_metric: default_similarity_metric(),
1504            batch_size: default_batch_size(),
1505        }
1506    }
1507}
1508
1509impl Default for SemanticEntityConfig {
1510    fn default() -> Self {
1511        Self {
1512            method: default_semantic_entity_method(),
1513            use_gleaning: default_true(),
1514            max_gleaning_rounds: default_max_gleaning_rounds(),
1515            model: default_chat_model(),
1516            temperature: default_semantic_temperature(),
1517            confidence_threshold: default_semantic_confidence(),
1518        }
1519    }
1520}
1521
1522impl Default for SemanticRetrievalConfig {
1523    fn default() -> Self {
1524        Self {
1525            strategy: default_semantic_retrieval_strategy(),
1526            use_hnsw: default_true(),
1527            hnsw_ef_construction: default_hnsw_ef_construction(),
1528            hnsw_m: default_hnsw_m(),
1529            top_k: default_top_k(),
1530            similarity_threshold: default_semantic_similarity_threshold(),
1531        }
1532    }
1533}
1534
1535impl Default for SemanticGraphConfig {
1536    fn default() -> Self {
1537        Self {
1538            relation_scorer: default_semantic_relation_scorer(),
1539            use_transformer_embeddings: default_true(),
1540            min_relation_score: default_min_relation_score(),
1541        }
1542    }
1543}
1544
1545impl Default for AlgorithmicPipelineConfig {
1546    fn default() -> Self {
1547        Self {
1548            enabled: false,
1549            embeddings: AlgorithmicEmbeddingsConfig::default(),
1550            entity_extraction: AlgorithmicEntityConfig::default(),
1551            retrieval: AlgorithmicRetrievalConfig::default(),
1552            graph_construction: AlgorithmicGraphConfig::default(),
1553        }
1554    }
1555}
1556
1557impl Default for AlgorithmicEmbeddingsConfig {
1558    fn default() -> Self {
1559        Self {
1560            backend: default_algorithmic_embedding_backend(),
1561            dimension: default_algorithmic_embedding_dim(),
1562            use_tfidf: default_true(),
1563            vocabulary_size: default_vocabulary_size(),
1564            min_term_frequency: default_min_term_frequency(),
1565            max_document_frequency: default_max_document_frequency(),
1566        }
1567    }
1568}
1569
1570impl Default for AlgorithmicEntityConfig {
1571    fn default() -> Self {
1572        Self {
1573            method: default_algorithmic_entity_method(),
1574            use_ner_rules: default_true(),
1575            use_pos_tagging: false,
1576            min_entity_length: default_min_entity_length(),
1577            confidence_threshold: default_algorithmic_confidence(),
1578            patterns: None,
1579        }
1580    }
1581}
1582
1583impl Default for AlgorithmicRetrievalConfig {
1584    fn default() -> Self {
1585        Self {
1586            strategy: default_algorithmic_retrieval_strategy(),
1587            k1: default_bm25_k1(),
1588            b: default_bm25_b(),
1589            use_stemming: default_true(),
1590            language: default_language(),
1591            top_k: default_top_k(),
1592        }
1593    }
1594}
1595
1596impl Default for AlgorithmicGraphConfig {
1597    fn default() -> Self {
1598        Self {
1599            relation_scorer: default_algorithmic_relation_scorer(),
1600            use_cooccurrence: default_true(),
1601            window_size: default_cooccurrence_window(),
1602            min_relation_score: default_algorithmic_min_relation_score(),
1603        }
1604    }
1605}
1606
1607impl Default for HybridPipelineConfig {
1608    fn default() -> Self {
1609        Self {
1610            enabled: false,
1611            weights: HybridWeightsConfig::default(),
1612            embeddings: HybridEmbeddingsConfig::default(),
1613            entity_extraction: HybridEntityConfig::default(),
1614            retrieval: HybridRetrievalConfig::default(),
1615            graph_construction: HybridGraphConfig::default(),
1616            fallback_strategy: default_hybrid_fallback_strategy(),
1617            cross_validation: default_true(),
1618        }
1619    }
1620}
1621
1622impl Default for HybridWeightsConfig {
1623    fn default() -> Self {
1624        Self {
1625            semantic_weight: default_hybrid_semantic_weight(),
1626            algorithmic_weight: default_hybrid_algorithmic_weight(),
1627        }
1628    }
1629}
1630
1631impl Default for HybridEmbeddingsConfig {
1632    fn default() -> Self {
1633        Self {
1634            primary: default_semantic_embedding_backend(),
1635            fallback: default_algorithmic_embedding_backend(),
1636            combine_scores: default_true(),
1637            auto_fallback: default_true(),
1638        }
1639    }
1640}
1641
1642impl Default for HybridEntityConfig {
1643    fn default() -> Self {
1644        Self {
1645            use_both: default_true(),
1646            llm_weight: default_hybrid_llm_weight(),
1647            pattern_weight: default_hybrid_pattern_weight(),
1648            cross_validate: default_true(),
1649            confidence_boost: default_hybrid_confidence_boost(),
1650        }
1651    }
1652}
1653
1654impl Default for HybridRetrievalConfig {
1655    fn default() -> Self {
1656        Self {
1657            strategy: default_hybrid_retrieval_strategy(),
1658            combine_vector_bm25: default_true(),
1659            vector_weight: default_hybrid_vector_weight(),
1660            bm25_weight: default_hybrid_bm25_weight(),
1661            rrf_constant: default_rrf_constant(),
1662        }
1663    }
1664}
1665
1666impl Default for HybridGraphConfig {
1667    fn default() -> Self {
1668        Self {
1669            primary_scorer: default_semantic_relation_scorer(),
1670            fallback_scorer: default_algorithmic_relation_scorer(),
1671            combine_scores: default_true(),
1672        }
1673    }
1674}
1675
1676impl SetConfig {
1677    /// Load configuration from TOML or JSON5 file (auto-detects format by extension)
1678    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
1679        let path_ref = path.as_ref();
1680        let content = fs::read_to_string(path_ref)?;
1681
1682        // Detect format by file extension
1683        let extension = path_ref
1684            .extension()
1685            .and_then(|e| e.to_str())
1686            .unwrap_or("");
1687
1688        let config: SetConfig = match extension {
1689            #[cfg(feature = "json5-support")]
1690            "json5" | "json" => {
1691                json5::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1692                    message: format!("JSON5 parse error: {e}"),
1693                })?
1694            }
1695            #[cfg(not(feature = "json5-support"))]
1696            "json5" | "json" => {
1697                return Err(crate::core::GraphRAGError::Config {
1698                    message: "JSON5 support not enabled. Rebuild with --features json5-support".to_string(),
1699                });
1700            }
1701            _ => {
1702                toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1703                    message: format!("TOML parse error: {e}"),
1704                })?
1705            }
1706        };
1707
1708        Ok(config)
1709    }
1710
1711    /// Save configuration to TOML file with comments
1712    pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
1713        let toml_string =
1714            toml::to_string_pretty(&self).map_err(|e| crate::core::GraphRAGError::Config {
1715                message: format!("TOML serialize error: {e}"),
1716            })?;
1717
1718        // Add header comment
1719        let commented_toml = format!(
1720            "# =============================================================================\n\
1721             # GraphRAG Configuration File\n\
1722             # Complete configuration with extensive parameters for easy customization\n\
1723             # =============================================================================\n\n{toml_string}"
1724        );
1725
1726        fs::write(path, commented_toml)?;
1727        Ok(())
1728    }
1729
1730    /// Convert to the existing Config format for compatibility
1731    pub fn to_graphrag_config(&self) -> crate::Config {
1732        let mut config = crate::Config::default();
1733
1734        // Map pipeline approach (semantic/algorithmic/hybrid)
1735        config.approach = self.mode.approach.clone();
1736
1737        // Map text processing
1738        config.text.chunk_size = self.pipeline.text_extraction.chunk_size;
1739        config.text.chunk_overlap = self.pipeline.text_extraction.chunk_overlap;
1740
1741        // Map entity extraction based on approach
1742        config.entities.min_confidence = self
1743            .entity_extraction
1744            .min_confidence;
1745
1746        // Map entity types from pipeline.entity_extraction
1747        if let Some(ref types) = self.pipeline.entity_extraction.entity_types {
1748            config.entities.entity_types = types.clone();
1749        }
1750
1751        // Configure gleaning based on approach:
1752        // - semantic: use LLM-based gleaning
1753        // - algorithmic: use pattern-based extraction
1754        // - hybrid: use both (for compatibility, map to gleaning)
1755        match self.mode.approach.as_str() {
1756            "semantic" => {
1757                if let Some(ref semantic) = self.semantic {
1758                    config.entities.use_gleaning = semantic.entity_extraction.use_gleaning;
1759                    config.entities.max_gleaning_rounds = semantic.entity_extraction.max_gleaning_rounds;
1760                    config.entities.min_confidence = semantic.entity_extraction.confidence_threshold;
1761                } else {
1762                    // Fallback for semantic approach: ALWAYS enable gleaning when mode.approach = "semantic"
1763                    // This ensures JSON5 configs with mode.approach="semantic" use LLM-based extraction
1764                    config.entities.use_gleaning = true;
1765                    config.entities.max_gleaning_rounds = if self.entity_extraction.use_gleaning {
1766                        self.entity_extraction.max_gleaning_rounds
1767                    } else {
1768                        default_max_gleaning_rounds() // Use default if not explicitly set
1769                    };
1770                    // Use top-level min_confidence if available
1771                    config.entities.min_confidence = self.entity_extraction.min_confidence;
1772                }
1773            }
1774            "algorithmic" => {
1775                // Algorithmic uses pattern-based extraction, no gleaning
1776                config.entities.use_gleaning = false;
1777                if let Some(ref algorithmic) = self.algorithmic {
1778                    config.entities.min_confidence = algorithmic.entity_extraction.confidence_threshold;
1779                }
1780            }
1781            "hybrid" => {
1782                // Hybrid can use both, enable gleaning for LLM component
1783                config.entities.use_gleaning = true;
1784                if self.hybrid.is_some() {
1785                    // Use hybrid configuration if available
1786                    config.entities.max_gleaning_rounds = 2; // Reduced for hybrid efficiency
1787                }
1788            }
1789            _ => {
1790                // Unknown approach, use top-level config as fallback
1791                config.entities.use_gleaning = self.entity_extraction.use_gleaning;
1792                config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
1793            }
1794        }
1795
1796        // Map graph building
1797        config.graph.similarity_threshold = self.pipeline.graph_building.min_relation_score;
1798        config.graph.max_connections = self.pipeline.graph_building.max_connections_per_node;
1799        config.graph.extract_relationships = true; // Enable by default for TOML configs
1800        config.graph.relationship_confidence_threshold = 0.5; // Default threshold
1801
1802        // Map retrieval
1803        config.retrieval.top_k = 10; // Default
1804
1805        // Map embeddings
1806        config.embeddings.dimension = 768; // Default for nomic-embed-text
1807        config.embeddings.backend = "ollama".to_string();
1808        config.embeddings.fallback_to_hash = self.ollama.fallback_to_hash;
1809
1810        // Map parallel processing
1811        config.parallel.enabled = self.pipeline.parallel_execution;
1812        config.parallel.num_threads = self.performance.worker_threads;
1813
1814        // Map Ollama configuration
1815        config.ollama = crate::ollama::OllamaConfig {
1816            enabled: self.ollama.enabled,
1817            host: self.ollama.host.clone(),
1818            port: self.ollama.port,
1819            chat_model: self.ollama.chat_model.clone(),
1820            embedding_model: self.ollama.embedding_model.clone(),
1821            timeout_seconds: self.ollama.timeout_seconds,
1822            max_retries: self.ollama.max_retries,
1823            fallback_to_hash: self.ollama.fallback_to_hash,
1824            max_tokens: self.ollama.max_tokens,
1825            temperature: self.ollama.temperature,
1826        };
1827
1828        // Map auto-save configuration
1829        config.auto_save = crate::config::AutoSaveConfig {
1830            enabled: self.auto_save.enabled,
1831            interval_seconds: self.auto_save.interval_seconds,
1832            workspace_name: self.auto_save.workspace_name.clone(),
1833            max_versions: self.auto_save.max_versions,
1834        };
1835
1836        config
1837    }
1838}