1use crate::Result;
8use std::fs;
9
10pub mod enhancements;
12#[cfg(feature = "json5-support")]
14pub mod json5_loader;
15mod json_parser;
17pub mod loader;
19#[cfg(feature = "json5-support")]
21pub mod schema_validator;
22pub mod setconfig;
24pub mod validation;
26
27pub use setconfig::{
28 AlgorithmicEmbeddingsConfig,
29 AlgorithmicEntityConfig,
30 AlgorithmicGraphConfig,
31 AlgorithmicPipelineConfig,
33 AlgorithmicRetrievalConfig,
34 HybridEmbeddingsConfig,
35 HybridEntityConfig,
36 HybridGraphConfig,
37 HybridPipelineConfig,
39 HybridRetrievalConfig,
40 HybridWeightsConfig,
41 ModeConfig,
43 SemanticEmbeddingsConfig,
44 SemanticEntityConfig,
45 SemanticGraphConfig,
46 SemanticPipelineConfig,
48 SemanticRetrievalConfig,
49 SetConfig,
50};
51pub use validation::{validate_config_file, Validatable, ValidationResult};
52
/// Root configuration: global chunking/retrieval knobs plus one nested section per subsystem.
///
/// Only `approach`, `advanced_features` and `suppress_progress_bars` carry serde defaults
/// here; `Config::default()` provides the programmatic baseline for every field.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Config {
    /// Output directory path (`"./output"` in `Config::default()`; `Config::quick` sets it
    /// to the workspace path).
    pub output_dir: String,

    /// Global chunk size; `with_chunk_size` writes it together with `text.chunk_size`.
    pub chunk_size: usize,

    /// Global chunk overlap; `with_chunk_size` sets it to `size / 5`.
    pub chunk_overlap: usize,

    /// Optional entities-per-chunk value (`Some(10)` in `Config::default()`).
    pub max_entities_per_chunk: Option<usize>,

    /// Optional top-k result count (`Some(10)` in `Config::default()`).
    pub top_k_results: Option<usize>,

    /// Optional similarity threshold (`Some(0.8)` in `Config::default()`).
    pub similarity_threshold: Option<f32>,

    /// Approach name; falls back to `"semantic"` when absent from deserialized input.
    #[serde(default = "default_approach")]
    pub approach: String,

    /// Embedding settings.
    pub embeddings: EmbeddingConfig,

    /// Graph settings.
    pub graph: GraphConfig,

    /// Text-processing settings (carries its own chunk size/overlap).
    pub text: TextConfig,

    /// Entity-extraction settings.
    pub entities: EntityConfig,

    /// Retrieval settings.
    pub retrieval: RetrievalConfig,

    /// Parallel-processing settings.
    pub parallel: ParallelConfig,

    /// Ollama settings (type defined in `crate::ollama`).
    pub ollama: crate::ollama::OllamaConfig,

    /// GLiNER settings.
    pub gliner: GlinerConfig,

    /// Enhancements settings (type defined in the `enhancements` submodule).
    pub enhancements: enhancements::EnhancementsConfig,

    /// Auto-save settings.
    pub auto_save: AutoSaveConfig,

    /// Summarization settings (type defined in `crate::summarization`).
    pub summarization: crate::summarization::HierarchicalConfig,

    /// Zero-cost approach settings.
    pub zero_cost_approach: ZeroCostApproachConfig,

    /// Advanced feature settings; uses `AdvancedFeaturesConfig::default()` when absent.
    #[serde(default)]
    pub advanced_features: AdvancedFeaturesConfig,

    /// Progress-bar suppression flag; `false` when absent.
    #[serde(default)]
    pub suppress_progress_bars: bool,
}
124
/// GLiNER settings. See the `Default` impl for baseline values.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GlinerConfig {
    /// Whether GLiNER is enabled (default `false`).
    pub enabled: bool,
    /// Model path (default: empty string).
    pub model_path: String,
    /// Tokenizer path (default: empty string).
    pub tokenizer_path: String,
    /// Mode name (default `"span"`).
    pub mode: String,
    /// Entity labels.
    pub entity_labels: Vec<String>,
    /// Relation labels.
    pub relation_labels: Vec<String>,
    /// Entity threshold (default `0.4`).
    pub entity_threshold: f32,
    /// Relation threshold (default `0.5`).
    pub relation_threshold: f32,
    /// GPU flag (default `false`).
    pub use_gpu: bool,
    /// Optional concurrent-chunk limit; `None` when absent from deserialized input.
    #[serde(default)]
    pub max_concurrent_chunks: Option<usize>,
}
151
152impl Default for GlinerConfig {
153 fn default() -> Self {
154 Self {
155 enabled: false,
156 model_path: String::new(),
157 tokenizer_path: String::new(),
158 mode: "span".to_string(),
159 entity_labels: vec![
160 "person".into(),
161 "organization".into(),
162 "location".into(),
163 "concept".into(),
164 ],
165 relation_labels: vec!["related to".into(), "part of".into(), "causes".into()],
166 entity_threshold: 0.4,
167 relation_threshold: 0.5,
168 use_gpu: false,
169 max_concurrent_chunks: None,
170 }
171 }
172}
173
/// Auto-save settings. Every field has a serde default, so any subset may be omitted
/// from deserialized input.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct AutoSaveConfig {
    /// Whether auto-save is enabled; `false` when absent.
    #[serde(default)]
    pub enabled: bool,

    /// Optional base directory; `None` when absent.
    #[serde(default)]
    pub base_dir: Option<String>,

    /// Interval in seconds; falls back to `300` when absent.
    #[serde(default = "default_auto_save_interval")]
    pub interval_seconds: u64,

    /// Optional workspace name; `None` when absent.
    #[serde(default)]
    pub workspace_name: Option<String>,

    /// Maximum number of versions; falls back to `5` when absent.
    #[serde(default = "default_max_versions")]
    pub max_versions: usize,
}
200
/// Selection and per-strategy settings for the zero-cost approaches. Every field has a
/// serde default.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ZeroCostApproachConfig {
    /// Selected approach name; falls back to `"pure_algorithmic"` when absent.
    #[serde(default = "default_zero_cost_approach")]
    pub approach: String,

    /// LazyGraphRAG settings.
    #[serde(default)]
    pub lazy_graphrag: LazyGraphRAGConfig,

    /// E2GraphRAG settings.
    #[serde(default)]
    pub e2_graphrag: E2GraphRAGConfig,

    /// Pure-algorithmic settings.
    #[serde(default)]
    pub pure_algorithmic: PureAlgorithmicConfig,

    /// Hybrid-strategy settings.
    #[serde(default)]
    pub hybrid_strategy: HybridStrategyConfig,
}
224
/// LazyGraphRAG settings. `Default` is derived, so `enabled` defaults to `false` and each
/// section to its own `Default`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct LazyGraphRAGConfig {
    /// Whether this approach is enabled.
    pub enabled: bool,
    /// Concept-extraction settings.
    pub concept_extraction: ConceptExtractionConfig,
    /// Co-occurrence settings.
    pub co_occurrence: CoOccurrenceConfig,
    /// Indexing settings.
    pub indexing: LazyIndexingConfig,
    /// Query-expansion settings.
    pub query_expansion: LazyQueryExpansionConfig,
    /// Relevance-scoring settings.
    pub relevance_scoring: LazyRelevanceScoringConfig,
}
242
/// Concept-extraction settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ConceptExtractionConfig {
    /// Minimum concept length (default `3`).
    pub min_concept_length: usize,
    /// Maximum words per concept (default `5`).
    pub max_concept_words: usize,
    /// Noun-phrase flag (default `true`).
    pub use_noun_phrases: bool,
    /// Capitalization flag (default `true`).
    pub use_capitalization: bool,
    /// Title-case flag (default `true`).
    pub use_title_case: bool,
    /// TF-IDF scoring flag (default `true`).
    pub use_tf_idf_scoring: bool,
    /// Minimum term frequency (default `2`).
    pub min_term_frequency: usize,
    /// Maximum concepts per chunk (default `10`).
    pub max_concepts_per_chunk: usize,
    /// Minimum concept score (default `0.1`).
    pub min_concept_score: f32,
    /// Stopword-exclusion flag (default `true`).
    pub exclude_stopwords: bool,
    /// Custom stopword list (default `["the", "and", "or"]`).
    pub custom_stopwords: Vec<String>,
}
270
/// Co-occurrence settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CoOccurrenceConfig {
    /// Window size (default `50`).
    pub window_size: usize,
    /// Minimum co-occurrence count (default `2`).
    pub min_co_occurrence: usize,
    /// Jaccard threshold (default `0.2`).
    pub jaccard_threshold: f32,
    /// Maximum edges per node (default `25`).
    pub max_edges_per_node: usize,
}
284
/// Indexing settings for LazyGraphRAG. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyIndexingConfig {
    /// Bidirectional-index flag (default `true`).
    pub use_bidirectional_index: bool,
    /// HNSW-index flag (default `false`).
    pub enable_hnsw_index: bool,
    /// Cache size (default `10000`).
    pub cache_size: usize,
}
296
/// Query-expansion settings for LazyGraphRAG. Defaults quoted below come from the
/// `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyQueryExpansionConfig {
    /// Whether query expansion is enabled (default `true`).
    pub enabled: bool,
    /// Maximum number of expansions (default `3`).
    pub max_expansions: usize,
    /// Expansion model name (default `"llama3.1:8b"`).
    pub expansion_model: String,
    /// Expansion temperature (default `0.1`).
    pub expansion_temperature: f32,
    /// Maximum tokens per expansion (default `50`).
    pub max_tokens_per_expansion: usize,
}
312
/// Relevance-scoring settings for LazyGraphRAG. Defaults quoted below come from the
/// `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyRelevanceScoringConfig {
    /// Whether relevance scoring is enabled (default `true`).
    pub enabled: bool,
    /// Scoring model name (default `"llama3.1:8b"`).
    pub scoring_model: String,
    /// Batch size (default `10`).
    pub batch_size: usize,
    /// Temperature (default `0.2`).
    pub temperature: f32,
    /// Maximum tokens per score (default `30`).
    pub max_tokens_per_score: usize,
}
328
/// E2GraphRAG settings. `Default` is derived, so `enabled` defaults to `false` and each
/// section to its own `Default`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct E2GraphRAGConfig {
    /// Whether this approach is enabled.
    pub enabled: bool,

    /// NER extraction settings.
    pub ner_extraction: NERExtractionConfig,

    /// Keyword extraction settings.
    pub keyword_extraction: KeywordExtractionConfig,

    /// Graph construction settings.
    pub graph_construction: E2GraphConstructionConfig,

    /// Indexing settings.
    pub indexing: E2IndexingConfig,
}
348
/// NER extraction settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct NERExtractionConfig {
    /// Entity type labels (default `["PERSON", "ORG", "LOCATION"]`).
    pub entity_types: Vec<String>,

    /// Capitalized-pattern flag (default `true`).
    pub use_capitalized_patterns: bool,

    /// Title-case-pattern flag (default `true`).
    pub use_title_case_patterns: bool,

    /// Quoted-pattern flag (default `true`).
    pub use_quoted_patterns: bool,

    /// Abbreviation flag (default `true`).
    pub use_abbreviations: bool,

    /// Contextual-disambiguation flag (default `true`).
    pub use_contextual_disambiguation: bool,

    /// Minimum context words (default `5`).
    pub min_context_words: usize,

    /// Minimum confidence (default `0.7`).
    pub min_confidence: f32,

    /// Positional-boost flag (default `true`).
    pub use_positional_boost: bool,

    /// Frequency-boost flag (default `true`).
    pub use_frequency_boost: bool,
}
383
384impl Default for NERExtractionConfig {
385 fn default() -> Self {
386 Self {
387 entity_types: vec![
388 "PERSON".to_string(),
389 "ORG".to_string(),
390 "LOCATION".to_string(),
391 ],
392 use_capitalized_patterns: true,
393 use_title_case_patterns: true,
394 use_quoted_patterns: true,
395 use_abbreviations: true,
396 use_contextual_disambiguation: true,
397 min_context_words: 5,
398 min_confidence: 0.7,
399 use_positional_boost: true,
400 use_frequency_boost: true,
401 }
402 }
403}
404
/// Keyword extraction settings for E2GraphRAG. Defaults quoted below come from the
/// `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct KeywordExtractionConfig {
    /// Algorithm names (default `["tfidf", "yake"]`).
    pub algorithms: Vec<String>,

    /// Maximum keywords per chunk (default `10`).
    pub max_keywords_per_chunk: usize,

    /// Minimum keyword length (default `3`).
    pub min_keyword_length: usize,

    /// Combine-algorithms flag (default `true`).
    pub combine_algorithms: bool,
}
421
422impl Default for KeywordExtractionConfig {
423 fn default() -> Self {
424 Self {
425 algorithms: vec!["tfidf".to_string(), "yake".to_string()],
426 max_keywords_per_chunk: 10,
427 min_keyword_length: 3,
428 combine_algorithms: true,
429 }
430 }
431}
432
/// Graph construction settings for E2GraphRAG. Defaults quoted below come from the
/// `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct E2GraphConstructionConfig {
    /// Relationship type names (default `["CO_OCCURS_WITH", "RELATED_TO"]`).
    pub relationship_types: Vec<String>,

    /// Minimum relationship score (default `0.5`).
    pub min_relationship_score: f32,

    /// Maximum relationships per entity (default `20`).
    pub max_relationships_per_entity: usize,

    /// Mutual-information flag (default `true`).
    pub use_mutual_information: bool,
}
449
450impl Default for E2GraphConstructionConfig {
451 fn default() -> Self {
452 Self {
453 relationship_types: vec!["CO_OCCURS_WITH".to_string(), "RELATED_TO".to_string()],
454 min_relationship_score: 0.5,
455 max_relationships_per_entity: 20,
456 use_mutual_information: true,
457 }
458 }
459}
460
/// Indexing settings for E2GraphRAG. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct E2IndexingConfig {
    /// Batch size (default `32`).
    pub batch_size: usize,

    /// Parallel-processing flag (default `true`).
    pub enable_parallel_processing: bool,

    /// Concept-vector caching flag (default `true`).
    pub cache_concept_vectors: bool,

    /// Hash-embeddings flag (default `false`).
    pub use_hash_embeddings: bool,
}
477
478impl Default for E2IndexingConfig {
479 fn default() -> Self {
480 Self {
481 batch_size: 32,
482 enable_parallel_processing: true,
483 cache_concept_vectors: true,
484 use_hash_embeddings: false,
485 }
486 }
487}
488
/// Pure-algorithmic approach settings. The hand-written `Default` impl enables it.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PureAlgorithmicConfig {
    /// Whether this approach is enabled (default `true`).
    pub enabled: bool,
    /// Pattern-extraction settings.
    pub pattern_extraction: PatternExtractionConfig,
    /// Keyword-extraction settings.
    pub keyword_extraction: PureKeywordExtractionConfig,
    /// Relationship-discovery settings.
    pub relationship_discovery: RelationshipDiscoveryConfig,
    /// Search-ranking settings.
    pub search_ranking: SearchRankingConfig,
}
507
/// Pattern lists for pattern-based extraction. The defaults are regex-style strings;
/// how they are compiled or applied is not visible in this file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PatternExtractionConfig {
    /// Capitalized patterns (default `[A-Z][a-z]+`).
    pub capitalized_patterns: Vec<String>,
    /// Technical patterns (default `[a-z]+-[a-z]+`).
    pub technical_patterns: Vec<String>,
    /// Context patterns (default `\b(the|this)\s+(\w+)`).
    pub context_patterns: Vec<String>,
}
521
/// Keyword extraction settings for the pure-algorithmic approach. Defaults quoted below
/// come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct PureKeywordExtractionConfig {
    /// Algorithm name (default `"tf_idf"`).
    pub algorithm: String,
    /// Maximum keywords (default `20`).
    pub max_keywords: usize,
    /// Minimum word length (default `4`).
    pub min_word_length: usize,
    /// Positional-boost flag (default `true`).
    pub use_positional_boost: bool,
    /// Frequency-filter flag (default `true`).
    pub use_frequency_filter: bool,
    /// Minimum term frequency (default `2`).
    pub min_term_frequency: usize,
    /// Maximum term-frequency ratio (default `0.8`).
    pub max_term_frequency_ratio: f32,
}
543
/// Relationship discovery settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RelationshipDiscoveryConfig {
    /// Window size (default `30`).
    pub window_size: usize,
    /// Minimum co-occurrence count (default `2`).
    pub min_co_occurrence: usize,
    /// Mutual-information flag (default `true`).
    pub use_mutual_information: bool,
    /// Relationship type names (default `["co_occurs_with"]`).
    pub relationship_types: Vec<String>,
    /// Scoring method name (default `"jaccard_similarity"`).
    pub scoring_method: String,
    /// Minimum similarity score (default `0.1`).
    pub min_similarity_score: f32,
}
563
/// Search-ranking settings, one section per ranking signal plus their fusion.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SearchRankingConfig {
    /// Vector-search settings (disabled in the `Default` impl).
    pub vector_search: VectorSearchConfig,
    /// Keyword-search settings.
    pub keyword_search: KeywordSearchConfig,
    /// Graph-traversal settings.
    pub graph_traversal: GraphTraversalConfig,
    /// Hybrid-fusion settings.
    pub hybrid_fusion: HybridFusionConfig,
}
579
/// Vector-search toggle. `Default` is derived, so `enabled` defaults to `false`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct VectorSearchConfig {
    /// Whether vector search is enabled.
    pub enabled: bool,
}
589
/// Keyword-search settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct KeywordSearchConfig {
    /// Whether keyword search is enabled (default `true`).
    pub enabled: bool,
    /// Algorithm name (default `"bm25"`).
    pub algorithm: String,
    /// `k1` parameter (default `1.2`).
    pub k1: f32,
    /// `b` parameter (default `0.75`).
    pub b: f32,
}
605
/// Graph-traversal ranking settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphTraversalConfig {
    /// Whether graph traversal is enabled (default `true`).
    pub enabled: bool,
    /// Algorithm name (default `"pagerank"`).
    pub algorithm: String,
    /// Damping factor (default `0.85`).
    pub damping_factor: f32,
    /// Maximum iterations (default `20`).
    pub max_iterations: usize,
    /// Personalized flag (default `true`).
    pub personalized: bool,
}
623
/// Hybrid-fusion settings: a toggle plus the per-signal weights.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HybridFusionConfig {
    /// Whether fusion is enabled (default `true`).
    pub enabled: bool,
    /// Fusion weights.
    pub weights: FusionWeights,
}
635
/// Per-signal fusion weights. The default values (`0.4`, `0.4`, `0.2`) sum to `1.0`;
/// nothing in this file enforces that for user-supplied values.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FusionWeights {
    /// Keywords weight (default `0.4`).
    pub keywords: f32,
    /// Graph weight (default `0.4`).
    pub graph: f32,
    /// BM25 weight (default `0.2`).
    pub bm25: f32,
}
649
/// Hybrid-strategy settings, one section per strategy.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HybridStrategyConfig {
    /// Lazy-algorithmic strategy settings.
    pub lazy_algorithmic: LazyAlgorithmicConfig,
    /// Progressive strategy settings.
    pub progressive: ProgressiveConfig,
    /// Budget-aware strategy settings.
    pub budget_aware: BudgetAwareConfig,
}
663
/// Lazy-algorithmic strategy settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LazyAlgorithmicConfig {
    /// Indexing approach name (default `"e2_graphrag"`).
    pub indexing_approach: String,
    /// Query approach name (default `"lazy_graphrag"`).
    pub query_approach: String,
    /// Cost-optimization setting (default `"indexing"`).
    pub cost_optimization: String,
}
677
/// Progressive strategy settings: an approach name per level. What a "level" measures is
/// not visible in this file.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ProgressiveConfig {
    /// Approach for level 0 (default `"pure_algorithmic"`).
    pub level_0: String,
    /// Approach for level 1 (default `"pure_algorithmic"`).
    pub level_1: String,
    /// Approach for level 2 (default `"e2_graphrag"`).
    pub level_2: String,
    /// Approach for level 3 (default `"lazy_graphrag"`).
    pub level_3: String,
    /// Approach for level 4 and above (default `"lazy_graphrag"`).
    pub level_4_plus: String,
}
695
/// Budget-aware strategy settings. Defaults quoted below come from the `Default` impl.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BudgetAwareConfig {
    /// Daily budget in USD (default `1.0`).
    pub daily_budget_usd: f64,
    /// Queries per day (default `1000`).
    pub queries_per_day: usize,
    /// Maximum LLM cost per query (default `0.002`).
    pub max_llm_cost_per_query: f64,
    /// Strategy name (default `"lazy_graphrag"`).
    pub strategy: String,
    /// Fallback-to-algorithmic flag (default `true`).
    pub fallback_to_algorithmic: bool,
}
713
/// Serde fallback for `ZeroCostApproachConfig::approach`: `"pure_algorithmic"`.
fn default_zero_cost_approach() -> String {
    String::from("pure_algorithmic")
}
718
719impl Default for ZeroCostApproachConfig {
720 fn default() -> Self {
721 Self {
722 approach: default_zero_cost_approach(),
723 lazy_graphrag: LazyGraphRAGConfig::default(),
724 e2_graphrag: E2GraphRAGConfig::default(),
725 pure_algorithmic: PureAlgorithmicConfig::default(),
726 hybrid_strategy: HybridStrategyConfig::default(),
727 }
728 }
729}
730
731impl Default for ConceptExtractionConfig {
733 fn default() -> Self {
734 Self {
735 min_concept_length: 3,
736 max_concept_words: 5,
737 use_noun_phrases: true,
738 use_capitalization: true,
739 use_title_case: true,
740 use_tf_idf_scoring: true,
741 min_term_frequency: 2,
742 max_concepts_per_chunk: 10,
743 min_concept_score: 0.1,
744 exclude_stopwords: true,
745 custom_stopwords: vec!["the".to_string(), "and".to_string(), "or".to_string()],
746 }
747 }
748}
749impl Default for CoOccurrenceConfig {
750 fn default() -> Self {
751 Self {
752 window_size: 50,
753 min_co_occurrence: 2,
754 jaccard_threshold: 0.2,
755 max_edges_per_node: 25,
756 }
757 }
758}
759impl Default for LazyIndexingConfig {
760 fn default() -> Self {
761 Self {
762 use_bidirectional_index: true,
763 enable_hnsw_index: false,
764 cache_size: 10000,
765 }
766 }
767}
768impl Default for LazyQueryExpansionConfig {
769 fn default() -> Self {
770 Self {
771 enabled: true,
772 max_expansions: 3,
773 expansion_model: "llama3.1:8b".to_string(),
774 expansion_temperature: 0.1,
775 max_tokens_per_expansion: 50,
776 }
777 }
778}
779impl Default for LazyRelevanceScoringConfig {
780 fn default() -> Self {
781 Self {
782 enabled: true,
783 scoring_model: "llama3.1:8b".to_string(),
784 batch_size: 10,
785 temperature: 0.2,
786 max_tokens_per_score: 30,
787 }
788 }
789}
790impl Default for PureAlgorithmicConfig {
791 fn default() -> Self {
792 Self {
793 enabled: true,
794 pattern_extraction: Default::default(),
795 keyword_extraction: Default::default(),
796 relationship_discovery: Default::default(),
797 search_ranking: Default::default(),
798 }
799 }
800}
801impl Default for PatternExtractionConfig {
802 fn default() -> Self {
803 Self {
804 capitalized_patterns: vec![r"[A-Z][a-z]+".to_string()],
805 technical_patterns: vec![r"[a-z]+-[a-z]+".to_string()],
806 context_patterns: vec![r"\b(the|this)\s+(\w+)".to_string()],
807 }
808 }
809}
810impl Default for PureKeywordExtractionConfig {
811 fn default() -> Self {
812 Self {
813 algorithm: "tf_idf".to_string(),
814 max_keywords: 20,
815 min_word_length: 4,
816 use_positional_boost: true,
817 use_frequency_filter: true,
818 min_term_frequency: 2,
819 max_term_frequency_ratio: 0.8,
820 }
821 }
822}
823impl Default for RelationshipDiscoveryConfig {
824 fn default() -> Self {
825 Self {
826 window_size: 30,
827 min_co_occurrence: 2,
828 use_mutual_information: true,
829 relationship_types: vec!["co_occurs_with".to_string()],
830 scoring_method: "jaccard_similarity".to_string(),
831 min_similarity_score: 0.1,
832 }
833 }
834}
835impl Default for SearchRankingConfig {
836 fn default() -> Self {
837 Self {
838 vector_search: VectorSearchConfig { enabled: false },
839 keyword_search: KeywordSearchConfig {
840 enabled: true,
841 algorithm: "bm25".to_string(),
842 k1: 1.2,
843 b: 0.75,
844 },
845 graph_traversal: GraphTraversalConfig {
846 enabled: true,
847 algorithm: "pagerank".to_string(),
848 damping_factor: 0.85,
849 max_iterations: 20,
850 personalized: true,
851 },
852 hybrid_fusion: HybridFusionConfig {
853 enabled: true,
854 weights: FusionWeights {
855 keywords: 0.4,
856 graph: 0.4,
857 bm25: 0.2,
858 },
859 },
860 }
861 }
862}
863impl Default for HybridStrategyConfig {
864 fn default() -> Self {
865 Self {
866 lazy_algorithmic: LazyAlgorithmicConfig {
867 indexing_approach: "e2_graphrag".to_string(),
868 query_approach: "lazy_graphrag".to_string(),
869 cost_optimization: "indexing".to_string(),
870 },
871 progressive: ProgressiveConfig {
872 level_0: "pure_algorithmic".to_string(),
873 level_1: "pure_algorithmic".to_string(),
874 level_2: "e2_graphrag".to_string(),
875 level_3: "lazy_graphrag".to_string(),
876 level_4_plus: "lazy_graphrag".to_string(),
877 },
878 budget_aware: BudgetAwareConfig {
879 daily_budget_usd: 1.0,
880 queries_per_day: 1000,
881 max_llm_cost_per_query: 0.002,
882 strategy: "lazy_graphrag".to_string(),
883 fallback_to_algorithmic: true,
884 },
885 }
886 }
887}
888impl Default for KeywordSearchConfig {
889 fn default() -> Self {
890 Self {
891 enabled: true,
892 algorithm: "bm25".to_string(),
893 k1: 1.2,
894 b: 0.75,
895 }
896 }
897}
898impl Default for GraphTraversalConfig {
899 fn default() -> Self {
900 Self {
901 enabled: true,
902 algorithm: "pagerank".to_string(),
903 damping_factor: 0.85,
904 max_iterations: 20,
905 personalized: true,
906 }
907 }
908}
909impl Default for HybridFusionConfig {
910 fn default() -> Self {
911 Self {
912 enabled: true,
913 weights: FusionWeights {
914 keywords: 0.4,
915 graph: 0.4,
916 bm25: 0.2,
917 },
918 }
919 }
920}
921impl Default for FusionWeights {
922 fn default() -> Self {
923 Self {
924 keywords: 0.4,
925 graph: 0.4,
926 bm25: 0.2,
927 }
928 }
929}
930impl Default for LazyAlgorithmicConfig {
931 fn default() -> Self {
932 Self {
933 indexing_approach: "e2_graphrag".to_string(),
934 query_approach: "lazy_graphrag".to_string(),
935 cost_optimization: "indexing".to_string(),
936 }
937 }
938}
939impl Default for ProgressiveConfig {
940 fn default() -> Self {
941 Self {
942 level_0: "pure_algorithmic".to_string(),
943 level_1: "pure_algorithmic".to_string(),
944 level_2: "e2_graphrag".to_string(),
945 level_3: "lazy_graphrag".to_string(),
946 level_4_plus: "lazy_graphrag".to_string(),
947 }
948 }
949}
950impl Default for BudgetAwareConfig {
951 fn default() -> Self {
952 Self {
953 daily_budget_usd: 1.0,
954 queries_per_day: 1000,
955 max_llm_cost_per_query: 0.002,
956 strategy: "lazy_graphrag".to_string(),
957 fallback_to_algorithmic: true,
958 }
959 }
960}
961
/// Embedding settings.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EmbeddingConfig {
    /// Embedding dimension (`384` in `Config::default()`).
    pub dimension: usize,

    /// Backend name (`"hash"` in `Config::default()`; `Config::with_ollama` sets `"ollama"`).
    pub backend: String,

    /// Optional model identifier; `None` when absent from deserialized input.
    #[serde(default)]
    pub model: Option<String>,

    /// Hash-fallback flag (`true` in `Config::default()`).
    pub fallback_to_hash: bool,

    /// Optional API endpoint.
    pub api_endpoint: Option<String>,

    /// Optional API key.
    pub api_key: Option<String>,

    /// Optional cache directory; `None` when absent.
    #[serde(default)]
    pub cache_dir: Option<String>,

    /// Batch size; falls back to `32` when absent.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
1001
/// Serde fallback for `EmbeddingConfig::batch_size`: `32`.
fn default_batch_size() -> usize {
    const BATCH_SIZE: usize = 32;
    BATCH_SIZE
}
1005
/// Graph settings.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct GraphConfig {
    /// Maximum connections (`10` in `Config::default()`).
    pub max_connections: usize,

    /// Similarity threshold (`0.8` in `Config::default()`).
    pub similarity_threshold: f32,

    /// Relationship-extraction flag; falls back to `true` when absent.
    #[serde(default = "default_true")]
    pub extract_relationships: bool,

    /// Relationship confidence threshold; falls back to `0.5` when absent.
    #[serde(default = "default_relationship_confidence")]
    pub relationship_confidence_threshold: f32,

    /// Traversal parameters; uses `TraversalConfigParams::default()` when absent.
    #[serde(default)]
    pub traversal: TraversalConfigParams,
}
1027
/// Graph traversal parameters. Every field has a serde fallback.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TraversalConfigParams {
    /// Maximum depth; falls back to `3` when absent.
    #[serde(default = "default_max_traversal_depth")]
    pub max_depth: usize,

    /// Maximum paths; falls back to `10` when absent.
    #[serde(default = "default_max_paths")]
    pub max_paths: usize,

    /// Edge-weight flag; falls back to `true` when absent.
    #[serde(default = "default_true")]
    pub use_edge_weights: bool,

    /// Minimum relationship strength; falls back to `0.3` when absent.
    #[serde(default = "default_min_relationship_strength")]
    pub min_relationship_strength: f32,
}
1047
1048impl Default for TraversalConfigParams {
1049 fn default() -> Self {
1050 Self {
1051 max_depth: default_max_traversal_depth(),
1052 max_paths: default_max_paths(),
1053 use_edge_weights: true,
1054 min_relationship_strength: default_min_relationship_strength(),
1055 }
1056 }
1057}
1058
/// Text-processing settings.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct TextConfig {
    /// Chunk size (`1000` in `Config::default()`).
    pub chunk_size: usize,

    /// Chunk overlap (`200` in `Config::default()`).
    pub chunk_overlap: usize,

    /// Language codes (`["en"]` in `Config::default()`).
    pub languages: Vec<String>,
}
1071
/// Entity-extraction settings.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct EntityConfig {
    /// Minimum confidence (`0.7` in `Config::default()`).
    pub min_confidence: f32,

    /// Entity type labels (`["PERSON", "ORG", "LOCATION"]` in `Config::default()`).
    pub entity_types: Vec<String>,

    /// Gleaning flag; `false` when absent from deserialized input.
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds; falls back to `3` when absent.
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Triple-reflection flag; `false` when absent.
    #[serde(default)]
    pub enable_triple_reflection: bool,

    /// Validation minimum confidence; falls back to `0.7` when absent.
    #[serde(default = "default_validation_confidence")]
    pub validation_min_confidence: f32,

    /// Atomic-facts flag; `false` when absent.
    #[serde(default)]
    pub use_atomic_facts: bool,

    /// Maximum fact tokens; falls back to `400` when absent.
    #[serde(default = "default_max_fact_tokens")]
    pub max_fact_tokens: usize,
}
1109
/// Advanced feature settings. `Default` is derived and every section has a serde default,
/// so the whole struct may be omitted or partially specified.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
pub struct AdvancedFeaturesConfig {
    /// Symbolic anchoring settings.
    #[serde(default)]
    pub symbolic_anchoring: SymbolicAnchoringConfig,

    /// Dynamic weighting settings.
    #[serde(default)]
    pub dynamic_weighting: DynamicWeightingConfig,

    /// Causal analysis settings.
    #[serde(default)]
    pub causal_analysis: CausalAnalysisConfig,

    /// Hierarchical clustering settings.
    #[serde(default)]
    pub hierarchical_clustering: HierarchicalClusteringConfig,

    /// Weight optimization settings.
    #[serde(default)]
    pub weight_optimization: WeightOptimizationConfig,
}
1138
/// Symbolic anchoring settings. Every field has a serde fallback.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct SymbolicAnchoringConfig {
    /// Minimum relevance; falls back to `0.3` when absent.
    #[serde(default = "default_anchor_min_relevance")]
    pub min_relevance: f32,

    /// Maximum anchors; falls back to `5` when absent.
    #[serde(default = "default_max_anchors")]
    pub max_anchors: usize,

    /// Maximum entities per anchor; falls back to `10` when absent.
    #[serde(default = "default_max_entities_per_anchor")]
    pub max_entities_per_anchor: usize,
}
1154
/// Dynamic weighting toggles. Every flag falls back to `true` when absent.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct DynamicWeightingConfig {
    /// Semantic boost flag.
    #[serde(default = "default_true")]
    pub enable_semantic_boost: bool,

    /// Temporal boost flag.
    #[serde(default = "default_true")]
    pub enable_temporal_boost: bool,

    /// Concept boost flag.
    #[serde(default = "default_true")]
    pub enable_concept_boost: bool,

    /// Causal boost flag.
    #[serde(default = "default_true")]
    pub enable_causal_boost: bool,
}
1174
/// Causal analysis settings. Every field has a serde fallback.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CausalAnalysisConfig {
    /// Minimum confidence; falls back to `0.3` when absent.
    #[serde(default = "default_causal_min_confidence")]
    pub min_confidence: f32,

    /// Minimum causal strength; falls back to `0.5` when absent.
    #[serde(default = "default_causal_min_strength")]
    pub min_causal_strength: f32,

    /// Maximum chain depth; falls back to `5` when absent.
    #[serde(default = "default_max_chain_depth")]
    pub max_chain_depth: usize,

    /// Temporal-consistency flag; falls back to `true` when absent.
    #[serde(default = "default_true")]
    pub require_temporal_consistency: bool,
}
1194
/// Hierarchical clustering settings. Every field has a serde fallback.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct HierarchicalClusteringConfig {
    /// Number of levels; falls back to `3` when absent.
    #[serde(default = "default_num_levels")]
    pub num_levels: usize,

    /// Resolution values; falls back to `[1.0, 0.5, 0.2]` when absent.
    // NOTE(review): the default has one resolution per default level; whether
    // `resolutions.len()` must equal `num_levels` is not enforced here — confirm.
    #[serde(default = "default_resolutions")]
    pub resolutions: Vec<f32>,

    /// Minimum cluster size; falls back to `2` when absent.
    #[serde(default = "default_min_cluster_size")]
    pub min_cluster_size: usize,

    /// Summary-generation flag; falls back to `true` when absent.
    #[serde(default = "default_true")]
    pub generate_summaries: bool,
}
1215
/// Weight optimization settings. Every field has a serde fallback.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct WeightOptimizationConfig {
    /// Learning rate; falls back to `0.1` when absent.
    #[serde(default = "default_learning_rate")]
    pub learning_rate: f32,

    /// Maximum iterations; falls back to `20` when absent.
    #[serde(default = "default_max_iterations")]
    pub max_iterations: usize,

    /// Slope window; falls back to `3` when absent.
    #[serde(default = "default_slope_window")]
    pub slope_window: usize,

    /// Stagnation threshold; falls back to `0.01` when absent.
    #[serde(default = "default_stagnation_threshold")]
    pub stagnation_threshold: f32,

    /// LLM-evaluation flag; falls back to `true` when absent.
    #[serde(default = "default_true")]
    pub use_llm_eval: bool,

    /// Objective weights; uses `ObjectiveWeightsConfig::default()` when absent.
    #[serde(default)]
    pub objective_weights: ObjectiveWeightsConfig,
}
1243
/// Objective weights. The fallback values (`0.4`, `0.4`, `0.2`) sum to `1.0`; nothing in
/// this file enforces that for user-supplied values.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ObjectiveWeightsConfig {
    /// Relevance weight; falls back to `0.4` when absent.
    #[serde(default = "default_relevance_weight")]
    pub relevance: f32,

    /// Faithfulness weight; falls back to `0.4` when absent.
    #[serde(default = "default_faithfulness_weight")]
    pub faithfulness: f32,

    /// Conciseness weight; falls back to `0.2` when absent.
    #[serde(default = "default_conciseness_weight")]
    pub conciseness: f32,
}
1259
/// Retrieval settings.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RetrievalConfig {
    /// Top-k value (`10` in `Config::default()`).
    pub top_k: usize,

    /// Search algorithm name (`"cosine"` in `Config::default()`).
    pub search_algorithm: String,
}
1269
/// Parallel-processing settings. Values quoted below are those used by `Config::default()`.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ParallelConfig {
    /// Thread count (`0` by default).
    // NOTE(review): `0` presumably means "auto-detect"; confirm against the consumer.
    pub num_threads: usize,

    /// Whether parallel processing is enabled (`true` by default).
    pub enabled: bool,

    /// Minimum batch size (`10` by default).
    pub min_batch_size: usize,

    /// Chunk batch size (`100` by default).
    pub chunk_batch_size: usize,

    /// Parallel-embeddings flag (`true` by default).
    pub parallel_embeddings: bool,

    /// Parallel graph-operations flag (`true` by default).
    pub parallel_graph_ops: bool,

    /// Parallel vector-operations flag (`true` by default).
    pub parallel_vector_ops: bool,
}
1294
/// Embedding dimension used by `Config::default()`: `384`.
fn default_embedding_dim() -> usize {
    const EMBEDDING_DIM: usize = 384;
    EMBEDDING_DIM
}
/// Embedding backend used by `Config::default()`: `"hash"`.
fn default_embedding_backend() -> String {
    String::from("hash")
}
/// Graph `max_connections` used by `Config::default()`: `10`.
fn default_max_connections() -> usize {
    const MAX_CONNECTIONS: usize = 10;
    MAX_CONNECTIONS
}
/// Similarity threshold used by `Config::default()` (top level and graph): `0.8`.
fn default_similarity_threshold() -> f32 {
    const THRESHOLD: f32 = 0.8;
    THRESHOLD
}
/// Chunk size used by `Config::default()` (top level and `text`): `1000`.
fn default_chunk_size() -> usize {
    const CHUNK_SIZE: usize = 1000;
    CHUNK_SIZE
}
/// Chunk overlap used by `Config::default()` (top level and `text`): `200`.
fn default_chunk_overlap() -> usize {
    const CHUNK_OVERLAP: usize = 200;
    CHUNK_OVERLAP
}
/// Language list used by `Config::default()`: `["en"]`.
fn default_languages() -> Vec<String> {
    vec![String::from("en")]
}
/// Entity `min_confidence` used by `Config::default()`: `0.7`.
fn default_min_confidence() -> f32 {
    const MIN_CONFIDENCE: f32 = 0.7;
    MIN_CONFIDENCE
}
/// Entity types used by `Config::default()`: `PERSON`, `ORG`, `LOCATION`.
fn default_entity_types() -> Vec<String> {
    ["PERSON", "ORG", "LOCATION"]
        .iter()
        .map(|kind| kind.to_string())
        .collect()
}
/// Top-k used by `Config::default()` (`top_k_results` and `retrieval.top_k`): `10`.
fn default_top_k() -> usize {
    const TOP_K: usize = 10;
    TOP_K
}
/// Search algorithm used by `Config::default()`: `"cosine"`.
fn default_search_algorithm() -> String {
    String::from("cosine")
}
/// Thread count used by `Config::default()`: `0`.
// NOTE(review): `0` presumably means "auto-detect"; confirm against the consumer.
fn default_num_threads() -> usize {
    const NUM_THREADS: usize = 0;
    NUM_THREADS
}

/// Minimum batch size used by `Config::default()`: `10`.
fn default_min_batch_size() -> usize {
    const MIN_BATCH_SIZE: usize = 10;
    MIN_BATCH_SIZE
}
/// Chunk batch size used by `Config::default()`: `100`.
fn default_chunk_batch_size() -> usize {
    const CHUNK_BATCH_SIZE: usize = 100;
    CHUNK_BATCH_SIZE
}
/// Serde fallback for boolean flags that should be on when absent: `true`.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde fallback for `GraphConfig::relationship_confidence_threshold`: `0.5`.
fn default_relationship_confidence() -> f32 {
    const CONFIDENCE: f32 = 0.5;
    CONFIDENCE
}
/// Serde fallback for `EntityConfig::max_gleaning_rounds`: `3`.
fn default_max_gleaning_rounds() -> usize {
    const MAX_ROUNDS: usize = 3;
    MAX_ROUNDS
}
1351
/// Serde fallback for `EntityConfig::validation_min_confidence`: `0.7`.
fn default_validation_confidence() -> f32 {
    const CONFIDENCE: f32 = 0.7;
    CONFIDENCE
}
1355
/// Serde fallback for `SymbolicAnchoringConfig::min_relevance`: `0.3`.
fn default_anchor_min_relevance() -> f32 {
    const MIN_RELEVANCE: f32 = 0.3;
    MIN_RELEVANCE
}
1362
/// Serde fallback for `SymbolicAnchoringConfig::max_anchors`: `5`.
fn default_max_anchors() -> usize {
    const MAX_ANCHORS: usize = 5;
    MAX_ANCHORS
}
1366
/// Serde fallback for `SymbolicAnchoringConfig::max_entities_per_anchor`: `10`.
fn default_max_entities_per_anchor() -> usize {
    const MAX_ENTITIES: usize = 10;
    MAX_ENTITIES
}
1370
/// Serde fallback for `CausalAnalysisConfig::min_confidence`: `0.3`.
fn default_causal_min_confidence() -> f32 {
    const MIN_CONFIDENCE: f32 = 0.3;
    MIN_CONFIDENCE
}
1375
/// Serde fallback for `CausalAnalysisConfig::min_causal_strength`: `0.5`.
fn default_causal_min_strength() -> f32 {
    const MIN_STRENGTH: f32 = 0.5;
    MIN_STRENGTH
}
1379
/// Serde fallback for `CausalAnalysisConfig::max_chain_depth`: `5`.
fn default_max_chain_depth() -> usize {
    const MAX_DEPTH: usize = 5;
    MAX_DEPTH
}
1383
/// Serde fallback for `HierarchicalClusteringConfig::num_levels`: `3`.
fn default_num_levels() -> usize {
    const NUM_LEVELS: usize = 3;
    NUM_LEVELS
}
1388
/// Serde fallback for `HierarchicalClusteringConfig::resolutions`: `[1.0, 0.5, 0.2]`.
fn default_resolutions() -> Vec<f32> {
    const RESOLUTIONS: [f32; 3] = [1.0, 0.5, 0.2];
    RESOLUTIONS.to_vec()
}
1392
/// Serde fallback for `HierarchicalClusteringConfig::min_cluster_size`: `2`.
fn default_min_cluster_size() -> usize {
    const MIN_CLUSTER_SIZE: usize = 2;
    MIN_CLUSTER_SIZE
}
1396
/// Serde fallback for `WeightOptimizationConfig::learning_rate`: `0.1`.
fn default_learning_rate() -> f32 {
    const LEARNING_RATE: f32 = 0.1;
    LEARNING_RATE
}
1401
/// Serde fallback for `WeightOptimizationConfig::max_iterations`: `20`.
fn default_max_iterations() -> usize {
    const MAX_ITERATIONS: usize = 20;
    MAX_ITERATIONS
}
1405
/// Serde fallback for `WeightOptimizationConfig::slope_window`: `3`.
fn default_slope_window() -> usize {
    const SLOPE_WINDOW: usize = 3;
    SLOPE_WINDOW
}
1409
/// Serde fallback for `WeightOptimizationConfig::stagnation_threshold`: `0.01`.
fn default_stagnation_threshold() -> f32 {
    const STAGNATION_THRESHOLD: f32 = 0.01;
    STAGNATION_THRESHOLD
}
1413
/// Serde fallback for `ObjectiveWeightsConfig::relevance`: `0.4`.
fn default_relevance_weight() -> f32 {
    const WEIGHT: f32 = 0.4;
    WEIGHT
}
1417
/// Serde fallback for `ObjectiveWeightsConfig::faithfulness`: `0.4`.
fn default_faithfulness_weight() -> f32 {
    const WEIGHT: f32 = 0.4;
    WEIGHT
}
1421
/// Serde fallback for `ObjectiveWeightsConfig::conciseness`: `0.2`.
fn default_conciseness_weight() -> f32 {
    const WEIGHT: f32 = 0.2;
    WEIGHT
}
1425
/// Serde fallback for `EntityConfig::max_fact_tokens`: `400`.
fn default_max_fact_tokens() -> usize {
    const MAX_FACT_TOKENS: usize = 400;
    MAX_FACT_TOKENS
}
1429
/// Serde fallback for `Config::approach`: `"semantic"`.
fn default_approach() -> String {
    String::from("semantic")
}
/// Serde fallback for `TraversalConfigParams::max_depth`: `3`.
fn default_max_traversal_depth() -> usize {
    const MAX_DEPTH: usize = 3;
    MAX_DEPTH
}
/// Serde fallback for `TraversalConfigParams::max_paths`: `10`.
fn default_max_paths() -> usize {
    const MAX_PATHS: usize = 10;
    MAX_PATHS
}
/// Serde fallback for `TraversalConfigParams::min_relationship_strength`: `0.3`.
fn default_min_relationship_strength() -> f32 {
    const MIN_STRENGTH: f32 = 0.3;
    MIN_STRENGTH
}
/// Serde fallback for `AutoSaveConfig::interval_seconds`: `300`.
fn default_auto_save_interval() -> u64 {
    const INTERVAL_SECONDS: u64 = 300;
    INTERVAL_SECONDS
}
/// Serde fallback for `AutoSaveConfig::max_versions`: `5`.
fn default_max_versions() -> usize {
    const MAX_VERSIONS: usize = 5;
    MAX_VERSIONS
}
1448
1449impl Default for Config {
1450 fn default() -> Self {
1451 Self {
1452 output_dir: "./output".to_string(),
1453 chunk_size: default_chunk_size(),
1454 chunk_overlap: default_chunk_overlap(),
1455 max_entities_per_chunk: Some(10),
1456 top_k_results: Some(default_top_k()),
1457 similarity_threshold: Some(default_similarity_threshold()),
1458 approach: default_approach(),
1459 embeddings: EmbeddingConfig {
1460 dimension: default_embedding_dim(),
1461 backend: default_embedding_backend(),
1462 model: Some("sentence-transformers/all-MiniLM-L6-v2".to_string()),
1463 fallback_to_hash: true,
1464 api_endpoint: None,
1465 api_key: None,
1466 cache_dir: None,
1467 batch_size: default_batch_size(),
1468 },
1469 graph: GraphConfig {
1470 max_connections: default_max_connections(),
1471 similarity_threshold: default_similarity_threshold(),
1472 extract_relationships: default_true(),
1473 relationship_confidence_threshold: default_relationship_confidence(),
1474 traversal: TraversalConfigParams::default(),
1475 },
1476 text: TextConfig {
1477 chunk_size: default_chunk_size(),
1478 chunk_overlap: default_chunk_overlap(),
1479 languages: default_languages(),
1480 },
1481 entities: EntityConfig {
1482 min_confidence: default_min_confidence(),
1483 entity_types: default_entity_types(),
1484 use_gleaning: false,
1485 max_gleaning_rounds: default_max_gleaning_rounds(),
1486 enable_triple_reflection: false,
1487 validation_min_confidence: default_validation_confidence(),
1488 use_atomic_facts: false,
1489 max_fact_tokens: default_max_fact_tokens(),
1490 },
1491 retrieval: RetrievalConfig {
1492 top_k: default_top_k(),
1493 search_algorithm: default_search_algorithm(),
1494 },
1495 parallel: ParallelConfig {
1496 num_threads: default_num_threads(),
1497 enabled: true,
1498 min_batch_size: default_min_batch_size(),
1499 chunk_batch_size: default_chunk_batch_size(),
1500 parallel_embeddings: true,
1501 parallel_graph_ops: true,
1502 parallel_vector_ops: true,
1503 },
1504 ollama: crate::ollama::OllamaConfig::default(),
1505 gliner: GlinerConfig::default(),
1506 enhancements: enhancements::EnhancementsConfig::default(),
1507 auto_save: AutoSaveConfig {
1508 enabled: false,
1509 base_dir: None,
1510 interval_seconds: default_auto_save_interval(),
1511 workspace_name: None,
1512 max_versions: default_max_versions(),
1513 },
1514 summarization: crate::summarization::HierarchicalConfig::default(),
1515 zero_cost_approach: ZeroCostApproachConfig::default(),
1516 advanced_features: AdvancedFeaturesConfig::default(),
1517 suppress_progress_bars: false,
1518 }
1519 }
1520}
1521
1522impl Default for AutoSaveConfig {
1523 fn default() -> Self {
1524 Self {
1525 enabled: false,
1526 base_dir: None,
1527 interval_seconds: default_auto_save_interval(),
1528 workspace_name: None,
1529 max_versions: default_max_versions(),
1530 }
1531 }
1532}
1533
1534impl Config {
1535 pub fn quick(workspace: impl AsRef<std::path::Path>) -> Self {
1540 let ws = workspace.as_ref();
1541 let ws_str = ws.to_string_lossy().into_owned();
1542 let (base, name) = match (ws.parent(), ws.file_name()) {
1543 (Some(p), Some(f)) if !p.as_os_str().is_empty() => (
1544 p.to_string_lossy().into_owned(),
1545 f.to_string_lossy().into_owned(),
1546 ),
1547 _ => (".".to_string(), ws_str.clone()),
1548 };
1549 Self {
1550 output_dir: ws_str,
1551 auto_save: AutoSaveConfig {
1552 enabled: true,
1553 base_dir: Some(base),
1554 workspace_name: Some(name),
1555 ..AutoSaveConfig::default()
1556 },
1557 ..Self::default()
1558 }
1559 }
1560
1561 pub fn with_ollama(mut self) -> Self {
1563 self.ollama.enabled = true;
1564 self.embeddings.backend = "ollama".to_string();
1565 self
1566 }
1567
1568 pub fn with_ollama_host(mut self, host: impl Into<String>) -> Self {
1570 self.ollama.host = host.into();
1571 self.ollama.enabled = true;
1572 self
1573 }
1574
1575 pub fn with_chunk_size(mut self, size: usize) -> Self {
1577 self.chunk_size = size;
1578 self.chunk_overlap = size / 5;
1579 self.text.chunk_size = size;
1580 self.text.chunk_overlap = size / 5;
1581 self
1582 }
1583}
1584
1585impl Default for SymbolicAnchoringConfig {
1586 fn default() -> Self {
1587 Self {
1588 min_relevance: default_anchor_min_relevance(),
1589 max_anchors: default_max_anchors(),
1590 max_entities_per_anchor: default_max_entities_per_anchor(),
1591 }
1592 }
1593}
1594
1595impl Default for DynamicWeightingConfig {
1596 fn default() -> Self {
1597 Self {
1598 enable_semantic_boost: default_true(),
1599 enable_temporal_boost: default_true(),
1600 enable_concept_boost: default_true(),
1601 enable_causal_boost: default_true(),
1602 }
1603 }
1604}
1605
1606impl Default for CausalAnalysisConfig {
1607 fn default() -> Self {
1608 Self {
1609 min_confidence: default_causal_min_confidence(),
1610 min_causal_strength: default_causal_min_strength(),
1611 max_chain_depth: default_max_chain_depth(),
1612 require_temporal_consistency: default_true(),
1613 }
1614 }
1615}
1616
1617impl Default for HierarchicalClusteringConfig {
1618 fn default() -> Self {
1619 Self {
1620 num_levels: default_num_levels(),
1621 resolutions: default_resolutions(),
1622 min_cluster_size: default_min_cluster_size(),
1623 generate_summaries: default_true(),
1624 }
1625 }
1626}
1627
1628impl Default for WeightOptimizationConfig {
1629 fn default() -> Self {
1630 Self {
1631 learning_rate: default_learning_rate(),
1632 max_iterations: default_max_iterations(),
1633 slope_window: default_slope_window(),
1634 stagnation_threshold: default_stagnation_threshold(),
1635 use_llm_eval: default_true(),
1636 objective_weights: ObjectiveWeightsConfig::default(),
1637 }
1638 }
1639}
1640
1641impl Default for ObjectiveWeightsConfig {
1642 fn default() -> Self {
1643 Self {
1644 relevance: default_relevance_weight(),
1645 faithfulness: default_faithfulness_weight(),
1646 conciseness: default_conciseness_weight(),
1647 }
1648 }
1649}
1650
1651impl Config {
1652 #[cfg(feature = "hierarchical-config")]
1669 pub fn load() -> Result<Self> {
1670 use figment::{
1671 providers::{Env, Format, Serialized, Toml},
1672 Figment,
1673 };
1674
1675 let mut figment = Figment::new()
1677 .merge(Serialized::defaults(Config::default()));
1679
1680 if let Some(home) = dirs::home_dir() {
1682 let user_config = home.join(".graphrag").join("config.toml");
1683 if user_config.exists() {
1684 figment = figment.merge(Toml::file(user_config));
1685 }
1686 }
1687
1688 let project_config = std::path::Path::new("graphrag.toml");
1690 if project_config.exists() {
1691 figment = figment.merge(Toml::file(project_config));
1692 }
1693
1694 figment = figment.merge(Env::prefixed("GRAPHRAG_").split("_"));
1697
1698 figment
1699 .extract()
1700 .map_err(|e| crate::core::GraphRAGError::Config {
1701 message: format!("Failed to load hierarchical configuration: {}", e),
1702 })
1703 }
1704
1705 #[cfg(not(feature = "hierarchical-config"))]
1709 pub fn load() -> Result<Self> {
1710 Ok(Config::default())
1711 }
1712
1713 pub fn from_toml_file<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
1726 let content = fs::read_to_string(path.as_ref())?;
1727 let config: Config =
1728 toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1729 message: format!("Failed to parse TOML config: {}", e),
1730 })?;
1731 Ok(config)
1732 }
1733}