1use crate::Result;
5use serde::{Deserialize, Serialize};
6use std::fs;
7use std::path::Path;
8
/// Root configuration object that groups every configuration section.
///
/// Every field carries `#[serde(default)]`, so any section may be left out of
/// the input: the optional pipeline sections then deserialize to `None`, and
/// all other sections to their `Default` value.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SetConfig {
    /// Pipeline approach selection (see [`ModeConfig`]).
    #[serde(default)]
    pub mode: ModeConfig,

    /// Semantic pipeline settings; `None` when the section is absent.
    #[serde(default)]
    pub semantic: Option<SemanticPipelineConfig>,

    /// Algorithmic pipeline settings; `None` when the section is absent.
    #[serde(default)]
    pub algorithmic: Option<AlgorithmicPipelineConfig>,

    /// Hybrid pipeline settings; `None` when the section is absent.
    #[serde(default)]
    pub hybrid: Option<HybridPipelineConfig>,

    /// General settings (logging, output directory, threading, profiling).
    #[serde(default)]
    pub general: GeneralConfig,

    /// Workflow list and per-stage pipeline settings.
    #[serde(default)]
    pub pipeline: PipelineConfig,

    /// Storage backend settings.
    #[serde(default)]
    pub storage: StorageConfig,

    /// Model selection settings.
    #[serde(default)]
    pub models: ModelsConfig,

    /// Batching, worker-thread and memory-limit settings.
    #[serde(default)]
    pub performance: PerformanceConfig,

    /// Ollama connection and model settings.
    #[serde(default)]
    pub ollama: OllamaSetConfig,

    /// Experimental feature switches.
    #[serde(default)]
    pub experimental: ExperimentalConfig,

    /// Top-level entity-extraction settings (gleaning, merging, linking).
    #[serde(default)]
    pub entity_extraction: EntityExtractionTopLevelConfig,

    /// Auto-save settings.
    #[serde(default)]
    pub auto_save: AutoSaveSetConfig,
}
64
/// Auto-save settings.
///
/// Every field has a serde default, so the section may be partially specified.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoSaveSetConfig {
    /// Whether auto-save is enabled; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,

    /// Auto-save interval in seconds; 300 when omitted.
    #[serde(default = "default_auto_save_interval")]
    pub interval_seconds: u64,

    /// Optional workspace name; `None` when omitted.
    #[serde(default)]
    pub workspace_name: Option<String>,

    /// Maximum number of auto-save versions; 5 when omitted.
    #[serde(default = "default_max_auto_save_versions")]
    pub max_versions: usize,
}
84
85impl Default for AutoSaveSetConfig {
86 fn default() -> Self {
87 Self {
88 enabled: false,
89 interval_seconds: default_auto_save_interval(),
90 workspace_name: None,
91 max_versions: default_max_auto_save_versions(),
92 }
93 }
94}
95
/// General settings: logging, output location, input document, threading and
/// profiling.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneralConfig {
    /// Log level name; `"info"` when omitted.
    #[serde(default = "default_log_level")]
    pub log_level: String,

    /// Output directory; `"./output"` when omitted.
    #[serde(default = "default_output_dir")]
    pub output_dir: String,

    /// Optional path of the input document; `None` when omitted.
    #[serde(default)]
    pub input_document_path: Option<String>,

    /// Optional thread limit; `None` when omitted.
    #[serde(default)]
    pub max_threads: Option<usize>,

    /// Profiling switch; `false` when omitted.
    #[serde(default)]
    pub enable_profiling: bool,
}
119
/// Pipeline settings: the workflow list plus one sub-section per stage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Workflow step names; when omitted the list is `extract_text`,
    /// `extract_entities`, `build_graph`, `detect_communities`.
    #[serde(default = "default_workflows")]
    pub workflows: Vec<String>,

    /// Parallel-execution switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub parallel_execution: bool,

    /// Text extraction (chunking / cleaning) settings.
    #[serde(default)]
    pub text_extraction: TextExtractionConfig,

    /// Entity extraction settings for this pipeline.
    #[serde(default)]
    pub entity_extraction: EntityExtractionConfig,

    /// Graph building settings.
    #[serde(default)]
    pub graph_building: GraphBuildingConfig,

    /// Community detection settings.
    #[serde(default)]
    pub community_detection: CommunityDetectionConfig,
}
147
/// Text extraction settings (chunking and cleaning).
///
/// NOTE(review): the unit of the size fields (characters vs. tokens) is not
/// visible in this file — confirm against the chunker.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextExtractionConfig {
    /// Chunk size; 512 when omitted.
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,

    /// Overlap between chunks; 64 when omitted.
    #[serde(default = "default_chunk_overlap")]
    pub chunk_overlap: usize,

    /// Control-character cleaning switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub clean_control_chars: bool,

    /// Minimum chunk size; 50 when omitted.
    #[serde(default = "default_min_chunk_size")]
    pub min_chunk_size: usize,

    /// Optional additional cleaning options; `None` when omitted.
    #[serde(default)]
    pub cleaning: Option<CleaningConfig>,
}
171
/// Optional text-cleaning switches.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CleaningConfig {
    /// URL removal switch; `false` when omitted.
    #[serde(default)]
    pub remove_urls: bool,

    /// E-mail removal switch; `false` when omitted.
    #[serde(default)]
    pub remove_emails: bool,

    /// Whitespace normalization switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub normalize_whitespace: bool,

    /// Special-character removal switch; `false` when omitted.
    #[serde(default)]
    pub remove_special_chars: bool,
}
191
/// Entity extraction settings of the pipeline stage.
///
/// The `Option` fields without an explicit attribute still deserialize to
/// `None` when missing, because serde's derive treats absent `Option` fields
/// that way.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionConfig {
    /// Model name; `"microsoft/DialoGPT-medium"` when omitted.
    #[serde(default = "default_ner_model")]
    pub model_name: String,

    /// Sampling temperature; 0.1 when omitted.
    #[serde(default = "default_temperature")]
    pub temperature: f32,

    /// Token limit; 2048 when omitted.
    #[serde(default = "default_max_tokens")]
    pub max_tokens: usize,

    /// Optional list of entity type names.
    pub entity_types: Option<Vec<String>>,

    /// Confidence threshold; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Optional custom prompt text.
    pub custom_prompt: Option<String>,

    /// Optional entity filters; `None` when omitted.
    #[serde(default)]
    pub filters: Option<EntityFiltersConfig>,
}
221
/// Filters applied to extracted entities.
///
/// NOTE(review): the syntax of the pattern lists (regex, glob, ...) is not
/// visible in this file — confirm against the code that consumes them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityFiltersConfig {
    /// Minimum entity length; 3 when omitted.
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Maximum entity length; 100 when omitted.
    #[serde(default = "default_max_entity_length")]
    pub max_entity_length: usize,

    /// Optional list of allowed entity type names.
    pub allowed_entity_types: Option<Vec<String>>,

    /// Confidence threshold; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Optional list of allowed patterns.
    pub allowed_patterns: Option<Vec<String>>,

    /// Optional list of excluded patterns.
    pub excluded_patterns: Option<Vec<String>>,

    /// Fuzzy matching switch; `false` when omitted.
    #[serde(default)]
    pub enable_fuzzy_matching: bool,
}
250
/// Graph building settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphBuildingConfig {
    /// Relation scorer name; `"cosine_similarity"` when omitted.
    #[serde(default = "default_relation_scorer")]
    pub relation_scorer: String,

    /// Minimum relation score; 0.7 when omitted.
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,

    /// Maximum connections per node; 10 when omitted.
    #[serde(default = "default_max_connections")]
    pub max_connections_per_node: usize,

    /// Bidirectional relations switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub bidirectional_relations: bool,
}
270
/// Community detection settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommunityDetectionConfig {
    /// Algorithm name; `"leiden"` when omitted.
    #[serde(default = "default_community_algorithm")]
    pub algorithm: String,

    /// Resolution parameter; 1.0 when omitted.
    #[serde(default = "default_resolution")]
    pub resolution: f32,

    /// Minimum community size; 3 when omitted.
    #[serde(default = "default_min_community_size")]
    pub min_community_size: usize,

    /// Maximum community size; 0 when omitted.
    /// NOTE(review): presumably 0 means "no limit" — confirm against the
    /// consumer of this value.
    #[serde(default)]
    pub max_community_size: usize,
}
290
/// Storage backend settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageConfig {
    /// Database type name; `"sqlite"` when omitted.
    #[serde(default = "default_database_type")]
    pub database_type: String,

    /// Database path; `"./graphrag.db"` when omitted.
    #[serde(default = "default_database_path")]
    pub database_path: String,

    /// WAL switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub enable_wal: bool,

    /// Optional PostgreSQL connection settings.
    pub postgresql: Option<PostgreSQLConfig>,

    /// Optional Neo4j connection settings.
    pub neo4j: Option<Neo4jConfig>,
}
312
313#[derive(Debug, Clone, Serialize, Deserialize)]
315pub struct PostgreSQLConfig {
316 pub host: String,
318 pub port: u16,
320 pub database: String,
322 pub username: String,
324 pub password: String,
326 #[serde(default = "default_pool_size")]
328 pub pool_size: usize,
329}
330
331#[derive(Debug, Clone, Serialize, Deserialize)]
333pub struct Neo4jConfig {
334 pub uri: String,
336 pub username: String,
338 pub password: String,
340 #[serde(default)]
342 pub encrypted: bool,
343}
344
/// Model selection settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelsConfig {
    /// Primary LLM name; `"gpt-4"` when omitted.
    #[serde(default = "default_primary_llm")]
    pub primary_llm: String,

    /// Embedding model name; `"text-embedding-ada-002"` when omitted.
    #[serde(default = "default_embedding_model")]
    pub embedding_model: String,

    /// Maximum context length; 4096 when omitted.
    #[serde(default = "default_max_context")]
    pub max_context_length: usize,

    /// Optional LLM sampling parameters; `None` when omitted.
    #[serde(default)]
    pub llm_params: Option<LLMParamsConfig>,

    /// Optional local-model settings; `None` when omitted.
    #[serde(default)]
    pub local: Option<LocalModelsConfig>,
}
368
/// LLM sampling parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMParamsConfig {
    /// Sampling temperature; 0.1 when omitted.
    #[serde(default = "default_temperature")]
    pub temperature: f32,

    /// Top-p value; 0.9 when omitted.
    #[serde(default = "default_top_p")]
    pub top_p: f32,

    /// Frequency penalty; 0.0 when omitted.
    #[serde(default)]
    pub frequency_penalty: f32,

    /// Presence penalty; 0.0 when omitted.
    #[serde(default)]
    pub presence_penalty: f32,

    /// Optional list of stop sequences.
    pub stop_sequences: Option<Vec<String>>,
}
391
/// Settings for locally hosted models served through Ollama.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalModelsConfig {
    /// Ollama base URL; `"http://localhost:11434"` when omitted.
    #[serde(default = "default_ollama_url")]
    pub ollama_base_url: String,

    /// Model name; `"llama2:7b"` when omitted.
    #[serde(default = "default_ollama_model")]
    pub model_name: String,

    /// Embedding model name; `"nomic-embed-text"` when omitted.
    #[serde(default = "default_ollama_embedding")]
    pub embedding_model: String,
}
407
/// Performance settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
    /// Batch processing switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub batch_processing: bool,

    /// Batch size; 100 when omitted.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,

    /// Number of worker threads; 4 when omitted.
    #[serde(default = "default_worker_threads")]
    pub worker_threads: usize,

    /// Memory limit in megabytes; 1024 when omitted.
    #[serde(default = "default_memory_limit")]
    pub memory_limit_mb: usize,
}
427
/// Ollama connection and model settings.
///
/// NOTE(review): `max_tokens` and `temperature` deserialize to `None` when the
/// section is present but the keys are missing, whereas this type's `Default`
/// impl sets them to `Some(800)` / `Some(0.3)` — confirm the difference is
/// intended.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaSetConfig {
    /// Ollama enable switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Host, including scheme; `"http://localhost"` when omitted.
    #[serde(default = "default_ollama_host")]
    pub host: String,

    /// Port; 11434 when omitted.
    #[serde(default = "default_ollama_port")]
    pub port: u16,

    /// Chat model name; `"llama3.1:8b"` when omitted.
    #[serde(default = "default_chat_model")]
    pub chat_model: String,

    /// Embedding model name; `"nomic-embed-text"` when omitted.
    #[serde(default = "default_embedding_model_ollama")]
    pub embedding_model: String,

    /// Timeout in seconds; 60 when omitted.
    #[serde(default = "default_timeout")]
    pub timeout_seconds: u64,

    /// Maximum number of retries; 3 when omitted.
    #[serde(default = "default_max_retries")]
    pub max_retries: u32,

    /// Hash fallback switch; `false` when omitted.
    #[serde(default)]
    pub fallback_to_hash: bool,

    /// Optional token limit.
    pub max_tokens: Option<u32>,

    /// Optional sampling temperature.
    pub temperature: Option<f32>,
}
469
/// Experimental feature switches.
///
/// `Default` is derived, so every switch is `false` and every nested
/// configuration is `None` unless set explicitly.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExperimentalConfig {
    /// Neural reranking switch.
    #[serde(default)]
    pub neural_reranking: bool,

    /// Federated learning switch.
    #[serde(default)]
    pub federated_learning: bool,

    /// Real-time updates switch.
    #[serde(default)]
    pub real_time_updates: bool,

    /// Distributed processing switch.
    #[serde(default)]
    pub distributed_processing: bool,

    /// LazyGraphRAG switch.
    #[serde(default)]
    pub lazy_graphrag: bool,

    /// E2GraphRAG switch.
    #[serde(default)]
    pub e2_graphrag: bool,

    /// Optional LazyGraphRAG settings.
    #[serde(default)]
    pub lazy_graphrag_config: Option<LazyGraphRAGConfig>,

    /// Optional E2GraphRAG settings.
    #[serde(default)]
    pub e2_graphrag_config: Option<E2GraphRAGConfig>,
}
505
/// LazyGraphRAG settings. Every field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LazyGraphRAGConfig {
    /// Concept extraction switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_concept_extraction: bool,

    /// Minimum concept length; 3 when omitted.
    #[serde(default = "default_min_concept_length")]
    pub min_concept_length: usize,

    /// Maximum number of words per concept; 5 when omitted.
    #[serde(default = "default_max_concept_words")]
    pub max_concept_words: usize,

    /// Co-occurrence threshold; 1 when omitted.
    #[serde(default = "default_co_occurrence_threshold")]
    pub co_occurrence_threshold: usize,

    /// Query refinement switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_query_refinement: bool,

    /// Maximum number of refinement iterations; 3 when omitted.
    #[serde(default = "default_max_refinement_iterations")]
    pub max_refinement_iterations: usize,

    /// Bidirectional index switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
539
540impl Default for LazyGraphRAGConfig {
541 fn default() -> Self {
542 Self {
543 use_concept_extraction: true,
544 min_concept_length: 3,
545 max_concept_words: 5,
546 co_occurrence_threshold: 1,
547 use_query_refinement: true,
548 max_refinement_iterations: 3,
549 use_bidirectional_index: true,
550 }
551 }
552}
553
/// E2GraphRAG settings. Every field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct E2GraphRAGConfig {
    /// Lightweight NER switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_lightweight_ner: bool,

    /// Entity type names; `PERSON`, `ORGANIZATION`, `LOCATION`, `CONCEPT`
    /// when omitted.
    #[serde(default = "default_e2_entity_types")]
    pub entity_types: Vec<String>,

    /// Minimum confidence; 0.6 when omitted.
    #[serde(default = "default_e2_min_confidence")]
    pub min_confidence: f32,

    /// Capitalization detection switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_capitalization_detection: bool,

    /// Noun-phrase extraction switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_noun_phrase_extraction: bool,

    /// Minimum entity frequency; 1 when omitted.
    #[serde(default = "default_min_entity_frequency")]
    pub min_entity_frequency: usize,

    /// Fast co-occurrence switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_fast_cooccurrence: bool,

    /// Bidirectional index switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
591
592impl Default for E2GraphRAGConfig {
593 fn default() -> Self {
594 Self {
595 use_lightweight_ner: true,
596 entity_types: default_e2_entity_types(),
597 min_confidence: 0.6,
598 use_capitalization_detection: true,
599 use_noun_phrase_extraction: true,
600 min_entity_frequency: 1,
601 use_fast_cooccurrence: true,
602 use_bidirectional_index: true,
603 }
604 }
605}
606
/// Selects which pipeline approach is used.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModeConfig {
    /// Approach name; `"semantic"` when omitted.
    /// NOTE(review): presumably one of "semantic", "algorithmic" or "hybrid",
    /// matching the pipeline sections of `SetConfig` — confirm against the
    /// code that reads this value.
    #[serde(default = "default_approach")]
    pub approach: String,
}
622
623impl Default for ModeConfig {
624 fn default() -> Self {
625 Self {
626 approach: default_approach(),
627 }
628 }
629}
630
/// Settings of the semantic pipeline.
///
/// The four sub-sections carry no serde default, so all of them must be
/// supplied whenever this section is present.
///
/// NOTE(review): a missing `enabled` key deserializes to `false`, whereas this
/// type's `Default` impl sets `enabled: true` — confirm the difference is
/// intended.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticPipelineConfig {
    /// Pipeline enable switch; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,

    /// Embedding settings (required).
    pub embeddings: SemanticEmbeddingsConfig,

    /// Entity extraction settings (required).
    pub entity_extraction: SemanticEntityConfig,

    /// Retrieval settings (required).
    pub retrieval: SemanticRetrievalConfig,

    /// Graph construction settings (required).
    pub graph_construction: SemanticGraphConfig,
}
651
/// Embedding settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEmbeddingsConfig {
    /// Embedding backend name; `"huggingface"` when omitted.
    #[serde(default = "default_semantic_embedding_backend")]
    pub backend: String,

    /// Embedding model name;
    /// `"sentence-transformers/all-MiniLM-L6-v2"` when omitted.
    #[serde(default = "default_semantic_embedding_model")]
    pub model: String,

    /// Embedding dimension; 384 when omitted.
    #[serde(default = "default_semantic_embedding_dim")]
    pub dimension: usize,

    /// GPU switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_gpu: bool,

    /// Similarity metric name; `"cosine"` when omitted.
    #[serde(default = "default_similarity_metric")]
    pub similarity_metric: String,

    /// Batch size; 100 when omitted.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
679
/// Entity extraction settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEntityConfig {
    /// Extraction method name; `"llm"` when omitted.
    #[serde(default = "default_semantic_entity_method")]
    pub method: String,

    /// Gleaning switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_gleaning: bool,

    /// Maximum number of gleaning rounds; 3 when omitted.
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Model name; `"llama3.1:8b"` when omitted.
    #[serde(default = "default_chat_model")]
    pub model: String,

    /// Sampling temperature; 0.1 when omitted.
    #[serde(default = "default_semantic_temperature")]
    pub temperature: f32,

    /// Confidence threshold; 0.7 when omitted.
    #[serde(default = "default_semantic_confidence")]
    pub confidence_threshold: f32,
}
707
/// Retrieval settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticRetrievalConfig {
    /// Retrieval strategy name; `"vector"` when omitted.
    #[serde(default = "default_semantic_retrieval_strategy")]
    pub strategy: String,

    /// HNSW switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_hnsw: bool,

    /// HNSW `ef_construction` value; 200 when omitted.
    #[serde(default = "default_hnsw_ef_construction")]
    pub hnsw_ef_construction: usize,

    /// HNSW `M` value; 16 when omitted.
    #[serde(default = "default_hnsw_m")]
    pub hnsw_m: usize,

    /// Top-k value; 10 when omitted.
    #[serde(default = "default_top_k")]
    pub top_k: usize,

    /// Similarity threshold; 0.7 when omitted.
    #[serde(default = "default_semantic_similarity_threshold")]
    pub similarity_threshold: f32,
}
735
/// Graph construction settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticGraphConfig {
    /// Relation scorer name; `"embedding_similarity"` when omitted.
    #[serde(default = "default_semantic_relation_scorer")]
    pub relation_scorer: String,

    /// Transformer embeddings switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_transformer_embeddings: bool,

    /// Minimum relation score; 0.7 when omitted.
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,
}
751
/// Settings of the algorithmic pipeline.
///
/// The four sub-sections carry no serde default, so all of them must be
/// supplied whenever this section is present.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicPipelineConfig {
    /// Pipeline enable switch; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,

    /// Embedding settings (required).
    pub embeddings: AlgorithmicEmbeddingsConfig,

    /// Entity extraction settings (required).
    pub entity_extraction: AlgorithmicEntityConfig,

    /// Retrieval settings (required).
    pub retrieval: AlgorithmicRetrievalConfig,

    /// Graph construction settings (required).
    pub graph_construction: AlgorithmicGraphConfig,
}
772
/// Embedding settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEmbeddingsConfig {
    /// Embedding backend name; `"hash"` when omitted.
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub backend: String,

    /// Embedding dimension; 128 when omitted.
    #[serde(default = "default_algorithmic_embedding_dim")]
    pub dimension: usize,

    /// TF-IDF switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_tfidf: bool,

    /// Vocabulary size; 10000 when omitted.
    #[serde(default = "default_vocabulary_size")]
    pub vocabulary_size: usize,

    /// Minimum term frequency; 2 when omitted.
    #[serde(default = "default_min_term_frequency")]
    pub min_term_frequency: usize,

    /// Maximum document frequency; 0.8 when omitted.
    #[serde(default = "default_max_document_frequency")]
    pub max_document_frequency: f32,
}
800
/// Entity extraction settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEntityConfig {
    /// Extraction method name; `"pattern"` when omitted.
    #[serde(default = "default_algorithmic_entity_method")]
    pub method: String,

    /// NER rules switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_ner_rules: bool,

    /// POS tagging switch; `false` when omitted.
    #[serde(default)]
    pub use_pos_tagging: bool,

    /// Minimum entity length; 3 when omitted.
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Confidence threshold; 0.75 when omitted.
    #[serde(default = "default_algorithmic_confidence")]
    pub confidence_threshold: f32,

    /// Optional list of patterns.
    /// NOTE(review): pattern syntax is not visible here — confirm.
    pub patterns: Option<Vec<String>>,
}
827
/// Retrieval settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicRetrievalConfig {
    /// Retrieval strategy name; `"bm25"` when omitted.
    #[serde(default = "default_algorithmic_retrieval_strategy")]
    pub strategy: String,

    /// `k1` parameter; 1.5 when omitted (helper: `default_bm25_k1`).
    #[serde(default = "default_bm25_k1")]
    pub k1: f32,

    /// `b` parameter; 0.75 when omitted (helper: `default_bm25_b`).
    #[serde(default = "default_bm25_b")]
    pub b: f32,

    /// Stemming switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_stemming: bool,

    /// Language name; `"english"` when omitted.
    #[serde(default = "default_language")]
    pub language: String,

    /// Top-k value; 10 when omitted.
    #[serde(default = "default_top_k")]
    pub top_k: usize,
}
855
/// Graph construction settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicGraphConfig {
    /// Relation scorer name; `"jaccard"` when omitted.
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub relation_scorer: String,

    /// Co-occurrence switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_cooccurrence: bool,

    /// Co-occurrence window size; 10 when omitted.
    #[serde(default = "default_cooccurrence_window")]
    pub window_size: usize,

    /// Minimum relation score; 0.6 when omitted.
    #[serde(default = "default_algorithmic_min_relation_score")]
    pub min_relation_score: f32,
}
875
/// Settings of the hybrid pipeline.
///
/// The five sub-sections carry no serde default, so all of them must be
/// supplied whenever this section is present.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridPipelineConfig {
    /// Pipeline enable switch; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,

    /// Semantic / algorithmic weights (required).
    pub weights: HybridWeightsConfig,

    /// Embedding settings (required).
    pub embeddings: HybridEmbeddingsConfig,

    /// Entity extraction settings (required).
    pub entity_extraction: HybridEntityConfig,

    /// Retrieval settings (required).
    pub retrieval: HybridRetrievalConfig,

    /// Graph construction settings (required).
    pub graph_construction: HybridGraphConfig,

    /// Fallback strategy name; `"semantic_first"` when omitted.
    #[serde(default = "default_hybrid_fallback_strategy")]
    pub fallback_strategy: String,

    /// Cross-validation switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub cross_validation: bool,
}
907
/// Weights of the semantic and algorithmic sides of the hybrid pipeline.
///
/// Nothing in this file checks that the two weights add up to 1.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridWeightsConfig {
    /// Semantic weight; 0.6 when omitted.
    #[serde(default = "default_hybrid_semantic_weight")]
    pub semantic_weight: f32,

    /// Algorithmic weight; 0.4 when omitted.
    #[serde(default = "default_hybrid_algorithmic_weight")]
    pub algorithmic_weight: f32,
}
919
/// Embedding settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEmbeddingsConfig {
    /// Primary embedding backend name; `"huggingface"` when omitted (the
    /// semantic pipeline's backend default).
    #[serde(default = "default_semantic_embedding_backend")]
    pub primary: String,

    /// Fallback embedding backend name; `"hash"` when omitted (the
    /// algorithmic pipeline's backend default).
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub fallback: String,

    /// Score combination switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub combine_scores: bool,

    /// Automatic fallback switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub auto_fallback: bool,
}
939
/// Entity extraction settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEntityConfig {
    /// Switch for using both extraction methods; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_both: bool,

    /// LLM weight; 0.7 when omitted.
    #[serde(default = "default_hybrid_llm_weight")]
    pub llm_weight: f32,

    /// Pattern weight; 0.3 when omitted.
    #[serde(default = "default_hybrid_pattern_weight")]
    pub pattern_weight: f32,

    /// Cross-validation switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub cross_validate: bool,

    /// Confidence boost; 0.15 when omitted.
    /// NOTE(review): presumably applied when both methods agree — confirm.
    #[serde(default = "default_hybrid_confidence_boost")]
    pub confidence_boost: f32,
}
963
/// Retrieval settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridRetrievalConfig {
    /// Retrieval strategy name; `"fusion"` when omitted.
    #[serde(default = "default_hybrid_retrieval_strategy")]
    pub strategy: String,

    /// Switch for combining vector and BM25 results; `true` when omitted.
    #[serde(default = "default_true")]
    pub combine_vector_bm25: bool,

    /// Vector weight; 0.6 when omitted.
    #[serde(default = "default_hybrid_vector_weight")]
    pub vector_weight: f32,

    /// BM25 weight; 0.4 when omitted.
    #[serde(default = "default_hybrid_bm25_weight")]
    pub bm25_weight: f32,

    /// RRF constant; 60 when omitted.
    #[serde(default = "default_rrf_constant")]
    pub rrf_constant: usize,
}
987
/// Graph construction settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridGraphConfig {
    /// Primary relation scorer name; `"embedding_similarity"` when omitted.
    #[serde(default = "default_semantic_relation_scorer")]
    pub primary_scorer: String,

    /// Fallback relation scorer name; `"jaccard"` when omitted.
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub fallback_scorer: String,

    /// Score combination switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub combine_scores: bool,
}
1003
/// Top-level entity extraction settings (gleaning, merging, linking).
/// Every field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionTopLevelConfig {
    /// Entity extraction enable switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Minimum confidence; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub min_confidence: f32,

    /// Gleaning switch; `false` when omitted.
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum number of gleaning rounds; 3 when omitted.
    #[serde(default = "default_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Gleaning improvement threshold; 0.1 when omitted.
    #[serde(default = "default_gleaning_improvement")]
    pub gleaning_improvement_threshold: f32,

    /// Semantic merging switch; `false` when omitted.
    #[serde(default)]
    pub semantic_merging: bool,

    /// Merge similarity threshold; 0.85 when omitted.
    #[serde(default = "default_merge_threshold")]
    pub merge_similarity_threshold: f32,

    /// Automatic linking switch; `false` when omitted.
    #[serde(default)]
    pub automatic_linking: bool,

    /// Linking confidence threshold; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub linking_confidence_threshold: f32,
}
1043
1044impl Default for EntityExtractionTopLevelConfig {
1045 fn default() -> Self {
1046 Self {
1047 enabled: true,
1048 min_confidence: default_confidence_threshold(),
1049 use_gleaning: false,
1050 max_gleaning_rounds: default_gleaning_rounds(),
1051 gleaning_improvement_threshold: default_gleaning_improvement(),
1052 semantic_merging: false,
1053 merge_similarity_threshold: default_merge_threshold(),
1054 automatic_linking: false,
1055 linking_confidence_threshold: default_confidence_threshold(),
1056 }
1057 }
1058}
1059
/// Fallback log level used when `log_level` is not configured.
fn default_log_level() -> String {
    String::from("info")
}

/// Fallback output directory used when `output_dir` is not configured.
fn default_output_dir() -> String {
    String::from("./output")
}

/// Shared serde default for boolean options that are on unless turned off.
fn default_true() -> bool {
    true
}

/// Fallback workflow list, in the order the steps are written here.
fn default_workflows() -> Vec<String> {
    [
        "extract_text",
        "extract_entities",
        "build_graph",
        "detect_communities",
    ]
    .iter()
    .map(|step| step.to_string())
    .collect()
}
/// Fallback for `chunk_size`.
fn default_chunk_size() -> usize {
    512_usize
}

/// Fallback for `chunk_overlap`.
fn default_chunk_overlap() -> usize {
    64_usize
}

/// Fallback for `min_chunk_size`.
fn default_min_chunk_size() -> usize {
    50_usize
}
/// Fallback for the pipeline entity-extraction `model_name`.
fn default_ner_model() -> String {
    String::from("microsoft/DialoGPT-medium")
}

/// Fallback for `temperature` fields that reference this helper.
fn default_temperature() -> f32 {
    0.1_f32
}

/// Fallback for the pipeline entity-extraction `max_tokens`.
fn default_max_tokens() -> usize {
    2048_usize
}

/// Fallback for `min_entity_length` fields.
fn default_min_entity_length() -> usize {
    3_usize
}

/// Fallback for `max_entity_length`.
fn default_max_entity_length() -> usize {
    100_usize
}

/// Fallback for the confidence-threshold fields that reference this helper.
fn default_confidence_threshold() -> f32 {
    0.8_f32
}
/// Fallback for the graph-building `relation_scorer`.
fn default_relation_scorer() -> String {
    String::from("cosine_similarity")
}

/// Fallback for `min_relation_score` fields that reference this helper.
fn default_min_relation_score() -> f32 {
    0.7_f32
}

/// Fallback for `max_connections_per_node`.
fn default_max_connections() -> usize {
    10_usize
}

/// Fallback for the community detection `algorithm`.
fn default_community_algorithm() -> String {
    String::from("leiden")
}

/// Fallback for the community detection `resolution`.
fn default_resolution() -> f32 {
    1.0_f32
}

/// Fallback for `min_community_size`.
fn default_min_community_size() -> usize {
    3_usize
}
/// Fallback for `database_type`.
fn default_database_type() -> String {
    String::from("sqlite")
}

/// Fallback for `database_path`.
fn default_database_path() -> String {
    String::from("./graphrag.db")
}

/// Fallback for the PostgreSQL `pool_size`.
fn default_pool_size() -> usize {
    10_usize
}
/// Fallback for `primary_llm`.
fn default_primary_llm() -> String {
    String::from("gpt-4")
}

/// Fallback for the models section's `embedding_model`.
fn default_embedding_model() -> String {
    String::from("text-embedding-ada-002")
}

/// Fallback for `max_context_length`.
fn default_max_context() -> usize {
    4096_usize
}

/// Fallback for `top_p`.
fn default_top_p() -> f32 {
    0.9_f32
}

/// Fallback for the local models' `ollama_base_url`.
fn default_ollama_url() -> String {
    String::from("http://localhost:11434")
}

/// Fallback for the local models' `model_name`.
fn default_ollama_model() -> String {
    String::from("llama2:7b")
}

/// Fallback for the local models' `embedding_model`.
fn default_ollama_embedding() -> String {
    String::from("nomic-embed-text")
}
/// Fallback for `batch_size` fields.
fn default_batch_size() -> usize {
    100_usize
}

/// Fallback for `worker_threads`.
fn default_worker_threads() -> usize {
    4_usize
}

/// Fallback for `memory_limit_mb`.
fn default_memory_limit() -> usize {
    1024_usize
}
/// Fallback for the Ollama `host` (scheme included, port excluded).
fn default_ollama_host() -> String {
    String::from("http://localhost")
}

/// Fallback for the Ollama `port`.
fn default_ollama_port() -> u16 {
    11434_u16
}

/// Fallback chat model name.
fn default_chat_model() -> String {
    String::from("llama3.1:8b")
}

/// Fallback for the Ollama `embedding_model`.
fn default_embedding_model_ollama() -> String {
    String::from("nomic-embed-text")
}

/// Fallback for `timeout_seconds`.
fn default_timeout() -> u64 {
    60_u64
}

/// Fallback for `max_retries`.
fn default_max_retries() -> u32 {
    3_u32
}
/// Fallback for the top-level `max_gleaning_rounds`.
fn default_gleaning_rounds() -> usize {
    3_usize
}

/// Fallback for `gleaning_improvement_threshold`.
fn default_gleaning_improvement() -> f32 {
    0.1_f32
}

/// Fallback for `merge_similarity_threshold`.
fn default_merge_threshold() -> f32 {
    0.85_f32
}
1189
/// Fallback for the mode section's `approach`.
fn default_approach() -> String {
    String::from("semantic")
}
1198
/// Fallback semantic embedding backend name.
fn default_semantic_embedding_backend() -> String {
    String::from("huggingface")
}

/// Fallback semantic embedding model name.
fn default_semantic_embedding_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2")
}

/// Fallback semantic embedding dimension.
fn default_semantic_embedding_dim() -> usize {
    384_usize
}

/// Fallback similarity metric name.
fn default_similarity_metric() -> String {
    String::from("cosine")
}

/// Fallback semantic entity extraction method name.
fn default_semantic_entity_method() -> String {
    String::from("llm")
}

/// Fallback for the semantic pipeline's `max_gleaning_rounds`.
fn default_max_gleaning_rounds() -> usize {
    3_usize
}

/// Fallback for the semantic entity extraction `temperature`.
fn default_semantic_temperature() -> f32 {
    0.1_f32
}

/// Fallback for the semantic entity extraction `confidence_threshold`.
fn default_semantic_confidence() -> f32 {
    0.7_f32
}

/// Fallback semantic retrieval strategy name.
fn default_semantic_retrieval_strategy() -> String {
    String::from("vector")
}

/// Fallback for `hnsw_ef_construction`.
fn default_hnsw_ef_construction() -> usize {
    200_usize
}

/// Fallback for `hnsw_m`.
fn default_hnsw_m() -> usize {
    16_usize
}

/// Fallback for `top_k` fields.
fn default_top_k() -> usize {
    10_usize
}

/// Fallback for the semantic retrieval `similarity_threshold`.
fn default_semantic_similarity_threshold() -> f32 {
    0.7_f32
}

/// Fallback semantic relation scorer name.
fn default_semantic_relation_scorer() -> String {
    String::from("embedding_similarity")
}
1242
/// Fallback algorithmic embedding backend name.
fn default_algorithmic_embedding_backend() -> String {
    String::from("hash")
}

/// Fallback algorithmic embedding dimension.
fn default_algorithmic_embedding_dim() -> usize {
    128_usize
}

/// Fallback for `vocabulary_size`.
fn default_vocabulary_size() -> usize {
    10000_usize
}

/// Fallback for `min_term_frequency`.
fn default_min_term_frequency() -> usize {
    2_usize
}

/// Fallback for `max_document_frequency`.
fn default_max_document_frequency() -> f32 {
    0.8_f32
}

/// Fallback algorithmic entity extraction method name.
fn default_algorithmic_entity_method() -> String {
    String::from("pattern")
}

/// Fallback for the algorithmic entity extraction `confidence_threshold`.
fn default_algorithmic_confidence() -> f32 {
    0.75_f32
}

/// Fallback algorithmic retrieval strategy name.
fn default_algorithmic_retrieval_strategy() -> String {
    String::from("bm25")
}

/// Fallback for the retrieval `k1` parameter.
fn default_bm25_k1() -> f32 {
    1.5_f32
}

/// Fallback for the retrieval `b` parameter.
fn default_bm25_b() -> f32 {
    0.75_f32
}

/// Fallback for `language`.
fn default_language() -> String {
    String::from("english")
}

/// Fallback algorithmic relation scorer name.
fn default_algorithmic_relation_scorer() -> String {
    String::from("jaccard")
}

/// Fallback for the co-occurrence `window_size`.
fn default_cooccurrence_window() -> usize {
    10_usize
}

/// Fallback for the algorithmic graph `min_relation_score`.
fn default_algorithmic_min_relation_score() -> f32 {
    0.6_f32
}
1286
/// Fallback for the hybrid `semantic_weight`.
fn default_hybrid_semantic_weight() -> f32 {
    0.6_f32
}

/// Fallback for the hybrid `algorithmic_weight`.
fn default_hybrid_algorithmic_weight() -> f32 {
    0.4_f32
}

/// Fallback for the hybrid `llm_weight`.
fn default_hybrid_llm_weight() -> f32 {
    0.7_f32
}

/// Fallback for the hybrid `pattern_weight`.
fn default_hybrid_pattern_weight() -> f32 {
    0.3_f32
}

/// Fallback for the hybrid `confidence_boost`.
fn default_hybrid_confidence_boost() -> f32 {
    0.15_f32
}

/// Fallback hybrid retrieval strategy name.
fn default_hybrid_retrieval_strategy() -> String {
    String::from("fusion")
}

/// Fallback for the hybrid `vector_weight`.
fn default_hybrid_vector_weight() -> f32 {
    0.6_f32
}

/// Fallback for the hybrid `bm25_weight`.
fn default_hybrid_bm25_weight() -> f32 {
    0.4_f32
}

/// Fallback for `rrf_constant`.
fn default_rrf_constant() -> usize {
    60_usize
}

/// Fallback hybrid fallback-strategy name.
fn default_hybrid_fallback_strategy() -> String {
    String::from("semantic_first")
}
/// Fallback for the auto-save `interval_seconds`.
fn default_auto_save_interval() -> u64 {
    300_u64
}

/// Fallback for the auto-save `max_versions`.
fn default_max_auto_save_versions() -> usize {
    5_usize
}
1324
/// Fallback for the LazyGraphRAG `min_concept_length`.
fn default_min_concept_length() -> usize {
    3_usize
}

/// Fallback for the LazyGraphRAG `max_concept_words`.
fn default_max_concept_words() -> usize {
    5_usize
}

/// Fallback for the LazyGraphRAG `co_occurrence_threshold`.
fn default_co_occurrence_threshold() -> usize {
    1_usize
}

/// Fallback for the LazyGraphRAG `max_refinement_iterations`.
fn default_max_refinement_iterations() -> usize {
    3_usize
}
1338
/// Fallback for the E2GraphRAG `entity_types`, in the order written here.
fn default_e2_entity_types() -> Vec<String> {
    ["PERSON", "ORGANIZATION", "LOCATION", "CONCEPT"]
        .iter()
        .map(|kind| kind.to_string())
        .collect()
}

/// Fallback for the E2GraphRAG `min_confidence`.
fn default_e2_min_confidence() -> f32 {
    0.6_f32
}

/// Fallback for the E2GraphRAG `min_entity_frequency`.
fn default_min_entity_frequency() -> usize {
    1_usize
}
1354
1355impl Default for GeneralConfig {
1356 fn default() -> Self {
1357 Self {
1358 log_level: default_log_level(),
1359 output_dir: default_output_dir(),
1360 input_document_path: None,
1361 max_threads: None,
1362 enable_profiling: false,
1363 }
1364 }
1365}
1366
1367impl Default for PipelineConfig {
1368 fn default() -> Self {
1369 Self {
1370 workflows: default_workflows(),
1371 parallel_execution: default_true(),
1372 text_extraction: TextExtractionConfig::default(),
1373 entity_extraction: EntityExtractionConfig::default(),
1374 graph_building: GraphBuildingConfig::default(),
1375 community_detection: CommunityDetectionConfig::default(),
1376 }
1377 }
1378}
1379
1380impl Default for TextExtractionConfig {
1381 fn default() -> Self {
1382 Self {
1383 chunk_size: default_chunk_size(),
1384 chunk_overlap: default_chunk_overlap(),
1385 clean_control_chars: default_true(),
1386 min_chunk_size: default_min_chunk_size(),
1387 cleaning: None,
1388 }
1389 }
1390}
1391
1392impl Default for EntityExtractionConfig {
1393 fn default() -> Self {
1394 Self {
1395 model_name: default_ner_model(),
1396 temperature: default_temperature(),
1397 max_tokens: default_max_tokens(),
1398 entity_types: None,
1399 confidence_threshold: default_confidence_threshold(),
1400 custom_prompt: None,
1401 filters: None,
1402 }
1403 }
1404}
1405
1406impl Default for GraphBuildingConfig {
1407 fn default() -> Self {
1408 Self {
1409 relation_scorer: default_relation_scorer(),
1410 min_relation_score: default_min_relation_score(),
1411 max_connections_per_node: default_max_connections(),
1412 bidirectional_relations: default_true(),
1413 }
1414 }
1415}
1416
1417impl Default for CommunityDetectionConfig {
1418 fn default() -> Self {
1419 Self {
1420 algorithm: default_community_algorithm(),
1421 resolution: default_resolution(),
1422 min_community_size: default_min_community_size(),
1423 max_community_size: 0,
1424 }
1425 }
1426}
1427
1428impl Default for StorageConfig {
1429 fn default() -> Self {
1430 Self {
1431 database_type: default_database_type(),
1432 database_path: default_database_path(),
1433 enable_wal: default_true(),
1434 postgresql: None,
1435 neo4j: None,
1436 }
1437 }
1438}
1439
1440impl Default for ModelsConfig {
1441 fn default() -> Self {
1442 Self {
1443 primary_llm: default_primary_llm(),
1444 embedding_model: default_embedding_model(),
1445 max_context_length: default_max_context(),
1446 llm_params: None,
1447 local: None,
1448 }
1449 }
1450}
1451
1452impl Default for PerformanceConfig {
1453 fn default() -> Self {
1454 Self {
1455 batch_processing: default_true(),
1456 batch_size: default_batch_size(),
1457 worker_threads: default_worker_threads(),
1458 memory_limit_mb: default_memory_limit(),
1459 }
1460 }
1461}
1462
1463impl Default for OllamaSetConfig {
1464 fn default() -> Self {
1465 Self {
1466 enabled: default_true(),
1467 host: default_ollama_host(),
1468 port: default_ollama_port(),
1469 chat_model: default_chat_model(),
1470 embedding_model: default_embedding_model_ollama(),
1471 timeout_seconds: default_timeout(),
1472 max_retries: default_max_retries(),
1473 fallback_to_hash: false,
1474 max_tokens: Some(800),
1475 temperature: Some(0.3),
1476 }
1477 }
1478}
1479
1480impl Default for SemanticPipelineConfig {
1485 fn default() -> Self {
1486 Self {
1487 enabled: true,
1488 embeddings: SemanticEmbeddingsConfig::default(),
1489 entity_extraction: SemanticEntityConfig::default(),
1490 retrieval: SemanticRetrievalConfig::default(),
1491 graph_construction: SemanticGraphConfig::default(),
1492 }
1493 }
1494}
1495
1496impl Default for SemanticEmbeddingsConfig {
1497 fn default() -> Self {
1498 Self {
1499 backend: default_semantic_embedding_backend(),
1500 model: default_semantic_embedding_model(),
1501 dimension: default_semantic_embedding_dim(),
1502 use_gpu: default_true(),
1503 similarity_metric: default_similarity_metric(),
1504 batch_size: default_batch_size(),
1505 }
1506 }
1507}
1508
1509impl Default for SemanticEntityConfig {
1510 fn default() -> Self {
1511 Self {
1512 method: default_semantic_entity_method(),
1513 use_gleaning: default_true(),
1514 max_gleaning_rounds: default_max_gleaning_rounds(),
1515 model: default_chat_model(),
1516 temperature: default_semantic_temperature(),
1517 confidence_threshold: default_semantic_confidence(),
1518 }
1519 }
1520}
1521
1522impl Default for SemanticRetrievalConfig {
1523 fn default() -> Self {
1524 Self {
1525 strategy: default_semantic_retrieval_strategy(),
1526 use_hnsw: default_true(),
1527 hnsw_ef_construction: default_hnsw_ef_construction(),
1528 hnsw_m: default_hnsw_m(),
1529 top_k: default_top_k(),
1530 similarity_threshold: default_semantic_similarity_threshold(),
1531 }
1532 }
1533}
1534
1535impl Default for SemanticGraphConfig {
1536 fn default() -> Self {
1537 Self {
1538 relation_scorer: default_semantic_relation_scorer(),
1539 use_transformer_embeddings: default_true(),
1540 min_relation_score: default_min_relation_score(),
1541 }
1542 }
1543}
1544
1545impl Default for AlgorithmicPipelineConfig {
1546 fn default() -> Self {
1547 Self {
1548 enabled: false,
1549 embeddings: AlgorithmicEmbeddingsConfig::default(),
1550 entity_extraction: AlgorithmicEntityConfig::default(),
1551 retrieval: AlgorithmicRetrievalConfig::default(),
1552 graph_construction: AlgorithmicGraphConfig::default(),
1553 }
1554 }
1555}
1556
1557impl Default for AlgorithmicEmbeddingsConfig {
1558 fn default() -> Self {
1559 Self {
1560 backend: default_algorithmic_embedding_backend(),
1561 dimension: default_algorithmic_embedding_dim(),
1562 use_tfidf: default_true(),
1563 vocabulary_size: default_vocabulary_size(),
1564 min_term_frequency: default_min_term_frequency(),
1565 max_document_frequency: default_max_document_frequency(),
1566 }
1567 }
1568}
1569
1570impl Default for AlgorithmicEntityConfig {
1571 fn default() -> Self {
1572 Self {
1573 method: default_algorithmic_entity_method(),
1574 use_ner_rules: default_true(),
1575 use_pos_tagging: false,
1576 min_entity_length: default_min_entity_length(),
1577 confidence_threshold: default_algorithmic_confidence(),
1578 patterns: None,
1579 }
1580 }
1581}
1582
1583impl Default for AlgorithmicRetrievalConfig {
1584 fn default() -> Self {
1585 Self {
1586 strategy: default_algorithmic_retrieval_strategy(),
1587 k1: default_bm25_k1(),
1588 b: default_bm25_b(),
1589 use_stemming: default_true(),
1590 language: default_language(),
1591 top_k: default_top_k(),
1592 }
1593 }
1594}
1595
1596impl Default for AlgorithmicGraphConfig {
1597 fn default() -> Self {
1598 Self {
1599 relation_scorer: default_algorithmic_relation_scorer(),
1600 use_cooccurrence: default_true(),
1601 window_size: default_cooccurrence_window(),
1602 min_relation_score: default_algorithmic_min_relation_score(),
1603 }
1604 }
1605}
1606
1607impl Default for HybridPipelineConfig {
1608 fn default() -> Self {
1609 Self {
1610 enabled: false,
1611 weights: HybridWeightsConfig::default(),
1612 embeddings: HybridEmbeddingsConfig::default(),
1613 entity_extraction: HybridEntityConfig::default(),
1614 retrieval: HybridRetrievalConfig::default(),
1615 graph_construction: HybridGraphConfig::default(),
1616 fallback_strategy: default_hybrid_fallback_strategy(),
1617 cross_validation: default_true(),
1618 }
1619 }
1620}
1621
1622impl Default for HybridWeightsConfig {
1623 fn default() -> Self {
1624 Self {
1625 semantic_weight: default_hybrid_semantic_weight(),
1626 algorithmic_weight: default_hybrid_algorithmic_weight(),
1627 }
1628 }
1629}
1630
1631impl Default for HybridEmbeddingsConfig {
1632 fn default() -> Self {
1633 Self {
1634 primary: default_semantic_embedding_backend(),
1635 fallback: default_algorithmic_embedding_backend(),
1636 combine_scores: default_true(),
1637 auto_fallback: default_true(),
1638 }
1639 }
1640}
1641
1642impl Default for HybridEntityConfig {
1643 fn default() -> Self {
1644 Self {
1645 use_both: default_true(),
1646 llm_weight: default_hybrid_llm_weight(),
1647 pattern_weight: default_hybrid_pattern_weight(),
1648 cross_validate: default_true(),
1649 confidence_boost: default_hybrid_confidence_boost(),
1650 }
1651 }
1652}
1653
1654impl Default for HybridRetrievalConfig {
1655 fn default() -> Self {
1656 Self {
1657 strategy: default_hybrid_retrieval_strategy(),
1658 combine_vector_bm25: default_true(),
1659 vector_weight: default_hybrid_vector_weight(),
1660 bm25_weight: default_hybrid_bm25_weight(),
1661 rrf_constant: default_rrf_constant(),
1662 }
1663 }
1664}
1665
1666impl Default for HybridGraphConfig {
1667 fn default() -> Self {
1668 Self {
1669 primary_scorer: default_semantic_relation_scorer(),
1670 fallback_scorer: default_algorithmic_relation_scorer(),
1671 combine_scores: default_true(),
1672 }
1673 }
1674}
1675
1676impl SetConfig {
1677 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
1679 let path_ref = path.as_ref();
1680 let content = fs::read_to_string(path_ref)?;
1681
1682 let extension = path_ref
1684 .extension()
1685 .and_then(|e| e.to_str())
1686 .unwrap_or("");
1687
1688 let config: SetConfig = match extension {
1689 #[cfg(feature = "json5-support")]
1690 "json5" | "json" => {
1691 json5::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1692 message: format!("JSON5 parse error: {e}"),
1693 })?
1694 }
1695 #[cfg(not(feature = "json5-support"))]
1696 "json5" | "json" => {
1697 return Err(crate::core::GraphRAGError::Config {
1698 message: "JSON5 support not enabled. Rebuild with --features json5-support".to_string(),
1699 });
1700 }
1701 _ => {
1702 toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1703 message: format!("TOML parse error: {e}"),
1704 })?
1705 }
1706 };
1707
1708 Ok(config)
1709 }
1710
1711 pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
1713 let toml_string =
1714 toml::to_string_pretty(&self).map_err(|e| crate::core::GraphRAGError::Config {
1715 message: format!("TOML serialize error: {e}"),
1716 })?;
1717
1718 let commented_toml = format!(
1720 "# =============================================================================\n\
1721 # GraphRAG Configuration File\n\
1722 # Complete configuration with extensive parameters for easy customization\n\
1723 # =============================================================================\n\n{toml_string}"
1724 );
1725
1726 fs::write(path, commented_toml)?;
1727 Ok(())
1728 }
1729
1730 pub fn to_graphrag_config(&self) -> crate::Config {
1732 let mut config = crate::Config::default();
1733
1734 config.approach = self.mode.approach.clone();
1736
1737 config.text.chunk_size = self.pipeline.text_extraction.chunk_size;
1739 config.text.chunk_overlap = self.pipeline.text_extraction.chunk_overlap;
1740
1741 config.entities.min_confidence = self
1743 .entity_extraction
1744 .min_confidence;
1745
1746 if let Some(ref types) = self.pipeline.entity_extraction.entity_types {
1748 config.entities.entity_types = types.clone();
1749 }
1750
1751 match self.mode.approach.as_str() {
1756 "semantic" => {
1757 if let Some(ref semantic) = self.semantic {
1758 config.entities.use_gleaning = semantic.entity_extraction.use_gleaning;
1759 config.entities.max_gleaning_rounds = semantic.entity_extraction.max_gleaning_rounds;
1760 config.entities.min_confidence = semantic.entity_extraction.confidence_threshold;
1761 } else {
1762 config.entities.use_gleaning = true;
1765 config.entities.max_gleaning_rounds = if self.entity_extraction.use_gleaning {
1766 self.entity_extraction.max_gleaning_rounds
1767 } else {
1768 default_max_gleaning_rounds() };
1770 config.entities.min_confidence = self.entity_extraction.min_confidence;
1772 }
1773 }
1774 "algorithmic" => {
1775 config.entities.use_gleaning = false;
1777 if let Some(ref algorithmic) = self.algorithmic {
1778 config.entities.min_confidence = algorithmic.entity_extraction.confidence_threshold;
1779 }
1780 }
1781 "hybrid" => {
1782 config.entities.use_gleaning = true;
1784 if self.hybrid.is_some() {
1785 config.entities.max_gleaning_rounds = 2; }
1788 }
1789 _ => {
1790 config.entities.use_gleaning = self.entity_extraction.use_gleaning;
1792 config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
1793 }
1794 }
1795
1796 config.graph.similarity_threshold = self.pipeline.graph_building.min_relation_score;
1798 config.graph.max_connections = self.pipeline.graph_building.max_connections_per_node;
1799 config.graph.extract_relationships = true; config.graph.relationship_confidence_threshold = 0.5; config.retrieval.top_k = 10; config.embeddings.dimension = 768; config.embeddings.backend = "ollama".to_string();
1808 config.embeddings.fallback_to_hash = self.ollama.fallback_to_hash;
1809
1810 config.parallel.enabled = self.pipeline.parallel_execution;
1812 config.parallel.num_threads = self.performance.worker_threads;
1813
1814 config.ollama = crate::ollama::OllamaConfig {
1816 enabled: self.ollama.enabled,
1817 host: self.ollama.host.clone(),
1818 port: self.ollama.port,
1819 chat_model: self.ollama.chat_model.clone(),
1820 embedding_model: self.ollama.embedding_model.clone(),
1821 timeout_seconds: self.ollama.timeout_seconds,
1822 max_retries: self.ollama.max_retries,
1823 fallback_to_hash: self.ollama.fallback_to_hash,
1824 max_tokens: self.ollama.max_tokens,
1825 temperature: self.ollama.temperature,
1826 };
1827
1828 config.auto_save = crate::config::AutoSaveConfig {
1830 enabled: self.auto_save.enabled,
1831 interval_seconds: self.auto_save.interval_seconds,
1832 workspace_name: self.auto_save.workspace_name.clone(),
1833 max_versions: self.auto_save.max_versions,
1834 };
1835
1836 config
1837 }
1838}