1use crate::Result;
5use serde::{Deserialize, Serialize};
6use std::fs;
7use std::path::Path;
8
/// Root of the configuration document.
///
/// Every field carries `#[serde(default)]`, so any section may be left out of
/// the input: the three optional pipeline sections become `None`, and all other
/// sections fall back to their `Default` implementation.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct SetConfig {
    /// Approach selector (see [`ModeConfig`]).
    #[serde(default)]
    pub mode: ModeConfig,

    /// Semantic pipeline section; `None` when absent.
    #[serde(default)]
    pub semantic: Option<SemanticPipelineConfig>,

    /// Algorithmic pipeline section; `None` when absent.
    #[serde(default)]
    pub algorithmic: Option<AlgorithmicPipelineConfig>,

    /// Hybrid pipeline section; `None` when absent.
    #[serde(default)]
    pub hybrid: Option<HybridPipelineConfig>,

    /// General settings (logging, output, threading).
    #[serde(default)]
    pub general: GeneralConfig,

    /// Workflow list and per-stage settings.
    #[serde(default)]
    pub pipeline: PipelineConfig,

    /// Storage backend settings.
    #[serde(default)]
    pub storage: StorageConfig,

    /// Model names and related parameters.
    #[serde(default)]
    pub models: ModelsConfig,

    /// Batching, worker and memory settings.
    #[serde(default)]
    pub performance: PerformanceConfig,

    /// Ollama connection and model settings.
    #[serde(default)]
    pub ollama: OllamaSetConfig,

    /// GLiNER extractor settings.
    #[serde(default)]
    pub gliner: GlinerSetConfig,

    /// Experimental feature switches.
    #[serde(default)]
    pub experimental: ExperimentalConfig,

    /// Top-level entity-extraction section. This is a different type from
    /// `pipeline.entity_extraction` ([`EntityExtractionConfig`]).
    #[serde(default)]
    pub entity_extraction: EntityExtractionTopLevelConfig,

    /// Auto-save settings.
    #[serde(default)]
    pub auto_save: AutoSaveSetConfig,
}
68
/// Auto-save section of the configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoSaveSetConfig {
    /// Auto-save switch; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,

    /// Optional base directory; `None` when omitted.
    #[serde(default)]
    pub base_dir: Option<String>,

    /// Interval in seconds; 300 when omitted.
    #[serde(default = "default_auto_save_interval")]
    pub interval_seconds: u64,

    /// Optional workspace name; `None` when omitted.
    #[serde(default)]
    pub workspace_name: Option<String>,

    /// Maximum number of versions; 5 when omitted.
    #[serde(default = "default_max_auto_save_versions")]
    pub max_versions: usize,
}
96
97impl Default for AutoSaveSetConfig {
98 fn default() -> Self {
99 Self {
100 enabled: false,
101 base_dir: None,
102 interval_seconds: default_auto_save_interval(),
103 workspace_name: None,
104 max_versions: default_max_auto_save_versions(),
105 }
106 }
107}
108
/// General settings: logging, output location, input document, threading and
/// profiling.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeneralConfig {
    /// Log level name; `"info"` when omitted.
    #[serde(default = "default_log_level")]
    pub log_level: String,

    /// Output directory; `"./output"` when omitted.
    #[serde(default = "default_output_dir")]
    pub output_dir: String,

    /// Optional input document path; `None` when omitted.
    #[serde(default)]
    pub input_document_path: Option<String>,

    /// Optional thread limit; `None` when omitted.
    #[serde(default)]
    pub max_threads: Option<usize>,

    /// Profiling switch; `false` when omitted.
    #[serde(default)]
    pub enable_profiling: bool,
}
132
/// Pipeline section: the workflow list plus one sub-section per stage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PipelineConfig {
    /// Workflow names; when omitted: `extract_text`, `extract_entities`,
    /// `build_graph`, `detect_communities`.
    #[serde(default = "default_workflows")]
    pub workflows: Vec<String>,

    /// Parallel execution switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub parallel_execution: bool,

    /// Text extraction settings; `Default` when omitted.
    #[serde(default)]
    pub text_extraction: TextExtractionConfig,

    /// Entity extraction settings; `Default` when omitted.
    #[serde(default)]
    pub entity_extraction: EntityExtractionConfig,

    /// Graph building settings; `Default` when omitted.
    #[serde(default)]
    pub graph_building: GraphBuildingConfig,

    /// Community detection settings; `Default` when omitted.
    #[serde(default)]
    pub community_detection: CommunityDetectionConfig,
}
160
/// Text extraction / chunking settings.
///
/// The unit of the size fields (characters, tokens, ...) is not stated in this
/// file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextExtractionConfig {
    /// Chunk size; 512 when omitted.
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,

    /// Overlap between chunks; 64 when omitted.
    #[serde(default = "default_chunk_overlap")]
    pub chunk_overlap: usize,

    /// Control-character cleaning switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub clean_control_chars: bool,

    /// Minimum chunk size; 50 when omitted.
    #[serde(default = "default_min_chunk_size")]
    pub min_chunk_size: usize,

    /// Optional extra cleaning options; `None` when omitted.
    #[serde(default)]
    pub cleaning: Option<CleaningConfig>,
}
184
/// Optional text-cleaning switches.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CleaningConfig {
    /// URL removal switch; `false` when omitted.
    #[serde(default)]
    pub remove_urls: bool,

    /// E-mail removal switch; `false` when omitted.
    #[serde(default)]
    pub remove_emails: bool,

    /// Whitespace normalisation switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub normalize_whitespace: bool,

    /// Special-character removal switch; `false` when omitted.
    #[serde(default)]
    pub remove_special_chars: bool,
}
204
/// Entity extraction settings of the pipeline section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionConfig {
    /// Model name; `"microsoft/DialoGPT-medium"` when omitted.
    #[serde(default = "default_ner_model")]
    pub model_name: String,

    /// Sampling temperature; 0.0 when omitted.
    #[serde(default = "default_extraction_temperature")]
    pub temperature: f32,

    /// Token limit; 2048 when omitted.
    #[serde(default = "default_max_tokens")]
    pub max_tokens: usize,

    /// Optional list of entity type names.
    pub entity_types: Option<Vec<String>>,

    /// Confidence threshold; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Optional custom prompt text.
    pub custom_prompt: Option<String>,

    /// Optional entity filters; `None` when omitted.
    #[serde(default)]
    pub filters: Option<EntityFiltersConfig>,
}
234
/// Filters applied to extracted entities.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityFiltersConfig {
    /// Minimum entity length; 3 when omitted.
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Maximum entity length; 100 when omitted.
    #[serde(default = "default_max_entity_length")]
    pub max_entity_length: usize,

    /// Optional allow-list of entity type names.
    pub allowed_entity_types: Option<Vec<String>>,

    /// Confidence threshold; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub confidence_threshold: f32,

    /// Optional allowed patterns (pattern syntax is not specified in this file).
    pub allowed_patterns: Option<Vec<String>>,

    /// Optional excluded patterns (pattern syntax is not specified in this file).
    pub excluded_patterns: Option<Vec<String>>,

    /// Fuzzy matching switch; `false` when omitted.
    #[serde(default)]
    pub enable_fuzzy_matching: bool,
}
263
/// Graph building settings of the pipeline section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GraphBuildingConfig {
    /// Relation scorer name; `"cosine_similarity"` when omitted.
    #[serde(default = "default_relation_scorer")]
    pub relation_scorer: String,

    /// Minimum relation score; 0.7 when omitted.
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,

    /// Maximum connections per node; 10 when omitted.
    #[serde(default = "default_max_connections")]
    pub max_connections_per_node: usize,

    /// Bidirectional relations switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub bidirectional_relations: bool,
}
283
/// Community detection settings of the pipeline section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommunityDetectionConfig {
    /// Algorithm name; `"leiden"` when omitted.
    #[serde(default = "default_community_algorithm")]
    pub algorithm: String,

    /// Resolution parameter; 1.0 when omitted.
    #[serde(default = "default_resolution")]
    pub resolution: f32,

    /// Minimum community size; 3 when omitted.
    #[serde(default = "default_min_community_size")]
    pub min_community_size: usize,

    /// Maximum community size; 0 when omitted.
    /// NOTE(review): 0 presumably means "no upper bound" - confirm with the consumer.
    #[serde(default)]
    pub max_community_size: usize,
}
303
/// Storage backend settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorageConfig {
    /// Database type name; `"sqlite"` when omitted.
    #[serde(default = "default_database_type")]
    pub database_type: String,

    /// Database path; `"./graphrag.db"` when omitted.
    #[serde(default = "default_database_path")]
    pub database_path: String,

    /// WAL switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub enable_wal: bool,

    /// Optional PostgreSQL connection settings.
    pub postgresql: Option<PostgreSQLConfig>,

    /// Optional Neo4j connection settings.
    pub neo4j: Option<Neo4jConfig>,
}
325
/// PostgreSQL connection settings. All fields except `pool_size` are required
/// when the section is present.
///
/// NOTE(review): `Debug` is derived, so `{:?}` output includes `password`
/// verbatim - avoid logging this value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PostgreSQLConfig {
    /// Server host (required).
    pub host: String,
    /// Server port (required).
    pub port: u16,
    /// Database name (required).
    pub database: String,
    /// User name (required).
    pub username: String,
    /// Password (required).
    pub password: String,
    /// Connection pool size; 10 when omitted.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
}
343
/// Neo4j connection settings. `uri`, `username` and `password` are required
/// when the section is present.
///
/// NOTE(review): `Debug` is derived, so `{:?}` output includes `password`
/// verbatim - avoid logging this value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Neo4jConfig {
    /// Connection URI (required).
    pub uri: String,
    /// User name (required).
    pub username: String,
    /// Password (required).
    pub password: String,
    /// Encryption switch; `false` when omitted.
    #[serde(default)]
    pub encrypted: bool,
}
357
/// Model selection settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelsConfig {
    /// Primary LLM name; `"gpt-4"` when omitted.
    #[serde(default = "default_primary_llm")]
    pub primary_llm: String,

    /// Embedding model name; `"text-embedding-ada-002"` when omitted.
    #[serde(default = "default_embedding_model")]
    pub embedding_model: String,

    /// Maximum context length; 4096 when omitted.
    #[serde(default = "default_max_context")]
    pub max_context_length: usize,

    /// Optional LLM sampling parameters; `None` when omitted.
    #[serde(default)]
    pub llm_params: Option<LLMParamsConfig>,

    /// Optional local-model settings; `None` when omitted.
    #[serde(default)]
    pub local: Option<LocalModelsConfig>,
}
381
/// LLM sampling parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMParamsConfig {
    /// Temperature; 0.1 when omitted.
    #[serde(default = "default_temperature")]
    pub temperature: f32,

    /// Top-p value; 0.9 when omitted.
    #[serde(default = "default_top_p")]
    pub top_p: f32,

    /// Frequency penalty; 0.0 when omitted.
    #[serde(default)]
    pub frequency_penalty: f32,

    /// Presence penalty; 0.0 when omitted.
    #[serde(default)]
    pub presence_penalty: f32,

    /// Optional stop sequences.
    pub stop_sequences: Option<Vec<String>>,
}
404
/// Local (Ollama-served) model settings under the models section.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LocalModelsConfig {
    /// Ollama base URL; `"http://localhost:11434"` when omitted.
    #[serde(default = "default_ollama_url")]
    pub ollama_base_url: String,

    /// Model name; `"llama2:7b"` when omitted.
    #[serde(default = "default_ollama_model")]
    pub model_name: String,

    /// Embedding model name; `"nomic-embed-text"` when omitted.
    #[serde(default = "default_ollama_embedding")]
    pub embedding_model: String,
}
420
/// Performance-related settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
    /// Batch processing switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub batch_processing: bool,

    /// Batch size; 100 when omitted.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,

    /// Worker thread count; 4 when omitted.
    #[serde(default = "default_worker_threads")]
    pub worker_threads: usize,

    /// Memory limit in megabytes; 1024 when omitted.
    #[serde(default = "default_memory_limit")]
    pub memory_limit_mb: usize,
}
440
/// Ollama connection and generation settings.
///
/// The four trailing `Option` fields have no serde attribute; serde's derive
/// treats a missing `Option` field as `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaSetConfig {
    /// Ollama switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Host; `"http://localhost"` when omitted.
    #[serde(default = "default_ollama_host")]
    pub host: String,

    /// Port; 11434 when omitted.
    #[serde(default = "default_ollama_port")]
    pub port: u16,

    /// Chat model name; `"llama3.1:8b"` when omitted.
    #[serde(default = "default_chat_model")]
    pub chat_model: String,

    /// Embedding model name; `"nomic-embed-text"` when omitted.
    #[serde(default = "default_embedding_model_ollama")]
    pub embedding_model: String,

    /// Timeout in seconds; 60 when omitted.
    #[serde(default = "default_timeout")]
    pub timeout_seconds: u64,

    /// Maximum retries; 3 when omitted.
    #[serde(default = "default_max_retries")]
    pub max_retries: u32,

    /// Hash fallback switch; `false` when omitted.
    #[serde(default)]
    pub fallback_to_hash: bool,

    /// Optional token limit.
    pub max_tokens: Option<u32>,

    /// Optional temperature.
    pub temperature: Option<f32>,

    /// Optional keep-alive value (format is not specified in this file).
    pub keep_alive: Option<String>,

    /// Optional `num_ctx` value.
    pub num_ctx: Option<u32>,
}
493
/// GLiNER extractor settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GlinerSetConfig {
    /// GLiNER switch; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,
    /// Model path; empty string when omitted.
    #[serde(default)]
    pub model_path: String,
    /// Tokenizer path; empty string when omitted.
    #[serde(default)]
    pub tokenizer_path: String,
    /// Mode name; `"span"` when omitted.
    #[serde(default = "default_gliner_mode")]
    pub mode: String,
    /// Entity labels; `person`, `organization`, `location` when omitted.
    #[serde(default = "default_gliner_entity_labels")]
    pub entity_labels: Vec<String>,
    /// Relation labels; `related to`, `part of` when omitted.
    #[serde(default = "default_gliner_relation_labels")]
    pub relation_labels: Vec<String>,
    /// Entity threshold; 0.4 when omitted.
    #[serde(default = "default_entity_threshold")]
    pub entity_threshold: f32,
    /// Relation threshold; 0.5 when omitted.
    #[serde(default = "default_relation_threshold")]
    pub relation_threshold: f32,
    /// GPU switch; `false` when omitted.
    #[serde(default)]
    pub use_gpu: bool,
}
525
/// Serde default for `GlinerSetConfig::mode`.
fn default_gliner_mode() -> String {
    String::from("span")
}

/// Serde default for `GlinerSetConfig::entity_labels`.
fn default_gliner_entity_labels() -> Vec<String> {
    ["person", "organization", "location"]
        .iter()
        .map(|label| label.to_string())
        .collect()
}

/// Serde default for `GlinerSetConfig::relation_labels`.
fn default_gliner_relation_labels() -> Vec<String> {
    ["related to", "part of"]
        .iter()
        .map(|label| label.to_string())
        .collect()
}

/// Serde default for `GlinerSetConfig::entity_threshold`.
fn default_entity_threshold() -> f32 {
    0.4_f32
}

/// Serde default for `GlinerSetConfig::relation_threshold`.
fn default_relation_threshold() -> f32 {
    0.5_f32
}
531
532impl Default for GlinerSetConfig {
533 fn default() -> Self {
534 Self {
535 enabled: false,
536 model_path: String::new(),
537 tokenizer_path: String::new(),
538 mode: default_gliner_mode(),
539 entity_labels: default_gliner_entity_labels(),
540 relation_labels: default_gliner_relation_labels(),
541 entity_threshold: default_entity_threshold(),
542 relation_threshold: default_relation_threshold(),
543 use_gpu: false,
544 }
545 }
546}
547
/// Experimental feature switches. All switches are `false` and both optional
/// sub-sections are `None` when omitted (this also matches the derived
/// `Default`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ExperimentalConfig {
    /// Neural reranking switch.
    #[serde(default)]
    pub neural_reranking: bool,

    /// Federated learning switch.
    #[serde(default)]
    pub federated_learning: bool,

    /// Real-time updates switch.
    #[serde(default)]
    pub real_time_updates: bool,

    /// Distributed processing switch.
    #[serde(default)]
    pub distributed_processing: bool,

    /// LazyGraphRAG switch.
    #[serde(default)]
    pub lazy_graphrag: bool,

    /// E2GraphRAG switch.
    #[serde(default)]
    pub e2_graphrag: bool,

    /// Optional LazyGraphRAG settings.
    #[serde(default)]
    pub lazy_graphrag_config: Option<LazyGraphRAGConfig>,

    /// Optional E2GraphRAG settings.
    #[serde(default)]
    pub e2_graphrag_config: Option<E2GraphRAGConfig>,
}
583
/// LazyGraphRAG settings (experimental section).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LazyGraphRAGConfig {
    /// Concept extraction switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_concept_extraction: bool,

    /// Minimum concept length; 3 when omitted.
    #[serde(default = "default_min_concept_length")]
    pub min_concept_length: usize,

    /// Maximum words per concept; 5 when omitted.
    #[serde(default = "default_max_concept_words")]
    pub max_concept_words: usize,

    /// Co-occurrence threshold; 1 when omitted.
    #[serde(default = "default_co_occurrence_threshold")]
    pub co_occurrence_threshold: usize,

    /// Query refinement switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_query_refinement: bool,

    /// Maximum refinement iterations; 3 when omitted.
    #[serde(default = "default_max_refinement_iterations")]
    pub max_refinement_iterations: usize,

    /// Bidirectional index switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
617
618impl Default for LazyGraphRAGConfig {
619 fn default() -> Self {
620 Self {
621 use_concept_extraction: true,
622 min_concept_length: 3,
623 max_concept_words: 5,
624 co_occurrence_threshold: 1,
625 use_query_refinement: true,
626 max_refinement_iterations: 3,
627 use_bidirectional_index: true,
628 }
629 }
630}
631
/// E2GraphRAG settings (experimental section).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct E2GraphRAGConfig {
    /// Lightweight NER switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_lightweight_ner: bool,

    /// Entity type names; `PERSON`, `ORGANIZATION`, `LOCATION`, `CONCEPT` when omitted.
    #[serde(default = "default_e2_entity_types")]
    pub entity_types: Vec<String>,

    /// Minimum confidence; 0.6 when omitted.
    #[serde(default = "default_e2_min_confidence")]
    pub min_confidence: f32,

    /// Capitalization detection switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_capitalization_detection: bool,

    /// Noun-phrase extraction switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_noun_phrase_extraction: bool,

    /// Minimum entity frequency; 1 when omitted.
    #[serde(default = "default_min_entity_frequency")]
    pub min_entity_frequency: usize,

    /// Fast co-occurrence switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_fast_cooccurrence: bool,

    /// Bidirectional index switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_bidirectional_index: bool,
}
669
670impl Default for E2GraphRAGConfig {
671 fn default() -> Self {
672 Self {
673 use_lightweight_ner: true,
674 entity_types: default_e2_entity_types(),
675 min_confidence: 0.6,
676 use_capitalization_detection: true,
677 use_noun_phrase_extraction: true,
678 min_entity_frequency: 1,
679 use_fast_cooccurrence: true,
680 use_bidirectional_index: true,
681 }
682 }
683}
684
/// Approach selection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModeConfig {
    /// Approach name; `"semantic"` when omitted.
    /// NOTE(review): presumably one of the pipeline section names (`semantic`,
    /// `algorithmic`, `hybrid`) - the accepted values are not validated here.
    #[serde(default = "default_approach")]
    pub approach: String,
}
700
701impl Default for ModeConfig {
702 fn default() -> Self {
703 Self {
704 approach: default_approach(),
705 }
706 }
707}
708
709#[derive(Debug, Clone, Serialize, Deserialize)]
712pub struct SemanticPipelineConfig {
713 #[serde(default)]
715 pub enabled: bool,
716
717 pub embeddings: SemanticEmbeddingsConfig,
719
720 pub entity_extraction: SemanticEntityConfig,
722
723 pub retrieval: SemanticRetrievalConfig,
725
726 pub graph_construction: SemanticGraphConfig,
728}
729
/// Embedding settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEmbeddingsConfig {
    /// Backend name; `"huggingface"` when omitted.
    #[serde(default = "default_semantic_embedding_backend")]
    pub backend: String,

    /// Model name; `"sentence-transformers/all-MiniLM-L6-v2"` when omitted.
    #[serde(default = "default_semantic_embedding_model")]
    pub model: String,

    /// Embedding dimension; 384 when omitted.
    #[serde(default = "default_semantic_embedding_dim")]
    pub dimension: usize,

    /// GPU switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_gpu: bool,

    /// Similarity metric name; `"cosine"` when omitted.
    #[serde(default = "default_similarity_metric")]
    pub similarity_metric: String,

    /// Batch size; 100 when omitted.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
757
/// Entity extraction settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticEntityConfig {
    /// Method name; `"llm"` when omitted.
    #[serde(default = "default_semantic_entity_method")]
    pub method: String,

    /// Gleaning switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds; 3 when omitted.
    #[serde(default = "default_max_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Model name; `"llama3.1:8b"` when omitted.
    #[serde(default = "default_chat_model")]
    pub model: String,

    /// Temperature; 0.1 when omitted.
    #[serde(default = "default_semantic_temperature")]
    pub temperature: f32,

    /// Confidence threshold; 0.7 when omitted.
    #[serde(default = "default_semantic_confidence")]
    pub confidence_threshold: f32,
}
785
/// Retrieval settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticRetrievalConfig {
    /// Strategy name; `"vector"` when omitted.
    #[serde(default = "default_semantic_retrieval_strategy")]
    pub strategy: String,

    /// HNSW switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_hnsw: bool,

    /// HNSW `ef_construction` value; 200 when omitted.
    #[serde(default = "default_hnsw_ef_construction")]
    pub hnsw_ef_construction: usize,

    /// HNSW `m` value; 16 when omitted.
    #[serde(default = "default_hnsw_m")]
    pub hnsw_m: usize,

    /// Number of results; 10 when omitted.
    #[serde(default = "default_top_k")]
    pub top_k: usize,

    /// Similarity threshold; 0.7 when omitted.
    #[serde(default = "default_semantic_similarity_threshold")]
    pub similarity_threshold: f32,
}
813
/// Graph construction settings of the semantic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticGraphConfig {
    /// Relation scorer name; `"embedding_similarity"` when omitted.
    #[serde(default = "default_semantic_relation_scorer")]
    pub relation_scorer: String,

    /// Transformer embeddings switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_transformer_embeddings: bool,

    /// Minimum relation score; 0.7 when omitted.
    #[serde(default = "default_min_relation_score")]
    pub min_relation_score: f32,
}
829
830#[derive(Debug, Clone, Serialize, Deserialize, Default)]
833pub struct AlgorithmicPipelineConfig {
834 #[serde(default)]
836 pub enabled: bool,
837
838 pub embeddings: AlgorithmicEmbeddingsConfig,
840
841 pub entity_extraction: AlgorithmicEntityConfig,
843
844 pub retrieval: AlgorithmicRetrievalConfig,
846
847 pub graph_construction: AlgorithmicGraphConfig,
849}
850
/// Embedding settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEmbeddingsConfig {
    /// Backend name; `"hash"` when omitted.
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub backend: String,

    /// Embedding dimension; 128 when omitted.
    #[serde(default = "default_algorithmic_embedding_dim")]
    pub dimension: usize,

    /// TF-IDF switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_tfidf: bool,

    /// Vocabulary size; 10000 when omitted.
    #[serde(default = "default_vocabulary_size")]
    pub vocabulary_size: usize,

    /// Minimum term frequency; 2 when omitted.
    #[serde(default = "default_min_term_frequency")]
    pub min_term_frequency: usize,

    /// Maximum document frequency; 0.8 when omitted.
    #[serde(default = "default_max_document_frequency")]
    pub max_document_frequency: f32,
}
878
/// Entity extraction settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicEntityConfig {
    /// Method name; `"pattern"` when omitted.
    #[serde(default = "default_algorithmic_entity_method")]
    pub method: String,

    /// NER rules switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_ner_rules: bool,

    /// POS tagging switch; `false` when omitted.
    #[serde(default)]
    pub use_pos_tagging: bool,

    /// Minimum entity length; 3 when omitted.
    #[serde(default = "default_min_entity_length")]
    pub min_entity_length: usize,

    /// Confidence threshold; 0.75 when omitted.
    #[serde(default = "default_algorithmic_confidence")]
    pub confidence_threshold: f32,

    /// Optional patterns (pattern syntax is not specified in this file).
    pub patterns: Option<Vec<String>>,
}
905
/// Retrieval settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicRetrievalConfig {
    /// Strategy name; `"bm25"` when omitted.
    #[serde(default = "default_algorithmic_retrieval_strategy")]
    pub strategy: String,

    /// `k1` parameter; 1.5 when omitted.
    #[serde(default = "default_bm25_k1")]
    pub k1: f32,

    /// `b` parameter; 0.75 when omitted.
    #[serde(default = "default_bm25_b")]
    pub b: f32,

    /// Stemming switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_stemming: bool,

    /// Language name; `"english"` when omitted.
    #[serde(default = "default_language")]
    pub language: String,

    /// Number of results; 10 when omitted.
    #[serde(default = "default_top_k")]
    pub top_k: usize,
}
933
/// Graph construction settings of the algorithmic pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmicGraphConfig {
    /// Relation scorer name; `"jaccard"` when omitted.
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub relation_scorer: String,

    /// Co-occurrence switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_cooccurrence: bool,

    /// Window size; 10 when omitted.
    #[serde(default = "default_cooccurrence_window")]
    pub window_size: usize,

    /// Minimum relation score; 0.6 when omitted.
    #[serde(default = "default_algorithmic_min_relation_score")]
    pub min_relation_score: f32,
}
953
/// Hybrid pipeline section.
///
/// The five sub-section fields carry no serde default, so each must be present
/// whenever this section appears in the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridPipelineConfig {
    /// Pipeline switch; `false` when omitted.
    #[serde(default)]
    pub enabled: bool,

    /// Weights sub-section (required).
    pub weights: HybridWeightsConfig,

    /// Embeddings sub-section (required).
    pub embeddings: HybridEmbeddingsConfig,

    /// Entity extraction sub-section (required).
    pub entity_extraction: HybridEntityConfig,

    /// Retrieval sub-section (required).
    pub retrieval: HybridRetrievalConfig,

    /// Graph construction sub-section (required).
    pub graph_construction: HybridGraphConfig,

    /// Fallback strategy name; `"semantic_first"` when omitted.
    #[serde(default = "default_hybrid_fallback_strategy")]
    pub fallback_strategy: String,

    /// Cross-validation switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub cross_validation: bool,
}
985
/// Weights of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridWeightsConfig {
    /// Semantic weight; 0.6 when omitted.
    #[serde(default = "default_hybrid_semantic_weight")]
    pub semantic_weight: f32,

    /// Algorithmic weight; 0.4 when omitted.
    #[serde(default = "default_hybrid_algorithmic_weight")]
    pub algorithmic_weight: f32,
}
997
/// Embedding settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEmbeddingsConfig {
    /// Primary backend name; `"huggingface"` when omitted.
    #[serde(default = "default_semantic_embedding_backend")]
    pub primary: String,

    /// Fallback backend name; `"hash"` when omitted.
    #[serde(default = "default_algorithmic_embedding_backend")]
    pub fallback: String,

    /// Score combination switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub combine_scores: bool,

    /// Automatic fallback switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub auto_fallback: bool,
}
1017
/// Entity extraction settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridEntityConfig {
    /// Use-both switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub use_both: bool,

    /// LLM weight; 0.7 when omitted.
    #[serde(default = "default_hybrid_llm_weight")]
    pub llm_weight: f32,

    /// Pattern weight; 0.3 when omitted.
    #[serde(default = "default_hybrid_pattern_weight")]
    pub pattern_weight: f32,

    /// Cross-validation switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub cross_validate: bool,

    /// Confidence boost; 0.15 when omitted.
    #[serde(default = "default_hybrid_confidence_boost")]
    pub confidence_boost: f32,
}
1041
/// Retrieval settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridRetrievalConfig {
    /// Strategy name; `"fusion"` when omitted.
    #[serde(default = "default_hybrid_retrieval_strategy")]
    pub strategy: String,

    /// Vector + BM25 combination switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub combine_vector_bm25: bool,

    /// Vector weight; 0.6 when omitted.
    #[serde(default = "default_hybrid_vector_weight")]
    pub vector_weight: f32,

    /// BM25 weight; 0.4 when omitted.
    #[serde(default = "default_hybrid_bm25_weight")]
    pub bm25_weight: f32,

    /// RRF constant; 60 when omitted.
    #[serde(default = "default_rrf_constant")]
    pub rrf_constant: usize,
}
1065
/// Graph construction settings of the hybrid pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HybridGraphConfig {
    /// Primary scorer name; `"embedding_similarity"` when omitted.
    #[serde(default = "default_semantic_relation_scorer")]
    pub primary_scorer: String,

    /// Fallback scorer name; `"jaccard"` when omitted.
    #[serde(default = "default_algorithmic_relation_scorer")]
    pub fallback_scorer: String,

    /// Score combination switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub combine_scores: bool,
}
1081
/// Top-level entity-extraction section (distinct from the pipeline's
/// [`EntityExtractionConfig`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityExtractionTopLevelConfig {
    /// Extraction switch; `true` when omitted.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Minimum confidence; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub min_confidence: f32,

    /// Gleaning switch; `false` when omitted.
    #[serde(default)]
    pub use_gleaning: bool,

    /// Maximum gleaning rounds; 3 when omitted.
    #[serde(default = "default_gleaning_rounds")]
    pub max_gleaning_rounds: usize,

    /// Gleaning improvement threshold; 0.1 when omitted.
    #[serde(default = "default_gleaning_improvement")]
    pub gleaning_improvement_threshold: f32,

    /// Semantic merging switch; `false` when omitted.
    #[serde(default)]
    pub semantic_merging: bool,

    /// Merge similarity threshold; 0.85 when omitted.
    #[serde(default = "default_merge_threshold")]
    pub merge_similarity_threshold: f32,

    /// Automatic linking switch; `false` when omitted.
    #[serde(default)]
    pub automatic_linking: bool,

    /// Linking confidence threshold; 0.8 when omitted.
    #[serde(default = "default_confidence_threshold")]
    pub linking_confidence_threshold: f32,
}
1121
1122impl Default for EntityExtractionTopLevelConfig {
1123 fn default() -> Self {
1124 Self {
1125 enabled: true,
1126 min_confidence: default_confidence_threshold(),
1127 use_gleaning: false,
1128 max_gleaning_rounds: default_gleaning_rounds(),
1129 gleaning_improvement_threshold: default_gleaning_improvement(),
1130 semantic_merging: false,
1131 merge_similarity_threshold: default_merge_threshold(),
1132 automatic_linking: false,
1133 linking_confidence_threshold: default_confidence_threshold(),
1134 }
1135 }
1136}
1137
/// Serde default for `GeneralConfig::log_level`.
fn default_log_level() -> String {
    String::from("info")
}

/// Serde default for `GeneralConfig::output_dir`.
fn default_output_dir() -> String {
    String::from("./output")
}

/// Shared serde default for boolean switches that are on unless disabled.
fn default_true() -> bool {
    true
}

/// Serde default for `PipelineConfig::workflows`.
fn default_workflows() -> Vec<String> {
    [
        "extract_text",
        "extract_entities",
        "build_graph",
        "detect_communities",
    ]
    .iter()
    .map(|step| step.to_string())
    .collect()
}

/// Serde default for `TextExtractionConfig::chunk_size`.
fn default_chunk_size() -> usize {
    512
}

/// Serde default for `TextExtractionConfig::chunk_overlap`.
fn default_chunk_overlap() -> usize {
    64
}

/// Serde default for `TextExtractionConfig::min_chunk_size`.
fn default_min_chunk_size() -> usize {
    50
}

/// Serde default for `EntityExtractionConfig::model_name`.
fn default_ner_model() -> String {
    String::from("microsoft/DialoGPT-medium")
}

/// Serde default for `LLMParamsConfig::temperature`.
fn default_temperature() -> f32 {
    0.1_f32
}

/// Serde default for `EntityExtractionConfig::temperature`.
fn default_extraction_temperature() -> f32 {
    0.0_f32
}

/// Serde default for `EntityExtractionConfig::max_tokens`.
fn default_max_tokens() -> usize {
    2048
}

/// Serde default for the `min_entity_length` fields.
fn default_min_entity_length() -> usize {
    3
}

/// Serde default for `EntityFiltersConfig::max_entity_length`.
fn default_max_entity_length() -> usize {
    100
}

/// Shared serde default for the general confidence-threshold fields.
fn default_confidence_threshold() -> f32 {
    0.8_f32
}
/// Serde default for `GraphBuildingConfig::relation_scorer`.
fn default_relation_scorer() -> String {
    String::from("cosine_similarity")
}

/// Serde default for `min_relation_score` in the graph-building and semantic
/// graph sections.
fn default_min_relation_score() -> f32 {
    0.7_f32
}

/// Serde default for `GraphBuildingConfig::max_connections_per_node`.
fn default_max_connections() -> usize {
    10
}

/// Serde default for `CommunityDetectionConfig::algorithm`.
fn default_community_algorithm() -> String {
    String::from("leiden")
}

/// Serde default for `CommunityDetectionConfig::resolution`.
fn default_resolution() -> f32 {
    1.0_f32
}

/// Serde default for `CommunityDetectionConfig::min_community_size`.
fn default_min_community_size() -> usize {
    3
}

/// Serde default for `StorageConfig::database_type`.
fn default_database_type() -> String {
    String::from("sqlite")
}

/// Serde default for `StorageConfig::database_path`.
fn default_database_path() -> String {
    String::from("./graphrag.db")
}

/// Serde default for `PostgreSQLConfig::pool_size`.
fn default_pool_size() -> usize {
    10
}
/// Serde default for `ModelsConfig::primary_llm`.
fn default_primary_llm() -> String {
    String::from("gpt-4")
}

/// Serde default for `ModelsConfig::embedding_model`.
fn default_embedding_model() -> String {
    String::from("text-embedding-ada-002")
}

/// Serde default for `ModelsConfig::max_context_length`.
fn default_max_context() -> usize {
    4096
}

/// Serde default for `LLMParamsConfig::top_p`.
fn default_top_p() -> f32 {
    0.9_f32
}

/// Serde default for `LocalModelsConfig::ollama_base_url`.
fn default_ollama_url() -> String {
    String::from("http://localhost:11434")
}

/// Serde default for `LocalModelsConfig::model_name`.
fn default_ollama_model() -> String {
    String::from("llama2:7b")
}

/// Serde default for `LocalModelsConfig::embedding_model`.
fn default_ollama_embedding() -> String {
    String::from("nomic-embed-text")
}

/// Serde default for the `batch_size` fields.
fn default_batch_size() -> usize {
    100
}

/// Serde default for `PerformanceConfig::worker_threads`.
fn default_worker_threads() -> usize {
    4
}

/// Serde default for `PerformanceConfig::memory_limit_mb`.
fn default_memory_limit() -> usize {
    1024
}
/// Serde default for `OllamaSetConfig::host`.
fn default_ollama_host() -> String {
    String::from("http://localhost")
}

/// Serde default for `OllamaSetConfig::port`.
fn default_ollama_port() -> u16 {
    11434
}

/// Serde default for `OllamaSetConfig::chat_model` and
/// `SemanticEntityConfig::model`.
fn default_chat_model() -> String {
    String::from("llama3.1:8b")
}

/// Serde default for `OllamaSetConfig::embedding_model`.
fn default_embedding_model_ollama() -> String {
    String::from("nomic-embed-text")
}

/// Serde default for `OllamaSetConfig::timeout_seconds`.
fn default_timeout() -> u64 {
    60
}

/// Serde default for `OllamaSetConfig::max_retries`.
fn default_max_retries() -> u32 {
    3
}

/// Serde default for `EntityExtractionTopLevelConfig::max_gleaning_rounds`.
fn default_gleaning_rounds() -> usize {
    3
}

/// Serde default for
/// `EntityExtractionTopLevelConfig::gleaning_improvement_threshold`.
fn default_gleaning_improvement() -> f32 {
    0.1_f32
}

/// Serde default for `EntityExtractionTopLevelConfig::merge_similarity_threshold`.
fn default_merge_threshold() -> f32 {
    0.85_f32
}
1270
/// Serde default for `ModeConfig::approach`.
fn default_approach() -> String {
    String::from("semantic")
}
1279
/// Serde default for `SemanticEmbeddingsConfig::backend` and
/// `HybridEmbeddingsConfig::primary`.
fn default_semantic_embedding_backend() -> String {
    String::from("huggingface")
}

/// Serde default for `SemanticEmbeddingsConfig::model`.
fn default_semantic_embedding_model() -> String {
    String::from("sentence-transformers/all-MiniLM-L6-v2")
}

/// Serde default for `SemanticEmbeddingsConfig::dimension`.
fn default_semantic_embedding_dim() -> usize {
    384
}

/// Serde default for `SemanticEmbeddingsConfig::similarity_metric`.
fn default_similarity_metric() -> String {
    String::from("cosine")
}

/// Serde default for `SemanticEntityConfig::method`.
fn default_semantic_entity_method() -> String {
    String::from("llm")
}

/// Serde default for `SemanticEntityConfig::max_gleaning_rounds`.
fn default_max_gleaning_rounds() -> usize {
    3
}

/// Serde default for `SemanticEntityConfig::temperature`.
fn default_semantic_temperature() -> f32 {
    0.1_f32
}

/// Serde default for `SemanticEntityConfig::confidence_threshold`.
fn default_semantic_confidence() -> f32 {
    0.7_f32
}

/// Serde default for `SemanticRetrievalConfig::strategy`.
fn default_semantic_retrieval_strategy() -> String {
    String::from("vector")
}

/// Serde default for `SemanticRetrievalConfig::hnsw_ef_construction`.
fn default_hnsw_ef_construction() -> usize {
    200
}

/// Serde default for `SemanticRetrievalConfig::hnsw_m`.
fn default_hnsw_m() -> usize {
    16
}

/// Serde default for the `top_k` fields.
fn default_top_k() -> usize {
    10
}

/// Serde default for `SemanticRetrievalConfig::similarity_threshold`.
fn default_semantic_similarity_threshold() -> f32 {
    0.7_f32
}

/// Serde default for `SemanticGraphConfig::relation_scorer` and
/// `HybridGraphConfig::primary_scorer`.
fn default_semantic_relation_scorer() -> String {
    String::from("embedding_similarity")
}
1323
/// Serde default for `AlgorithmicEmbeddingsConfig::backend` and
/// `HybridEmbeddingsConfig::fallback`.
fn default_algorithmic_embedding_backend() -> String {
    String::from("hash")
}

/// Serde default for `AlgorithmicEmbeddingsConfig::dimension`.
fn default_algorithmic_embedding_dim() -> usize {
    128
}

/// Serde default for `AlgorithmicEmbeddingsConfig::vocabulary_size`.
fn default_vocabulary_size() -> usize {
    10000
}

/// Serde default for `AlgorithmicEmbeddingsConfig::min_term_frequency`.
fn default_min_term_frequency() -> usize {
    2
}

/// Serde default for `AlgorithmicEmbeddingsConfig::max_document_frequency`.
fn default_max_document_frequency() -> f32 {
    0.8_f32
}

/// Serde default for `AlgorithmicEntityConfig::method`.
fn default_algorithmic_entity_method() -> String {
    String::from("pattern")
}

/// Serde default for `AlgorithmicEntityConfig::confidence_threshold`.
fn default_algorithmic_confidence() -> f32 {
    0.75_f32
}

/// Serde default for `AlgorithmicRetrievalConfig::strategy`.
fn default_algorithmic_retrieval_strategy() -> String {
    String::from("bm25")
}

/// Serde default for `AlgorithmicRetrievalConfig::k1`.
fn default_bm25_k1() -> f32 {
    1.5_f32
}

/// Serde default for `AlgorithmicRetrievalConfig::b`.
fn default_bm25_b() -> f32 {
    0.75_f32
}

/// Serde default for `AlgorithmicRetrievalConfig::language`.
fn default_language() -> String {
    String::from("english")
}

/// Serde default for `AlgorithmicGraphConfig::relation_scorer` and
/// `HybridGraphConfig::fallback_scorer`.
fn default_algorithmic_relation_scorer() -> String {
    String::from("jaccard")
}

/// Serde default for `AlgorithmicGraphConfig::window_size`.
fn default_cooccurrence_window() -> usize {
    10
}

/// Serde default for `AlgorithmicGraphConfig::min_relation_score`.
fn default_algorithmic_min_relation_score() -> f32 {
    0.6_f32
}
1367
/// Serde default for `HybridWeightsConfig::semantic_weight`.
fn default_hybrid_semantic_weight() -> f32 {
    0.6_f32
}

/// Serde default for `HybridWeightsConfig::algorithmic_weight`.
fn default_hybrid_algorithmic_weight() -> f32 {
    0.4_f32
}

/// Serde default for `HybridEntityConfig::llm_weight`.
fn default_hybrid_llm_weight() -> f32 {
    0.7_f32
}

/// Serde default for `HybridEntityConfig::pattern_weight`.
fn default_hybrid_pattern_weight() -> f32 {
    0.3_f32
}

/// Serde default for `HybridEntityConfig::confidence_boost`.
fn default_hybrid_confidence_boost() -> f32 {
    0.15_f32
}

/// Serde default for `HybridRetrievalConfig::strategy`.
fn default_hybrid_retrieval_strategy() -> String {
    String::from("fusion")
}

/// Serde default for `HybridRetrievalConfig::vector_weight`.
fn default_hybrid_vector_weight() -> f32 {
    0.6_f32
}

/// Serde default for `HybridRetrievalConfig::bm25_weight`.
fn default_hybrid_bm25_weight() -> f32 {
    0.4_f32
}

/// Serde default for `HybridRetrievalConfig::rrf_constant`.
fn default_rrf_constant() -> usize {
    60
}

/// Serde default for `HybridPipelineConfig::fallback_strategy`.
fn default_hybrid_fallback_strategy() -> String {
    String::from("semantic_first")
}
/// Serde default for `AutoSaveSetConfig::interval_seconds`.
fn default_auto_save_interval() -> u64 {
    300_u64
}

/// Serde default for `AutoSaveSetConfig::max_versions`.
fn default_max_auto_save_versions() -> usize {
    5_usize
}
1405
/// Serde default for `LazyGraphRAGConfig::min_concept_length`.
fn default_min_concept_length() -> usize {
    3_usize
}

/// Serde default for `LazyGraphRAGConfig::max_concept_words`.
fn default_max_concept_words() -> usize {
    5_usize
}

/// Serde default for `LazyGraphRAGConfig::co_occurrence_threshold`.
fn default_co_occurrence_threshold() -> usize {
    1_usize
}

/// Serde default for `LazyGraphRAGConfig::max_refinement_iterations`.
fn default_max_refinement_iterations() -> usize {
    3_usize
}
1419
/// Serde default for `E2GraphRAGConfig::entity_types`.
fn default_e2_entity_types() -> Vec<String> {
    ["PERSON", "ORGANIZATION", "LOCATION", "CONCEPT"]
        .iter()
        .map(|kind| kind.to_string())
        .collect()
}

/// Serde default for `E2GraphRAGConfig::min_confidence`.
fn default_e2_min_confidence() -> f32 {
    0.6_f32
}

/// Serde default for `E2GraphRAGConfig::min_entity_frequency`.
fn default_min_entity_frequency() -> usize {
    1_usize
}
1435
1436impl Default for GeneralConfig {
1437 fn default() -> Self {
1438 Self {
1439 log_level: default_log_level(),
1440 output_dir: default_output_dir(),
1441 input_document_path: None,
1442 max_threads: None,
1443 enable_profiling: false,
1444 }
1445 }
1446}
1447
1448impl Default for PipelineConfig {
1449 fn default() -> Self {
1450 Self {
1451 workflows: default_workflows(),
1452 parallel_execution: default_true(),
1453 text_extraction: TextExtractionConfig::default(),
1454 entity_extraction: EntityExtractionConfig::default(),
1455 graph_building: GraphBuildingConfig::default(),
1456 community_detection: CommunityDetectionConfig::default(),
1457 }
1458 }
1459}
1460
1461impl Default for TextExtractionConfig {
1462 fn default() -> Self {
1463 Self {
1464 chunk_size: default_chunk_size(),
1465 chunk_overlap: default_chunk_overlap(),
1466 clean_control_chars: default_true(),
1467 min_chunk_size: default_min_chunk_size(),
1468 cleaning: None,
1469 }
1470 }
1471}
1472
1473impl Default for EntityExtractionConfig {
1474 fn default() -> Self {
1475 Self {
1476 model_name: default_ner_model(),
1477 temperature: default_temperature(),
1478 max_tokens: default_max_tokens(),
1479 entity_types: None,
1480 confidence_threshold: default_confidence_threshold(),
1481 custom_prompt: None,
1482 filters: None,
1483 }
1484 }
1485}
1486
1487impl Default for GraphBuildingConfig {
1488 fn default() -> Self {
1489 Self {
1490 relation_scorer: default_relation_scorer(),
1491 min_relation_score: default_min_relation_score(),
1492 max_connections_per_node: default_max_connections(),
1493 bidirectional_relations: default_true(),
1494 }
1495 }
1496}
1497
1498impl Default for CommunityDetectionConfig {
1499 fn default() -> Self {
1500 Self {
1501 algorithm: default_community_algorithm(),
1502 resolution: default_resolution(),
1503 min_community_size: default_min_community_size(),
1504 max_community_size: 0,
1505 }
1506 }
1507}
1508
1509impl Default for StorageConfig {
1510 fn default() -> Self {
1511 Self {
1512 database_type: default_database_type(),
1513 database_path: default_database_path(),
1514 enable_wal: default_true(),
1515 postgresql: None,
1516 neo4j: None,
1517 }
1518 }
1519}
1520
1521impl Default for ModelsConfig {
1522 fn default() -> Self {
1523 Self {
1524 primary_llm: default_primary_llm(),
1525 embedding_model: default_embedding_model(),
1526 max_context_length: default_max_context(),
1527 llm_params: None,
1528 local: None,
1529 }
1530 }
1531}
1532
1533impl Default for PerformanceConfig {
1534 fn default() -> Self {
1535 Self {
1536 batch_processing: default_true(),
1537 batch_size: default_batch_size(),
1538 worker_threads: default_worker_threads(),
1539 memory_limit_mb: default_memory_limit(),
1540 }
1541 }
1542}
1543
1544impl Default for OllamaSetConfig {
1545 fn default() -> Self {
1546 Self {
1547 enabled: default_true(),
1548 host: default_ollama_host(),
1549 port: default_ollama_port(),
1550 chat_model: default_chat_model(),
1551 embedding_model: default_embedding_model_ollama(),
1552 timeout_seconds: default_timeout(),
1553 max_retries: default_max_retries(),
1554 fallback_to_hash: false,
1555 max_tokens: Some(800),
1556 temperature: Some(0.3),
1557 keep_alive: None,
1558 num_ctx: None,
1559 }
1560 }
1561}
1562
1563impl Default for SemanticPipelineConfig {
1568 fn default() -> Self {
1569 Self {
1570 enabled: true,
1571 embeddings: SemanticEmbeddingsConfig::default(),
1572 entity_extraction: SemanticEntityConfig::default(),
1573 retrieval: SemanticRetrievalConfig::default(),
1574 graph_construction: SemanticGraphConfig::default(),
1575 }
1576 }
1577}
1578
1579impl Default for SemanticEmbeddingsConfig {
1580 fn default() -> Self {
1581 Self {
1582 backend: default_semantic_embedding_backend(),
1583 model: default_semantic_embedding_model(),
1584 dimension: default_semantic_embedding_dim(),
1585 use_gpu: default_true(),
1586 similarity_metric: default_similarity_metric(),
1587 batch_size: default_batch_size(),
1588 }
1589 }
1590}
1591
1592impl Default for SemanticEntityConfig {
1593 fn default() -> Self {
1594 Self {
1595 method: default_semantic_entity_method(),
1596 use_gleaning: default_true(),
1597 max_gleaning_rounds: default_max_gleaning_rounds(),
1598 model: default_chat_model(),
1599 temperature: default_semantic_temperature(),
1600 confidence_threshold: default_semantic_confidence(),
1601 }
1602 }
1603}
1604
1605impl Default for SemanticRetrievalConfig {
1606 fn default() -> Self {
1607 Self {
1608 strategy: default_semantic_retrieval_strategy(),
1609 use_hnsw: default_true(),
1610 hnsw_ef_construction: default_hnsw_ef_construction(),
1611 hnsw_m: default_hnsw_m(),
1612 top_k: default_top_k(),
1613 similarity_threshold: default_semantic_similarity_threshold(),
1614 }
1615 }
1616}
1617
1618impl Default for SemanticGraphConfig {
1619 fn default() -> Self {
1620 Self {
1621 relation_scorer: default_semantic_relation_scorer(),
1622 use_transformer_embeddings: default_true(),
1623 min_relation_score: default_min_relation_score(),
1624 }
1625 }
1626}
1627
1628impl Default for AlgorithmicEmbeddingsConfig {
1629 fn default() -> Self {
1630 Self {
1631 backend: default_algorithmic_embedding_backend(),
1632 dimension: default_algorithmic_embedding_dim(),
1633 use_tfidf: default_true(),
1634 vocabulary_size: default_vocabulary_size(),
1635 min_term_frequency: default_min_term_frequency(),
1636 max_document_frequency: default_max_document_frequency(),
1637 }
1638 }
1639}
1640
1641impl Default for AlgorithmicEntityConfig {
1642 fn default() -> Self {
1643 Self {
1644 method: default_algorithmic_entity_method(),
1645 use_ner_rules: default_true(),
1646 use_pos_tagging: false,
1647 min_entity_length: default_min_entity_length(),
1648 confidence_threshold: default_algorithmic_confidence(),
1649 patterns: None,
1650 }
1651 }
1652}
1653
1654impl Default for AlgorithmicRetrievalConfig {
1655 fn default() -> Self {
1656 Self {
1657 strategy: default_algorithmic_retrieval_strategy(),
1658 k1: default_bm25_k1(),
1659 b: default_bm25_b(),
1660 use_stemming: default_true(),
1661 language: default_language(),
1662 top_k: default_top_k(),
1663 }
1664 }
1665}
1666
1667impl Default for AlgorithmicGraphConfig {
1668 fn default() -> Self {
1669 Self {
1670 relation_scorer: default_algorithmic_relation_scorer(),
1671 use_cooccurrence: default_true(),
1672 window_size: default_cooccurrence_window(),
1673 min_relation_score: default_algorithmic_min_relation_score(),
1674 }
1675 }
1676}
1677
1678impl Default for HybridPipelineConfig {
1679 fn default() -> Self {
1680 Self {
1681 enabled: false,
1682 weights: HybridWeightsConfig::default(),
1683 embeddings: HybridEmbeddingsConfig::default(),
1684 entity_extraction: HybridEntityConfig::default(),
1685 retrieval: HybridRetrievalConfig::default(),
1686 graph_construction: HybridGraphConfig::default(),
1687 fallback_strategy: default_hybrid_fallback_strategy(),
1688 cross_validation: default_true(),
1689 }
1690 }
1691}
1692
1693impl Default for HybridWeightsConfig {
1694 fn default() -> Self {
1695 Self {
1696 semantic_weight: default_hybrid_semantic_weight(),
1697 algorithmic_weight: default_hybrid_algorithmic_weight(),
1698 }
1699 }
1700}
1701
1702impl Default for HybridEmbeddingsConfig {
1703 fn default() -> Self {
1704 Self {
1705 primary: default_semantic_embedding_backend(),
1706 fallback: default_algorithmic_embedding_backend(),
1707 combine_scores: default_true(),
1708 auto_fallback: default_true(),
1709 }
1710 }
1711}
1712
1713impl Default for HybridEntityConfig {
1714 fn default() -> Self {
1715 Self {
1716 use_both: default_true(),
1717 llm_weight: default_hybrid_llm_weight(),
1718 pattern_weight: default_hybrid_pattern_weight(),
1719 cross_validate: default_true(),
1720 confidence_boost: default_hybrid_confidence_boost(),
1721 }
1722 }
1723}
1724
1725impl Default for HybridRetrievalConfig {
1726 fn default() -> Self {
1727 Self {
1728 strategy: default_hybrid_retrieval_strategy(),
1729 combine_vector_bm25: default_true(),
1730 vector_weight: default_hybrid_vector_weight(),
1731 bm25_weight: default_hybrid_bm25_weight(),
1732 rrf_constant: default_rrf_constant(),
1733 }
1734 }
1735}
1736
1737impl Default for HybridGraphConfig {
1738 fn default() -> Self {
1739 Self {
1740 primary_scorer: default_semantic_relation_scorer(),
1741 fallback_scorer: default_algorithmic_relation_scorer(),
1742 combine_scores: default_true(),
1743 }
1744 }
1745}
1746
1747impl SetConfig {
1748 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
1750 let path_ref = path.as_ref();
1751 let content = fs::read_to_string(path_ref)?;
1752
1753 let extension = path_ref.extension().and_then(|e| e.to_str()).unwrap_or("");
1755
1756 let config: SetConfig = match extension {
1757 #[cfg(feature = "json5-support")]
1758 "json5" | "json" => {
1759 json5::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1760 message: format!("JSON5 parse error: {e}"),
1761 })?
1762 },
1763 #[cfg(not(feature = "json5-support"))]
1764 "json5" | "json" => {
1765 return Err(crate::core::GraphRAGError::Config {
1766 message: "JSON5 support not enabled. Rebuild with --features json5-support"
1767 .to_string(),
1768 });
1769 },
1770 _ => toml::from_str(&content).map_err(|e| crate::core::GraphRAGError::Config {
1771 message: format!("TOML parse error: {e}"),
1772 })?,
1773 };
1774
1775 Ok(config)
1776 }
1777
1778 pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
1780 let toml_string =
1781 toml::to_string_pretty(&self).map_err(|e| crate::core::GraphRAGError::Config {
1782 message: format!("TOML serialize error: {e}"),
1783 })?;
1784
1785 let commented_toml = format!(
1787 "# =============================================================================\n\
1788 # GraphRAG Configuration File\n\
1789 # Complete configuration with extensive parameters for easy customization\n\
1790 # =============================================================================\n\n{toml_string}"
1791 );
1792
1793 fs::write(path, commented_toml)?;
1794 Ok(())
1795 }
1796
1797 pub fn to_graphrag_config(&self) -> crate::Config {
1799 let mut config = crate::Config {
1800 approach: self.mode.approach.clone(),
1801 ..Default::default()
1802 };
1803
1804 config.text.chunk_size = self.pipeline.text_extraction.chunk_size;
1806 config.text.chunk_overlap = self.pipeline.text_extraction.chunk_overlap;
1807
1808 config.entities.min_confidence = self.entity_extraction.min_confidence;
1810
1811 if let Some(ref types) = self.pipeline.entity_extraction.entity_types {
1813 config.entities.entity_types = types.clone();
1814 }
1815
1816 match self.mode.approach.as_str() {
1821 "semantic" => {
1822 if let Some(ref semantic) = self.semantic {
1823 config.entities.use_gleaning = semantic.entity_extraction.use_gleaning;
1824 config.entities.max_gleaning_rounds =
1825 semantic.entity_extraction.max_gleaning_rounds;
1826 config.entities.min_confidence =
1827 semantic.entity_extraction.confidence_threshold;
1828 } else {
1829 config.entities.use_gleaning = self.entity_extraction.use_gleaning;
1831 config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
1832 config.entities.min_confidence = self.entity_extraction.min_confidence;
1833 }
1834 },
1835 "algorithmic" => {
1836 config.entities.use_gleaning = false;
1838 if let Some(ref algorithmic) = self.algorithmic {
1839 config.entities.min_confidence =
1840 algorithmic.entity_extraction.confidence_threshold;
1841 }
1842 },
1843 "hybrid" => {
1844 config.entities.use_gleaning = true;
1846 if self.hybrid.is_some() {
1847 config.entities.max_gleaning_rounds = 2; }
1850 },
1851 _ => {
1852 config.entities.use_gleaning = self.entity_extraction.use_gleaning;
1854 config.entities.max_gleaning_rounds = self.entity_extraction.max_gleaning_rounds;
1855 },
1856 }
1857
1858 config.graph.similarity_threshold = self.pipeline.graph_building.min_relation_score;
1860 config.graph.max_connections = self.pipeline.graph_building.max_connections_per_node;
1861 config.graph.extract_relationships = true; config.graph.relationship_confidence_threshold = 0.5; config.retrieval.top_k = 10; config.embeddings.dimension = 768; config.embeddings.backend = "ollama".to_string();
1870 config.embeddings.fallback_to_hash = self.ollama.fallback_to_hash;
1871
1872 config.parallel.enabled = self.pipeline.parallel_execution;
1874 config.parallel.num_threads = self.performance.worker_threads;
1875
1876 config.ollama = crate::ollama::OllamaConfig {
1878 enabled: self.ollama.enabled,
1879 host: self.ollama.host.clone(),
1880 port: self.ollama.port,
1881 chat_model: self.ollama.chat_model.clone(),
1882 embedding_model: self.ollama.embedding_model.clone(),
1883 timeout_seconds: self.ollama.timeout_seconds,
1884 max_retries: self.ollama.max_retries,
1885 fallback_to_hash: self.ollama.fallback_to_hash,
1886 max_tokens: self.ollama.max_tokens,
1887 temperature: self.ollama.temperature,
1888 enable_caching: true,
1889 keep_alive: self.ollama.keep_alive.clone(),
1890 num_ctx: self.ollama.num_ctx,
1891 };
1892
1893 config.gliner = crate::config::GlinerConfig {
1895 enabled: self.gliner.enabled,
1896 model_path: self.gliner.model_path.clone(),
1897 tokenizer_path: self.gliner.tokenizer_path.clone(),
1898 mode: self.gliner.mode.clone(),
1899 entity_labels: self.gliner.entity_labels.clone(),
1900 relation_labels: self.gliner.relation_labels.clone(),
1901 entity_threshold: self.gliner.entity_threshold,
1902 relation_threshold: self.gliner.relation_threshold,
1903 use_gpu: self.gliner.use_gpu,
1904 };
1905
1906 config.auto_save = crate::config::AutoSaveConfig {
1908 enabled: self.auto_save.enabled,
1909 base_dir: self.auto_save.base_dir.clone(),
1910 interval_seconds: self.auto_save.interval_seconds,
1911 workspace_name: self.auto_save.workspace_name.clone(),
1912 max_versions: self.auto_save.max_versions,
1913 };
1914
1915 config
1916 }
1917}