graphrag_core/config/
validation.rs1use crate::{GraphRAGError, Result};
2use crate::config::{Config, SetConfig};
3use std::path::Path;
4
/// Outcome of validating a configuration.
///
/// Collects hard errors (which make the configuration unusable), non-fatal
/// warnings, and optional tuning suggestions.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// True while no error has been recorded; `add_error` flips it to false.
    pub is_valid: bool,
    /// Fatal problems; any entry means the configuration is invalid.
    pub errors: Vec<String>,
    /// Non-fatal issues worth surfacing to the user.
    pub warnings: Vec<String>,
    /// Optional hints for improving the configuration.
    pub suggestions: Vec<String>,
}

impl Default for ValidationResult {
    /// A fresh result is valid until an error is recorded.
    ///
    /// BUG FIX: the derived `Default` produced `is_valid: false`, so an
    /// error-free validation run still reported the configuration as
    /// invalid. `add_error` setting `is_valid = false` shows the intended
    /// convention is "valid until proven otherwise".
    fn default() -> Self {
        Self {
            is_valid: true,
            errors: Vec::new(),
            warnings: Vec::new(),
            suggestions: Vec::new(),
        }
    }
}
17
18impl ValidationResult {
19 pub fn new() -> Self {
21 Self::default()
22 }
23
24 pub fn add_error(&mut self, error: String) {
26 self.errors.push(error);
27 self.is_valid = false;
28 }
29
30 pub fn add_warning(&mut self, warning: String) {
32 self.warnings.push(warning);
33 }
34
35 pub fn add_suggestion(&mut self, suggestion: String) {
37 self.suggestions.push(suggestion);
38 }
39}
40
/// Types whose configuration can be checked for consistency.
pub trait Validatable {
    /// Checks the values themselves for logical consistency (no I/O).
    fn validate(&self) -> ValidationResult;
    /// Runs `validate` plus environment-dependent checks (filesystem,
    /// compiled feature flags) and extra tuning suggestions.
    fn validate_strict(&self) -> ValidationResult;
}
48
49impl Validatable for Config {
50 fn validate(&self) -> ValidationResult {
51 let mut result = ValidationResult::new();
52
53 if self.output_dir.is_empty() {
55 result.add_error("Output directory cannot be empty".to_string());
56 }
57
58 if self.chunk_size == 0 {
60 result.add_error("Chunk size must be greater than 0".to_string());
61 } else if self.chunk_size < 100 {
62 result.add_warning("Chunk size is very small (<100), this may affect performance".to_string());
63 } else if self.chunk_size > 10000 {
64 result.add_warning("Chunk size is very large (>10000), this may affect quality".to_string());
65 } else {
66 }
68
69 if self.chunk_overlap >= self.chunk_size {
71 result.add_error("Chunk overlap must be less than chunk size".to_string());
72 } else if self.chunk_overlap > self.chunk_size / 2 {
73 result.add_warning("Chunk overlap is more than 50% of chunk size, this may be inefficient".to_string());
74 } else {
75 }
77
78 if let Some(max_entities) = self.max_entities_per_chunk {
80 if max_entities == 0 {
81 result.add_error("Max entities per chunk must be greater than 0".to_string());
82 } else if max_entities > 100 {
83 result.add_warning("Max entities per chunk is very high (>100)".to_string());
84 } else {
85 }
87 }
88
89 if let Some(top_k) = self.top_k_results {
91 if top_k == 0 {
92 result.add_error("Top-k results must be greater than 0".to_string());
93 } else if top_k > 100 {
94 result.add_warning("Top-k results is very high (>100), this may affect performance".to_string());
95 } else {
96 }
98 }
99
100 if let Some(threshold) = self.similarity_threshold {
102 if !(0.0..=1.0).contains(&threshold) {
103 result.add_error("Similarity threshold must be between 0.0 and 1.0".to_string());
104 } else if threshold < 0.1 {
105 result.add_warning("Similarity threshold is very low (<0.1), this may return irrelevant results".to_string());
106 } else if threshold > 0.9 {
107 result.add_warning("Similarity threshold is very high (>0.9), this may return too few results".to_string());
108 } else {
109 }
111 }
112
113 if self.chunk_size > 1000 && self.chunk_overlap < 100 {
115 result.add_suggestion("Consider increasing chunk overlap for better context preservation with large chunks".to_string());
116 }
117
118 result
119 }
120
121 fn validate_strict(&self) -> ValidationResult {
122 let mut result = self.validate();
123
124 let output_path = Path::new(&self.output_dir);
128 if !output_path.exists() {
129 result.add_warning(format!("Output directory does not exist: {}", self.output_dir));
130 result.add_suggestion("Directory will be created automatically".to_string());
131 }
132
133 #[cfg(not(feature = "ollama"))]
135 {
136 result.add_warning("Ollama feature is not enabled, local LLM support unavailable".to_string());
137 }
138
139 #[cfg(not(feature = "parallel-processing"))]
140 {
141 result.add_warning("Parallel processing is not enabled, performance may be reduced".to_string());
142 }
143
144 let optimal_chunk_size = 800;
146 let optimal_overlap = 200;
147
148 if (self.chunk_size as i32 - optimal_chunk_size).abs() > 300 {
149 result.add_suggestion(format!(
150 "Consider using chunk size around {} for optimal performance",
151 optimal_chunk_size
152 ));
153 }
154
155 if (self.chunk_overlap as i32 - optimal_overlap).abs() > 100 {
156 result.add_suggestion(format!(
157 "Consider using chunk overlap around {} for optimal context preservation",
158 optimal_overlap
159 ));
160 }
161
162 result
163 }
164}
165
166fn validate_pipeline_approach(config: &SetConfig, result: &mut ValidationResult) {
168 let approach = &config.mode.approach;
169
170 match approach.as_str() {
172 "semantic" | "algorithmic" | "hybrid" => {},
173 invalid => {
174 result.add_error(format!("Invalid pipeline approach: '{}'. Must be 'semantic', 'algorithmic', or 'hybrid'", invalid));
175 return;
176 }
177 }
178
179 if approach == "semantic" {
181 match &config.semantic {
182 None => {
183 result.add_error("Semantic pipeline approach selected but [semantic] configuration is missing".to_string());
184 }
185 Some(semantic) => {
186 if !semantic.enabled {
187 result.add_error("Semantic pipeline approach selected but semantic.enabled = false".to_string());
188 }
189
190 let valid_backends = ["huggingface", "openai", "voyage", "cohere", "jina", "mistral", "together", "ollama"];
192 if !valid_backends.contains(&semantic.embeddings.backend.as_str()) {
193 result.add_error(format!(
194 "Invalid semantic embedding backend: '{}'. Must be one of: {}",
195 semantic.embeddings.backend,
196 valid_backends.join(", ")
197 ));
198 }
199
200 if semantic.embeddings.dimension == 0 {
201 result.add_error("Semantic embedding dimension must be greater than 0".to_string());
202 }
203
204 if semantic.entity_extraction.confidence_threshold < 0.0 || semantic.entity_extraction.confidence_threshold > 1.0 {
206 result.add_error("Semantic entity extraction confidence threshold must be between 0.0 and 1.0".to_string());
207 }
208
209 if semantic.entity_extraction.temperature < 0.0 || semantic.entity_extraction.temperature > 2.0 {
210 result.add_error("Semantic entity extraction temperature must be between 0.0 and 2.0".to_string());
211 }
212
213 if semantic.retrieval.similarity_threshold < 0.0 || semantic.retrieval.similarity_threshold > 1.0 {
215 result.add_error("Semantic retrieval similarity threshold must be between 0.0 and 1.0".to_string());
216 }
217
218 if semantic.retrieval.top_k == 0 {
219 result.add_error("Semantic retrieval top_k must be greater than 0".to_string());
220 }
221 }
222 }
223 }
224
225 if approach == "algorithmic" {
227 match &config.algorithmic {
228 None => {
229 result.add_error("Algorithmic pipeline approach selected but [algorithmic] configuration is missing".to_string());
230 }
231 Some(algorithmic) => {
232 if !algorithmic.enabled {
233 result.add_error("Algorithmic pipeline approach selected but algorithmic.enabled = false".to_string());
234 }
235
236 if algorithmic.embeddings.backend != "hash" {
238 result.add_warning(format!(
239 "Algorithmic pipeline typically uses 'hash' backend, but '{}' is configured",
240 algorithmic.embeddings.backend
241 ));
242 }
243
244 if algorithmic.embeddings.dimension == 0 {
245 result.add_error("Algorithmic embedding dimension must be greater than 0".to_string());
246 }
247
248 if algorithmic.embeddings.max_document_frequency < 0.0 || algorithmic.embeddings.max_document_frequency > 1.0 {
249 result.add_error("Algorithmic max_document_frequency must be between 0.0 and 1.0".to_string());
250 }
251
252 if algorithmic.entity_extraction.confidence_threshold < 0.0 || algorithmic.entity_extraction.confidence_threshold > 1.0 {
254 result.add_error("Algorithmic entity extraction confidence threshold must be between 0.0 and 1.0".to_string());
255 }
256
257 if algorithmic.retrieval.k1 < 0.0 {
259 result.add_error("BM25 k1 parameter must be non-negative".to_string());
260 }
261
262 if algorithmic.retrieval.b < 0.0 || algorithmic.retrieval.b > 1.0 {
263 result.add_error("BM25 b parameter must be between 0.0 and 1.0".to_string());
264 }
265
266 if algorithmic.retrieval.top_k == 0 {
267 result.add_error("Algorithmic retrieval top_k must be greater than 0".to_string());
268 }
269 }
270 }
271 }
272
273 if approach == "hybrid" {
275 match &config.hybrid {
276 None => {
277 result.add_error("Hybrid pipeline approach selected but [hybrid] configuration is missing".to_string());
278 }
279 Some(hybrid) => {
280 if !hybrid.enabled {
281 result.add_error("Hybrid pipeline approach selected but hybrid.enabled = false".to_string());
282 }
283
284 let weight_sum = hybrid.weights.semantic_weight + hybrid.weights.algorithmic_weight;
286 if (weight_sum - 1.0).abs() > 0.01 {
287 result.add_warning(format!(
288 "Hybrid weights should sum to 1.0 (currently: {:.2})",
289 weight_sum
290 ));
291 }
292
293 if hybrid.weights.semantic_weight < 0.0 || hybrid.weights.semantic_weight > 1.0 {
294 result.add_error("Hybrid semantic_weight must be between 0.0 and 1.0".to_string());
295 }
296
297 if hybrid.weights.algorithmic_weight < 0.0 || hybrid.weights.algorithmic_weight > 1.0 {
298 result.add_error("Hybrid algorithmic_weight must be between 0.0 and 1.0".to_string());
299 }
300
301 let entity_weight_sum = hybrid.entity_extraction.llm_weight + hybrid.entity_extraction.pattern_weight;
303 if (entity_weight_sum - 1.0).abs() > 0.01 {
304 result.add_warning(format!(
305 "Hybrid entity extraction weights should sum to 1.0 (currently: {:.2})",
306 entity_weight_sum
307 ));
308 }
309
310 let retrieval_weight_sum = hybrid.retrieval.vector_weight + hybrid.retrieval.bm25_weight;
312 if (retrieval_weight_sum - 1.0).abs() > 0.01 {
313 result.add_warning(format!(
314 "Hybrid retrieval weights should sum to 1.0 (currently: {:.2})",
315 retrieval_weight_sum
316 ));
317 }
318
319 if hybrid.retrieval.rrf_constant == 0 {
320 result.add_error("Hybrid RRF constant must be greater than 0 (typically 60)".to_string());
321 }
322
323 if hybrid.entity_extraction.confidence_boost < 0.0 || hybrid.entity_extraction.confidence_boost > 1.0 {
325 result.add_warning("Hybrid confidence_boost should typically be between 0.0 and 1.0".to_string());
326 }
327 }
328 }
329 }
330
331 match approach.as_str() {
333 "semantic" => {
334 result.add_suggestion("Semantic pipeline uses neural embeddings and LLM-based extraction for high-quality results".to_string());
335 if config.ollama.enabled {
336 result.add_suggestion("Consider using 'llama3.1:8b' for entity extraction with gleaning enabled".to_string());
337 }
338 }
339 "algorithmic" => {
340 result.add_suggestion("Algorithmic pipeline uses pattern matching and TF-IDF for fast, resource-efficient processing".to_string());
341 result.add_suggestion("Algorithmic pipeline works well for structured documents and doesn't require an LLM".to_string());
342 }
343 "hybrid" => {
344 result.add_suggestion("Hybrid pipeline combines semantic and algorithmic approaches for balanced quality and performance".to_string());
345 result.add_suggestion("Fine-tune hybrid weights based on your specific use case and evaluation metrics".to_string());
346 }
347 _ => {}
348 }
349}
350
351impl Validatable for SetConfig {
352 fn validate(&self) -> ValidationResult {
353 let mut result = ValidationResult::new();
354
355 validate_pipeline_approach(self, &mut result);
357
358 if let Some(input_path) = &self.general.input_document_path {
360 if input_path.is_empty() {
361 result.add_error("Input document path cannot be empty".to_string());
362 } else {
363 let path = Path::new(input_path);
364 if !path.exists() {
365 result.add_error(format!("Input document not found: {}", input_path));
366 } else if !path.is_file() {
367 result.add_error(format!("Input path is not a file: {}", input_path));
368 } else {
369 }
371 }
372 } else {
373 result.add_error("Input document path is required".to_string());
374 }
375
376 if self.general.output_dir.is_empty() {
377 result.add_error("Output directory cannot be empty".to_string());
378 }
379
380 let pipeline = &self.pipeline;
382 if pipeline.text_extraction.chunk_size == 0 {
383 result.add_error("Chunk size must be greater than 0".to_string());
384 }
385
386 if pipeline.text_extraction.chunk_overlap >= pipeline.text_extraction.chunk_size {
387 result.add_error("Chunk overlap must be less than chunk size".to_string());
388 }
389
390 let ollama = &self.ollama;
392 if ollama.enabled {
393 if ollama.host.is_empty() {
394 result.add_error("Ollama host cannot be empty when enabled".to_string());
395 }
396
397 if ollama.port == 0 {
398 result.add_error("Ollama port must be valid".to_string());
399 }
400
401 if ollama.chat_model.is_empty() {
402 result.add_error("Ollama chat model must be specified".to_string());
403 }
404
405 if ollama.embedding_model.is_empty() {
406 result.add_error("Ollama embedding model must be specified".to_string());
407 }
408
409 if ollama.chat_model == "llama2" {
411 result.add_suggestion("Consider using 'llama3.1:8b' for better performance".to_string());
412 }
413 }
414
415 let storage = &self.storage;
417 match storage.database_type.as_str() {
418 "memory" | "file" | "sqlite" | "postgresql" | "neo4j" => {},
419 db_type => {
420 result.add_error(format!("Unknown database type: {}", db_type));
421 result.add_suggestion("Supported types: memory, file, sqlite, postgresql, neo4j".to_string());
422 }
423 }
424
425 result
426 }
427
428 fn validate_strict(&self) -> ValidationResult {
429 let mut result = self.validate();
430
431 if !self.ollama.enabled {
433 result.add_warning("Ollama is not enabled, will use mock LLM".to_string());
434 }
435
436 result
437 }
438}
439
440pub fn validate_config_file(path: &Path, strict: bool) -> Result<ValidationResult> {
442 let config_str = std::fs::read_to_string(path)?;
443 let config: SetConfig = toml::from_str(&config_str)
444 .map_err(|e| GraphRAGError::Config {
445 message: format!("Failed to parse TOML config: {}", e)
446 })?;
447
448 let result = if strict {
449 config.validate_strict()
450 } else {
451 config.validate()
452 };
453
454 Ok(result)
455}
456
457#[cfg(test)]
458mod tests {
459 use super::*;
460
    /// A zero chunk size must be reported as a hard validation error.
    #[test]
    fn test_config_validation() {
        let config = Config {
            chunk_size: 0,
            ..Default::default()
        };

        let result = config.validate();
        // NOTE(review): with `#[derive(Default)]` on ValidationResult,
        // `is_valid` starts out false, so this assertion alone would pass
        // even if no error were recorded; the `errors` check below is the
        // meaningful one.
        assert!(!result.is_valid);
        assert!(!result.errors.is_empty());
    }
472
    /// Overlap >= chunk size is impossible to honor and must be an error
    /// whose message mentions "overlap".
    #[test]
    fn test_chunk_overlap_validation() {
        let config = Config {
            chunk_size: 100,
            chunk_overlap: 150,
            ..Default::default()
        };

        let result = config.validate();
        assert!(!result.is_valid);
        assert!(result.errors.iter().any(|e| e.contains("overlap")));
    }
485
    /// Large chunks (>1000) with tiny overlap (<100) should trigger the
    /// context-preservation suggestion from `Config::validate`.
    #[test]
    fn test_suggestions() {
        let config = Config {
            chunk_size: 2000,
            chunk_overlap: 50,
            ..Default::default()
        };

        let result = config.validate();
        assert!(!result.suggestions.is_empty());
    }
497}