1use crate::config::{Config, SetConfig};
2use crate::{GraphRAGError, Result};
3use std::path::Path;
4
/// Outcome of validating a configuration: fatal errors, non-fatal warnings,
/// and optional tuning suggestions.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    // True while no error has been recorded; cleared by `add_error`.
    pub is_valid: bool,
    // Problems that make the configuration unusable.
    pub errors: Vec<String>,
    // Non-fatal issues the user should be aware of.
    pub warnings: Vec<String>,
    // Optional hints for improving the configuration.
    pub suggestions: Vec<String>,
}

impl Default for ValidationResult {
    /// A fresh result starts out valid; `is_valid` only becomes `false` once
    /// an error is added. (A derived `Default` would initialize it to `false`,
    /// making every error-free validation report as invalid.)
    fn default() -> Self {
        Self {
            is_valid: true,
            errors: Vec::new(),
            warnings: Vec::new(),
            suggestions: Vec::new(),
        }
    }
}
17
18impl ValidationResult {
19 pub fn new() -> Self {
21 Self::default()
22 }
23
24 pub fn add_error(&mut self, error: String) {
26 self.errors.push(error);
27 self.is_valid = false;
28 }
29
30 pub fn add_warning(&mut self, warning: String) {
32 self.warnings.push(warning);
33 }
34
35 pub fn add_suggestion(&mut self, suggestion: String) {
37 self.suggestions.push(suggestion);
38 }
39}
40
/// Configuration types that can be checked for consistency.
pub trait Validatable {
    /// Runs basic, environment-independent checks (value ranges, required
    /// fields) and returns the collected errors, warnings, and suggestions.
    fn validate(&self) -> ValidationResult;
    /// Runs `validate` plus stricter, environment-aware checks (e.g.
    /// filesystem state, compiled-in features) and tuning heuristics.
    fn validate_strict(&self) -> ValidationResult;
}
48
49impl Validatable for Config {
50 fn validate(&self) -> ValidationResult {
51 let mut result = ValidationResult::new();
52
53 if self.output_dir.is_empty() {
55 result.add_error("Output directory cannot be empty".to_string());
56 }
57
58 if self.chunk_size == 0 {
60 result.add_error("Chunk size must be greater than 0".to_string());
61 } else if self.chunk_size < 100 {
62 result.add_warning(
63 "Chunk size is very small (<100), this may affect performance".to_string(),
64 );
65 } else if self.chunk_size > 10000 {
66 result.add_warning(
67 "Chunk size is very large (>10000), this may affect quality".to_string(),
68 );
69 } else {
70 }
72
73 if self.chunk_overlap >= self.chunk_size {
75 result.add_error("Chunk overlap must be less than chunk size".to_string());
76 } else if self.chunk_overlap > self.chunk_size / 2 {
77 result.add_warning(
78 "Chunk overlap is more than 50% of chunk size, this may be inefficient".to_string(),
79 );
80 } else {
81 }
83
84 if let Some(max_entities) = self.max_entities_per_chunk {
86 if max_entities == 0 {
87 result.add_error("Max entities per chunk must be greater than 0".to_string());
88 } else if max_entities > 100 {
89 result.add_warning("Max entities per chunk is very high (>100)".to_string());
90 } else {
91 }
93 }
94
95 if let Some(top_k) = self.top_k_results {
97 if top_k == 0 {
98 result.add_error("Top-k results must be greater than 0".to_string());
99 } else if top_k > 100 {
100 result.add_warning(
101 "Top-k results is very high (>100), this may affect performance".to_string(),
102 );
103 } else {
104 }
106 }
107
108 if let Some(threshold) = self.similarity_threshold {
110 if !(0.0..=1.0).contains(&threshold) {
111 result.add_error("Similarity threshold must be between 0.0 and 1.0".to_string());
112 } else if threshold < 0.1 {
113 result.add_warning(
114 "Similarity threshold is very low (<0.1), this may return irrelevant results"
115 .to_string(),
116 );
117 } else if threshold > 0.9 {
118 result.add_warning(
119 "Similarity threshold is very high (>0.9), this may return too few results"
120 .to_string(),
121 );
122 } else {
123 }
125 }
126
127 if self.chunk_size > 1000 && self.chunk_overlap < 100 {
129 result.add_suggestion("Consider increasing chunk overlap for better context preservation with large chunks".to_string());
130 }
131
132 result
133 }
134
135 fn validate_strict(&self) -> ValidationResult {
136 let mut result = self.validate();
137
138 let output_path = Path::new(&self.output_dir);
142 if !output_path.exists() {
143 result.add_warning(format!(
144 "Output directory does not exist: {}",
145 self.output_dir
146 ));
147 result.add_suggestion("Directory will be created automatically".to_string());
148 }
149
150 #[cfg(not(feature = "ollama"))]
152 {
153 result.add_warning(
154 "Ollama feature is not enabled, local LLM support unavailable".to_string(),
155 );
156 }
157
158 #[cfg(not(feature = "parallel-processing"))]
159 {
160 result.add_warning(
161 "Parallel processing is not enabled, performance may be reduced".to_string(),
162 );
163 }
164
165 let optimal_chunk_size = 800;
167 let optimal_overlap = 200;
168
169 if (self.chunk_size as i32 - optimal_chunk_size).abs() > 300 {
170 result.add_suggestion(format!(
171 "Consider using chunk size around {} for optimal performance",
172 optimal_chunk_size
173 ));
174 }
175
176 if (self.chunk_overlap as i32 - optimal_overlap).abs() > 100 {
177 result.add_suggestion(format!(
178 "Consider using chunk overlap around {} for optimal context preservation",
179 optimal_overlap
180 ));
181 }
182
183 result
184 }
185}
186
187fn validate_pipeline_approach(config: &SetConfig, result: &mut ValidationResult) {
189 let approach = &config.mode.approach;
190
191 match approach.as_str() {
193 "semantic" | "algorithmic" | "hybrid" => {},
194 invalid => {
195 result.add_error(format!(
196 "Invalid pipeline approach: '{}'. Must be 'semantic', 'algorithmic', or 'hybrid'",
197 invalid
198 ));
199 return;
200 },
201 }
202
203 if approach == "semantic" {
205 match &config.semantic {
206 None => {
207 result.add_error(
208 "Semantic pipeline approach selected but [semantic] configuration is missing"
209 .to_string(),
210 );
211 },
212 Some(semantic) => {
213 if !semantic.enabled {
214 result.add_error(
215 "Semantic pipeline approach selected but semantic.enabled = false"
216 .to_string(),
217 );
218 }
219
220 let valid_backends = [
222 "huggingface",
223 "openai",
224 "voyage",
225 "cohere",
226 "jina",
227 "mistral",
228 "together",
229 "ollama",
230 ];
231 if !valid_backends.contains(&semantic.embeddings.backend.as_str()) {
232 result.add_error(format!(
233 "Invalid semantic embedding backend: '{}'. Must be one of: {}",
234 semantic.embeddings.backend,
235 valid_backends.join(", ")
236 ));
237 }
238
239 if semantic.embeddings.dimension == 0 {
240 result.add_error(
241 "Semantic embedding dimension must be greater than 0".to_string(),
242 );
243 }
244
245 if semantic.entity_extraction.confidence_threshold < 0.0
247 || semantic.entity_extraction.confidence_threshold > 1.0
248 {
249 result.add_error("Semantic entity extraction confidence threshold must be between 0.0 and 1.0".to_string());
250 }
251
252 if semantic.entity_extraction.temperature < 0.0
253 || semantic.entity_extraction.temperature > 2.0
254 {
255 result.add_error(
256 "Semantic entity extraction temperature must be between 0.0 and 2.0"
257 .to_string(),
258 );
259 }
260
261 if semantic.retrieval.similarity_threshold < 0.0
263 || semantic.retrieval.similarity_threshold > 1.0
264 {
265 result.add_error(
266 "Semantic retrieval similarity threshold must be between 0.0 and 1.0"
267 .to_string(),
268 );
269 }
270
271 if semantic.retrieval.top_k == 0 {
272 result.add_error("Semantic retrieval top_k must be greater than 0".to_string());
273 }
274 },
275 }
276 }
277
278 if approach == "algorithmic" {
280 match &config.algorithmic {
281 None => {
282 result.add_error("Algorithmic pipeline approach selected but [algorithmic] configuration is missing".to_string());
283 },
284 Some(algorithmic) => {
285 if !algorithmic.enabled {
286 result.add_error(
287 "Algorithmic pipeline approach selected but algorithmic.enabled = false"
288 .to_string(),
289 );
290 }
291
292 if algorithmic.embeddings.backend != "hash" {
294 result.add_warning(format!(
295 "Algorithmic pipeline typically uses 'hash' backend, but '{}' is configured",
296 algorithmic.embeddings.backend
297 ));
298 }
299
300 if algorithmic.embeddings.dimension == 0 {
301 result.add_error(
302 "Algorithmic embedding dimension must be greater than 0".to_string(),
303 );
304 }
305
306 if algorithmic.embeddings.max_document_frequency < 0.0
307 || algorithmic.embeddings.max_document_frequency > 1.0
308 {
309 result.add_error(
310 "Algorithmic max_document_frequency must be between 0.0 and 1.0"
311 .to_string(),
312 );
313 }
314
315 if algorithmic.entity_extraction.confidence_threshold < 0.0
317 || algorithmic.entity_extraction.confidence_threshold > 1.0
318 {
319 result.add_error("Algorithmic entity extraction confidence threshold must be between 0.0 and 1.0".to_string());
320 }
321
322 if algorithmic.retrieval.k1 < 0.0 {
324 result.add_error("BM25 k1 parameter must be non-negative".to_string());
325 }
326
327 if algorithmic.retrieval.b < 0.0 || algorithmic.retrieval.b > 1.0 {
328 result.add_error("BM25 b parameter must be between 0.0 and 1.0".to_string());
329 }
330
331 if algorithmic.retrieval.top_k == 0 {
332 result.add_error(
333 "Algorithmic retrieval top_k must be greater than 0".to_string(),
334 );
335 }
336 },
337 }
338 }
339
340 if approach == "hybrid" {
342 match &config.hybrid {
343 None => {
344 result.add_error(
345 "Hybrid pipeline approach selected but [hybrid] configuration is missing"
346 .to_string(),
347 );
348 },
349 Some(hybrid) => {
350 if !hybrid.enabled {
351 result.add_error(
352 "Hybrid pipeline approach selected but hybrid.enabled = false".to_string(),
353 );
354 }
355
356 let weight_sum = hybrid.weights.semantic_weight + hybrid.weights.algorithmic_weight;
358 if (weight_sum - 1.0).abs() > 0.01 {
359 result.add_warning(format!(
360 "Hybrid weights should sum to 1.0 (currently: {:.2})",
361 weight_sum
362 ));
363 }
364
365 if hybrid.weights.semantic_weight < 0.0 || hybrid.weights.semantic_weight > 1.0 {
366 result.add_error(
367 "Hybrid semantic_weight must be between 0.0 and 1.0".to_string(),
368 );
369 }
370
371 if hybrid.weights.algorithmic_weight < 0.0
372 || hybrid.weights.algorithmic_weight > 1.0
373 {
374 result.add_error(
375 "Hybrid algorithmic_weight must be between 0.0 and 1.0".to_string(),
376 );
377 }
378
379 let entity_weight_sum =
381 hybrid.entity_extraction.llm_weight + hybrid.entity_extraction.pattern_weight;
382 if (entity_weight_sum - 1.0).abs() > 0.01 {
383 result.add_warning(format!(
384 "Hybrid entity extraction weights should sum to 1.0 (currently: {:.2})",
385 entity_weight_sum
386 ));
387 }
388
389 let retrieval_weight_sum =
391 hybrid.retrieval.vector_weight + hybrid.retrieval.bm25_weight;
392 if (retrieval_weight_sum - 1.0).abs() > 0.01 {
393 result.add_warning(format!(
394 "Hybrid retrieval weights should sum to 1.0 (currently: {:.2})",
395 retrieval_weight_sum
396 ));
397 }
398
399 if hybrid.retrieval.rrf_constant == 0 {
400 result.add_error(
401 "Hybrid RRF constant must be greater than 0 (typically 60)".to_string(),
402 );
403 }
404
405 if hybrid.entity_extraction.confidence_boost < 0.0
407 || hybrid.entity_extraction.confidence_boost > 1.0
408 {
409 result.add_warning(
410 "Hybrid confidence_boost should typically be between 0.0 and 1.0"
411 .to_string(),
412 );
413 }
414 },
415 }
416 }
417
418 match approach.as_str() {
420 "semantic" => {
421 result.add_suggestion("Semantic pipeline uses neural embeddings and LLM-based extraction for high-quality results".to_string());
422 if config.ollama.enabled {
423 result.add_suggestion(
424 "Consider using 'llama3.1:8b' for entity extraction with gleaning enabled"
425 .to_string(),
426 );
427 }
428 },
429 "algorithmic" => {
430 result.add_suggestion("Algorithmic pipeline uses pattern matching and TF-IDF for fast, resource-efficient processing".to_string());
431 result.add_suggestion("Algorithmic pipeline works well for structured documents and doesn't require an LLM".to_string());
432 },
433 "hybrid" => {
434 result.add_suggestion("Hybrid pipeline combines semantic and algorithmic approaches for balanced quality and performance".to_string());
435 result.add_suggestion(
436 "Fine-tune hybrid weights based on your specific use case and evaluation metrics"
437 .to_string(),
438 );
439 },
440 _ => {},
441 }
442}
443
444impl Validatable for SetConfig {
445 fn validate(&self) -> ValidationResult {
446 let mut result = ValidationResult::new();
447
448 validate_pipeline_approach(self, &mut result);
450
451 if let Some(input_path) = &self.general.input_document_path {
453 if input_path.is_empty() {
454 result.add_error("Input document path cannot be empty".to_string());
455 } else {
456 let path = Path::new(input_path);
457 if !path.exists() {
458 result.add_error(format!("Input document not found: {}", input_path));
459 } else if !path.is_file() {
460 result.add_error(format!("Input path is not a file: {}", input_path));
461 } else {
462 }
464 }
465 } else {
466 result.add_error("Input document path is required".to_string());
467 }
468
469 if self.general.output_dir.is_empty() {
470 result.add_error("Output directory cannot be empty".to_string());
471 }
472
473 let pipeline = &self.pipeline;
475 if pipeline.text_extraction.chunk_size == 0 {
476 result.add_error("Chunk size must be greater than 0".to_string());
477 }
478
479 if pipeline.text_extraction.chunk_overlap >= pipeline.text_extraction.chunk_size {
480 result.add_error("Chunk overlap must be less than chunk size".to_string());
481 }
482
483 let ollama = &self.ollama;
485 if ollama.enabled {
486 if ollama.host.is_empty() {
487 result.add_error("Ollama host cannot be empty when enabled".to_string());
488 }
489
490 if ollama.port == 0 {
491 result.add_error("Ollama port must be valid".to_string());
492 }
493
494 if ollama.chat_model.is_empty() {
495 result.add_error("Ollama chat model must be specified".to_string());
496 }
497
498 if ollama.embedding_model.is_empty() {
499 result.add_error("Ollama embedding model must be specified".to_string());
500 }
501
502 if ollama.chat_model == "llama2" {
504 result.add_suggestion(
505 "Consider using 'llama3.1:8b' for better performance".to_string(),
506 );
507 }
508 }
509
510 let storage = &self.storage;
512 match storage.database_type.as_str() {
513 "memory" | "file" | "sqlite" | "postgresql" | "neo4j" => {},
514 db_type => {
515 result.add_error(format!("Unknown database type: {}", db_type));
516 result.add_suggestion(
517 "Supported types: memory, file, sqlite, postgresql, neo4j".to_string(),
518 );
519 },
520 }
521
522 result
523 }
524
525 fn validate_strict(&self) -> ValidationResult {
526 let mut result = self.validate();
527
528 if !self.ollama.enabled {
530 result.add_warning("Ollama is not enabled, will use mock LLM".to_string());
531 }
532
533 result
534 }
535}
536
537pub fn validate_config_file(path: &Path, strict: bool) -> Result<ValidationResult> {
539 let config_str = std::fs::read_to_string(path)?;
540 let config: SetConfig = toml::from_str(&config_str).map_err(|e| GraphRAGError::Config {
541 message: format!("Failed to parse TOML config: {}", e),
542 })?;
543
544 let result = if strict {
545 config.validate_strict()
546 } else {
547 config.validate()
548 };
549
550 Ok(result)
551}
552
#[cfg(test)]
mod tests {
    use super::*;

    // A zero chunk size must be rejected outright.
    #[test]
    fn test_config_validation() {
        let cfg = Config {
            chunk_size: 0,
            ..Default::default()
        };

        let outcome = cfg.validate();
        assert!(!outcome.is_valid);
        assert!(!outcome.errors.is_empty());
    }

    // Overlap equal to or exceeding the chunk size is an error.
    #[test]
    fn test_chunk_overlap_validation() {
        let cfg = Config {
            chunk_size: 100,
            chunk_overlap: 150,
            ..Default::default()
        };

        let outcome = cfg.validate();
        assert!(!outcome.is_valid);
        assert!(outcome.errors.iter().any(|e| e.contains("overlap")));
    }

    // Large chunks with tiny overlap should yield at least one suggestion.
    #[test]
    fn test_suggestions() {
        let cfg = Config {
            chunk_size: 2000,
            chunk_overlap: 50,
            ..Default::default()
        };

        let outcome = cfg.validate();
        assert!(!outcome.suggestions.is_empty());
    }
}