1use crate::{Document, Entity, Relationship, TextChunk};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct PhaseValidation {
13 pub phase_name: String,
15 pub passed: bool,
17 pub checks: Vec<ValidationCheck>,
19 pub warnings: Vec<String>,
21 pub metrics: HashMap<String, f64>,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct ValidationCheck {
28 pub name: String,
30 pub passed: bool,
32 pub expected: String,
34 pub actual: String,
36 pub message: String,
38}
39
40pub struct DocumentProcessingValidator;
42
43impl DocumentProcessingValidator {
44 pub fn validate(document: &Document, chunks: &[TextChunk]) -> PhaseValidation {
46 let mut checks = Vec::new();
47 let mut warnings = Vec::new();
48 let mut metrics = HashMap::new();
49
50 checks.push(ValidationCheck {
52 name: "document_not_empty".to_string(),
53 passed: !document.content.is_empty(),
54 expected: "Non-empty content".to_string(),
55 actual: format!("{} characters", document.content.len()),
56 message: if document.content.is_empty() {
57 "Document content is empty".to_string()
58 } else {
59 "Document contains content".to_string()
60 },
61 });
62
63 checks.push(ValidationCheck {
65 name: "chunks_created".to_string(),
66 passed: !chunks.is_empty(),
67 expected: "At least 1 chunk".to_string(),
68 actual: format!("{} chunks", chunks.len()),
69 message: if chunks.is_empty() {
70 "No chunks were created from document".to_string()
71 } else {
72 format!("Successfully created {} chunks", chunks.len())
73 },
74 });
75
76 if !chunks.is_empty() {
78 let total_chunk_chars: usize = chunks.iter().map(|c| c.content.len()).sum();
79 let coverage_ratio = total_chunk_chars as f64 / document.content.len() as f64;
80
81 checks.push(ValidationCheck {
82 name: "content_coverage".to_string(),
83 passed: coverage_ratio >= 0.9, expected: "Coverage ratio >= 0.9".to_string(),
85 actual: format!("{:.2}", coverage_ratio),
86 message: format!(
87 "Chunks cover {:.1}% of original content",
88 coverage_ratio * 100.0
89 ),
90 });
91
92 metrics.insert("coverage_ratio".to_string(), coverage_ratio);
93 }
94
95 let empty_chunks = chunks
97 .iter()
98 .filter(|c| c.content.trim().is_empty())
99 .count();
100 checks.push(ValidationCheck {
101 name: "no_empty_chunks".to_string(),
102 passed: empty_chunks == 0,
103 expected: "0 empty chunks".to_string(),
104 actual: format!("{} empty chunks", empty_chunks),
105 message: if empty_chunks > 0 {
106 format!("Found {} empty chunks", empty_chunks)
107 } else {
108 "All chunks have content".to_string()
109 },
110 });
111
112 let chunks_with_metadata = chunks
114 .iter()
115 .filter(|c| {
116 c.metadata.chapter.is_some()
117 || !c.metadata.keywords.is_empty()
118 || c.metadata.summary.is_some()
119 })
120 .count();
121
122 let metadata_ratio = if chunks.is_empty() {
123 0.0
124 } else {
125 chunks_with_metadata as f64 / chunks.len() as f64
126 };
127
128 if metadata_ratio < 0.5 {
129 warnings.push(format!(
130 "Only {}/{} chunks have enriched metadata ({}%)",
131 chunks_with_metadata,
132 chunks.len(),
133 (metadata_ratio * 100.0) as u32
134 ));
135 }
136
137 checks.push(ValidationCheck {
138 name: "metadata_enrichment".to_string(),
139 passed: true, expected: "Metadata enrichment (optional)".to_string(),
141 actual: format!("{}/{} chunks", chunks_with_metadata, chunks.len()),
142 message: format!("{:.1}% of chunks have metadata", metadata_ratio * 100.0),
143 });
144
145 metrics.insert("metadata_ratio".to_string(), metadata_ratio);
146 metrics.insert("chunks_count".to_string(), chunks.len() as f64);
147 metrics.insert(
148 "avg_chunk_size".to_string(),
149 chunks.iter().map(|c| c.content.len()).sum::<usize>() as f64
150 / chunks.len().max(1) as f64,
151 );
152
153 let passed = checks.iter().all(|c| c.passed);
154
155 PhaseValidation {
156 phase_name: "Document Processing".to_string(),
157 passed,
158 checks,
159 warnings,
160 metrics,
161 }
162 }
163}
164
165pub struct EntityExtractionValidator;
167
168impl EntityExtractionValidator {
169 pub fn validate(chunks: &[TextChunk], entities: &[Entity]) -> PhaseValidation {
171 let mut checks = Vec::new();
172 let mut warnings = Vec::new();
173 let mut metrics = HashMap::new();
174
175 checks.push(ValidationCheck {
177 name: "entities_extracted".to_string(),
178 passed: !entities.is_empty(),
179 expected: "At least 1 entity".to_string(),
180 actual: format!("{} entities", entities.len()),
181 message: if entities.is_empty() {
182 "No entities were extracted".to_string()
183 } else {
184 format!("Successfully extracted {} entities", entities.len())
185 },
186 });
187
188 let invalid_confidence = entities
190 .iter()
191 .filter(|e| e.confidence < 0.0 || e.confidence > 1.0)
192 .count();
193
194 checks.push(ValidationCheck {
195 name: "confidence_scores_valid".to_string(),
196 passed: invalid_confidence == 0,
197 expected: "All confidences in [0.0, 1.0]".to_string(),
198 actual: format!("{} invalid scores", invalid_confidence),
199 message: if invalid_confidence > 0 {
200 format!(
201 "{} entities have invalid confidence scores",
202 invalid_confidence
203 )
204 } else {
205 "All confidence scores are valid".to_string()
206 },
207 });
208
209 let missing_types = entities.iter().filter(|e| e.entity_type.is_empty()).count();
211 checks.push(ValidationCheck {
212 name: "entity_types_populated".to_string(),
213 passed: missing_types == 0,
214 expected: "All entities have types".to_string(),
215 actual: format!("{} without types", missing_types),
216 message: if missing_types > 0 {
217 format!("{} entities missing entity_type", missing_types)
218 } else {
219 "All entities have types assigned".to_string()
220 },
221 });
222
223 let empty_names = entities.iter().filter(|e| e.name.trim().is_empty()).count();
225 checks.push(ValidationCheck {
226 name: "entity_names_valid".to_string(),
227 passed: empty_names == 0,
228 expected: "All entities have names".to_string(),
229 actual: format!("{} empty names", empty_names),
230 message: if empty_names > 0 {
231 format!("{} entities have empty names", empty_names)
232 } else {
233 "All entities have valid names".to_string()
234 },
235 });
236
237 if !entities.is_empty() {
239 let chunk_ids: Vec<_> = chunks.iter().map(|c| &c.id).collect();
240 let invalid_mentions = entities
241 .iter()
242 .flat_map(|e| &e.mentions)
243 .filter(|m| !chunk_ids.contains(&&m.chunk_id))
244 .count();
245
246 checks.push(ValidationCheck {
247 name: "entity_mentions_valid".to_string(),
248 passed: invalid_mentions == 0,
249 expected: "All mentions reference valid chunks".to_string(),
250 actual: format!("{} invalid references", invalid_mentions),
251 message: if invalid_mentions > 0 {
252 format!(
253 "{} entity mentions reference non-existent chunks",
254 invalid_mentions
255 )
256 } else {
257 "All entity mentions are valid".to_string()
258 },
259 });
260
261 if invalid_mentions > 0 {
262 warnings.push("Some entity mentions reference non-existent chunks".to_string());
263 }
264 }
265
266 metrics.insert("entities_count".to_string(), entities.len() as f64);
268 if !entities.is_empty() {
269 metrics.insert(
270 "avg_confidence".to_string(),
271 entities.iter().map(|e| e.confidence as f64).sum::<f64>() / entities.len() as f64,
272 );
273 metrics.insert(
274 "avg_mentions_per_entity".to_string(),
275 entities.iter().map(|e| e.mentions.len()).sum::<usize>() as f64
276 / entities.len() as f64,
277 );
278 }
279
280 if let Some(&avg_conf) = metrics.get("avg_confidence") {
282 if avg_conf < 0.5 {
283 warnings.push(format!("Low average entity confidence: {:.2}", avg_conf));
284 }
285 }
286
287 let passed = checks.iter().all(|c| c.passed);
288
289 PhaseValidation {
290 phase_name: "Entity Extraction".to_string(),
291 passed,
292 checks,
293 warnings,
294 metrics,
295 }
296 }
297}
298
299pub struct RelationshipExtractionValidator;
301
302impl RelationshipExtractionValidator {
303 pub fn validate(entities: &[Entity], relationships: &[Relationship]) -> PhaseValidation {
305 let mut checks = Vec::new();
306 let mut warnings = Vec::new();
307 let mut metrics = HashMap::new();
308
309 if !entities.is_empty() {
311 let has_relationships = !relationships.is_empty();
312 checks.push(ValidationCheck {
313 name: "relationships_extracted".to_string(),
314 passed: has_relationships,
315 expected: "At least 1 relationship".to_string(),
316 actual: format!("{} relationships", relationships.len()),
317 message: if !has_relationships {
318 "No relationships extracted despite entities present".to_string()
319 } else {
320 format!("Extracted {} relationships", relationships.len())
321 },
322 });
323
324 if !has_relationships {
325 warnings.push("No relationships found between entities".to_string());
326 }
327 }
328
329 let invalid_confidence = relationships
331 .iter()
332 .filter(|r| r.confidence < 0.0 || r.confidence > 1.0)
333 .count();
334
335 checks.push(ValidationCheck {
336 name: "relationship_confidence_valid".to_string(),
337 passed: invalid_confidence == 0,
338 expected: "All confidences in [0.0, 1.0]".to_string(),
339 actual: format!("{} invalid", invalid_confidence),
340 message: if invalid_confidence > 0 {
341 format!(
342 "{} relationships have invalid confidence",
343 invalid_confidence
344 )
345 } else {
346 "All relationship confidences valid".to_string()
347 },
348 });
349
350 let missing_types = relationships
352 .iter()
353 .filter(|r| r.relation_type.is_empty())
354 .count();
355 checks.push(ValidationCheck {
356 name: "relationship_types_populated".to_string(),
357 passed: missing_types == 0,
358 expected: "All relationships typed".to_string(),
359 actual: format!("{} untyped", missing_types),
360 message: if missing_types > 0 {
361 format!("{} relationships missing type", missing_types)
362 } else {
363 "All relationships have types".to_string()
364 },
365 });
366
367 let entity_ids: Vec<_> = entities.iter().map(|e| &e.id).collect();
369 let orphan_relationships = relationships
370 .iter()
371 .filter(|r| !entity_ids.contains(&&r.source) || !entity_ids.contains(&&r.target))
372 .count();
373
374 checks.push(ValidationCheck {
375 name: "relationship_entities_exist".to_string(),
376 passed: orphan_relationships == 0,
377 expected: "All relationships reference valid entities".to_string(),
378 actual: format!("{} orphaned", orphan_relationships),
379 message: if orphan_relationships > 0 {
380 format!(
381 "{} relationships reference non-existent entities",
382 orphan_relationships
383 )
384 } else {
385 "All relationships have valid entity references".to_string()
386 },
387 });
388
389 if orphan_relationships > 0 {
390 warnings.push(
391 "Some relationships reference entities that don't exist in the graph".to_string(),
392 );
393 }
394
395 metrics.insert(
397 "relationships_count".to_string(),
398 relationships.len() as f64,
399 );
400 if !entities.is_empty() {
401 metrics.insert(
402 "relationships_per_entity".to_string(),
403 relationships.len() as f64 / entities.len() as f64,
404 );
405 }
406 if !relationships.is_empty() {
407 metrics.insert(
408 "avg_relationship_confidence".to_string(),
409 relationships
410 .iter()
411 .map(|r| r.confidence as f64)
412 .sum::<f64>()
413 / relationships.len() as f64,
414 );
415 }
416
417 let passed = checks.iter().all(|c| c.passed);
418
419 PhaseValidation {
420 phase_name: "Relationship Extraction".to_string(),
421 passed,
422 checks,
423 warnings,
424 metrics,
425 }
426 }
427}
428
429pub struct GraphConstructionValidator;
431
432impl GraphConstructionValidator {
433 pub fn validate(
435 documents: usize,
436 chunks: usize,
437 entities: usize,
438 relationships: usize,
439 ) -> PhaseValidation {
440 let mut checks = Vec::new();
441 let mut warnings = Vec::new();
442 let mut metrics = HashMap::new();
443
444 checks.push(ValidationCheck {
446 name: "graph_not_empty".to_string(),
447 passed: entities > 0 || documents > 0,
448 expected: "At least some nodes".to_string(),
449 actual: format!("{} entities, {} docs", entities, documents),
450 message: if entities == 0 && documents == 0 {
451 "Graph is completely empty".to_string()
452 } else {
453 "Graph contains content".to_string()
454 },
455 });
456
457 if chunks > 0 {
459 let entities_per_chunk = entities as f64 / chunks as f64;
460 let reasonable = (0.1..=10.0).contains(&entities_per_chunk);
461
462 checks.push(ValidationCheck {
463 name: "entity_chunk_ratio_reasonable".to_string(),
464 passed: reasonable,
465 expected: "0.1 to 10 entities per chunk".to_string(),
466 actual: format!("{:.2} entities/chunk", entities_per_chunk),
467 message: if !reasonable {
468 format!("Unusual entity-to-chunk ratio: {:.2}", entities_per_chunk)
469 } else {
470 "Entity density looks reasonable".to_string()
471 },
472 });
473
474 metrics.insert("entities_per_chunk".to_string(), entities_per_chunk);
475
476 if entities_per_chunk < 0.5 {
477 warnings.push("Low entity density - may need better entity extraction".to_string());
478 }
479 if entities_per_chunk > 5.0 {
480 warnings.push("High entity density - may have duplicate extractions".to_string());
481 }
482 }
483
484 if entities > 1 {
486 let connectivity = relationships as f64 / entities as f64;
487 let is_connected = connectivity > 0.1; checks.push(ValidationCheck {
490 name: "graph_connectivity".to_string(),
491 passed: is_connected,
492 expected: ">0.1 relationships per entity".to_string(),
493 actual: format!("{:.2} rels/entity", connectivity),
494 message: if !is_connected {
495 "Graph is sparsely connected".to_string()
496 } else {
497 "Graph has reasonable connectivity".to_string()
498 },
499 });
500
501 metrics.insert("connectivity".to_string(), connectivity);
502
503 if connectivity < 0.5 {
504 warnings.push("Graph is sparsely connected - entities may be isolated".to_string());
505 }
506 }
507
508 metrics.insert("documents".to_string(), documents as f64);
510 metrics.insert("chunks".to_string(), chunks as f64);
511 metrics.insert("entities".to_string(), entities as f64);
512 metrics.insert("relationships".to_string(), relationships as f64);
513
514 let passed = checks.iter().all(|c| c.passed);
515
516 PhaseValidation {
517 phase_name: "Graph Construction".to_string(),
518 passed,
519 checks,
520 warnings,
521 metrics,
522 }
523 }
524}
525
526#[derive(Debug, Clone, Serialize, Deserialize)]
528pub struct PipelineValidationReport {
529 pub phases: Vec<PhaseValidation>,
531 pub overall_passed: bool,
533 pub total_checks: usize,
535 pub passed_checks: usize,
537 pub summary: String,
539}
540
541impl PipelineValidationReport {
542 pub fn from_phases(phases: Vec<PhaseValidation>) -> Self {
544 let overall_passed = phases.iter().all(|p| p.passed);
545 let total_checks = phases.iter().map(|p| p.checks.len()).sum();
546 let passed_checks = phases
547 .iter()
548 .flat_map(|p| &p.checks)
549 .filter(|c| c.passed)
550 .count();
551
552 let summary = if overall_passed {
553 format!(
554 "✅ All pipeline phases validated successfully ({}/{} checks passed)",
555 passed_checks, total_checks
556 )
557 } else {
558 let failed_phases: Vec<_> = phases
559 .iter()
560 .filter(|p| !p.passed)
561 .map(|p| p.phase_name.as_str())
562 .collect();
563 format!(
564 "❌ Pipeline validation failed in: {} ({}/{} checks passed)",
565 failed_phases.join(", "),
566 passed_checks,
567 total_checks
568 )
569 };
570
571 Self {
572 phases,
573 overall_passed,
574 total_checks,
575 passed_checks,
576 summary,
577 }
578 }
579
580 pub fn detailed_report(&self) -> String {
582 let mut report = String::new();
583 report.push_str("# Pipeline Validation Report\n\n");
584 report.push_str(&format!("{}\n\n", self.summary));
585 report.push_str(&format!(
586 "**Total Checks**: {}/{} passed\n\n",
587 self.passed_checks, self.total_checks
588 ));
589
590 for phase in &self.phases {
591 report.push_str(&format!("## Phase: {}\n", phase.phase_name));
592 report.push_str(&format!(
593 "**Status**: {}\n\n",
594 if phase.passed {
595 "✅ PASSED"
596 } else {
597 "❌ FAILED"
598 }
599 ));
600
601 report.push_str("### Checks\n");
603 for check in &phase.checks {
604 let icon = if check.passed { "✅" } else { "❌" };
605 report.push_str(&format!("{} **{}**: {}\n", icon, check.name, check.message));
606 report.push_str(&format!(" - Expected: {}\n", check.expected));
607 report.push_str(&format!(" - Actual: {}\n\n", check.actual));
608 }
609
610 if !phase.warnings.is_empty() {
612 report.push_str("### Warnings\n");
613 for warning in &phase.warnings {
614 report.push_str(&format!("⚠️ {}\n", warning));
615 }
616 report.push('\n');
617 }
618
619 if !phase.metrics.is_empty() {
621 report.push_str("### Metrics\n");
622 for (key, value) in &phase.metrics {
623 report.push_str(&format!("- {}: {:.2}\n", key, value));
624 }
625 report.push('\n');
626 }
627
628 report.push_str("---\n\n");
629 }
630
631 report
632 }
633
634 pub fn all_warnings(&self) -> Vec<String> {
636 self.phases
637 .iter()
638 .flat_map(|p| p.warnings.clone())
639 .collect()
640 }
641
642 pub fn failed_phases(&self) -> Vec<&PhaseValidation> {
644 self.phases.iter().filter(|p| !p.passed).collect()
645 }
646}
647
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{ChunkId, DocumentId, EntityId};

    /// Two chunks that together hold 40 of the document's 41 bytes must pass
    /// every document-processing check.
    #[test]
    fn test_document_processing_validation() {
        let document = Document::new(
            DocumentId::new(String::from("test")),
            String::from("Test"),
            String::from("This is test content with multiple words."),
        );

        let head = TextChunk::new(
            ChunkId::new(String::from("c1")),
            document.id.clone(),
            String::from("This is test"),
            0,
            12,
        );
        let tail = TextChunk::new(
            ChunkId::new(String::from("c2")),
            document.id.clone(),
            String::from("content with multiple words."),
            13,
            41,
        );
        let chunks = vec![head, tail];

        let result = DocumentProcessingValidator::validate(&document, &chunks);
        assert!(result.passed);
        assert!(result.checks.iter().all(|check| check.passed));
    }

    /// A single well-formed entity without mentions must pass entity validation.
    #[test]
    fn test_entity_extraction_validation() {
        let only_chunk = TextChunk::new(
            ChunkId::new(String::from("c1")),
            DocumentId::new(String::from("test")),
            String::from("Alice works at Stanford"),
            0,
            23,
        );
        let chunks = vec![only_chunk];

        let alice = Entity {
            id: EntityId::new(String::from("e1")),
            name: String::from("Alice"),
            entity_type: String::from("person"),
            confidence: 0.9,
            mentions: vec![],
            embedding: None,
            first_mentioned: None,
            last_mentioned: None,
            temporal_validity: None,
        };
        let entities = vec![alice];

        let result = EntityExtractionValidator::validate(&chunks, &entities);
        assert!(result.passed);
    }

    /// A report built from one passing phase counts its single check as passed.
    #[test]
    fn test_pipeline_report() {
        let passing_check = ValidationCheck {
            name: String::from("test_check"),
            passed: true,
            expected: String::from("pass"),
            actual: String::from("pass"),
            message: String::from("OK"),
        };
        let passing_phase = PhaseValidation {
            phase_name: String::from("Test Phase"),
            passed: true,
            checks: vec![passing_check],
            warnings: vec![],
            metrics: HashMap::new(),
        };

        let report = PipelineValidationReport::from_phases(vec![passing_phase]);
        assert!(report.overall_passed);
        assert_eq!(report.total_checks, 1);
        assert_eq!(report.passed_checks, 1);
    }
}