sql_splitter/validate/
mod.rs

1//! Validate module for SQL dump integrity checking.
2//!
3//! This module provides:
4//! - SQL syntax validation (via parser error detection)
5//! - DDL/DML consistency checks (INSERTs reference existing tables)
6//! - Duplicate primary key detection (all dialects)
7//! - FK referential integrity checking (all dialects)
8//! - Encoding validation (UTF-8)
9
10use crate::parser::{
11    determine_buffer_size, mysql_insert, postgres_copy, Parser, SqlDialect, StatementType,
12};
13use crate::progress::ProgressReader;
14use crate::schema::{Schema, SchemaBuilder, TableId};
15use crate::splitter::Compression;
16use ahash::{AHashMap, AHashSet};
17use serde::Serialize;
18use std::fmt;
19use std::fs::File;
20use std::hash::{Hash, Hasher};
21use std::io::Read;
22use std::path::PathBuf;
23use std::sync::Arc;
24
25/// Maximum number of issues to collect before stopping
26const MAX_ISSUES: usize = 1000;
27
28/// Issue severity level
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
30#[serde(rename_all = "lowercase")]
31pub enum Severity {
32    Error,
33    Warning,
34    Info,
35}
36
37impl fmt::Display for Severity {
38    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
39        match self {
40            Severity::Error => write!(f, "ERROR"),
41            Severity::Warning => write!(f, "WARNING"),
42            Severity::Info => write!(f, "INFO"),
43        }
44    }
45}
46
47/// Location in the SQL dump where an issue was found
48#[derive(Debug, Clone, Serialize)]
49pub struct Location {
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub table: Option<String>,
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub statement_index: Option<u64>,
54    #[serde(skip_serializing_if = "Option::is_none")]
55    pub approx_line: Option<u64>,
56}
57
58impl Location {
59    pub fn new() -> Self {
60        Self {
61            table: None,
62            statement_index: None,
63            approx_line: None,
64        }
65    }
66
67    pub fn with_table(mut self, table: impl Into<String>) -> Self {
68        self.table = Some(table.into());
69        self
70    }
71
72    pub fn with_statement(mut self, index: u64) -> Self {
73        self.statement_index = Some(index);
74        self
75    }
76
77    #[allow(dead_code)]
78    pub fn with_line(mut self, line: u64) -> Self {
79        self.approx_line = Some(line);
80        self
81    }
82}
83
84impl Default for Location {
85    fn default() -> Self {
86        Self::new()
87    }
88}
89
90/// A validation issue found in the SQL dump
91#[derive(Debug, Clone, Serialize)]
92pub struct ValidationIssue {
93    pub code: &'static str,
94    pub severity: Severity,
95    pub message: String,
96    #[serde(skip_serializing_if = "Option::is_none")]
97    pub location: Option<Location>,
98}
99
100impl ValidationIssue {
101    pub fn error(code: &'static str, message: impl Into<String>) -> Self {
102        Self {
103            code,
104            severity: Severity::Error,
105            message: message.into(),
106            location: None,
107        }
108    }
109
110    pub fn warning(code: &'static str, message: impl Into<String>) -> Self {
111        Self {
112            code,
113            severity: Severity::Warning,
114            message: message.into(),
115            location: None,
116        }
117    }
118
119    pub fn info(code: &'static str, message: impl Into<String>) -> Self {
120        Self {
121            code,
122            severity: Severity::Info,
123            message: message.into(),
124            location: None,
125        }
126    }
127
128    pub fn with_location(mut self, location: Location) -> Self {
129        self.location = Some(location);
130        self
131    }
132}
133
134impl fmt::Display for ValidationIssue {
135    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
136        write!(f, "{} [{}]", self.severity, self.code)?;
137        if let Some(ref loc) = self.location {
138            if let Some(ref table) = loc.table {
139                write!(f, " table={}", table)?;
140            }
141            if let Some(stmt) = loc.statement_index {
142                write!(f, " stmt={}", stmt)?;
143            }
144            if let Some(line) = loc.approx_line {
145                write!(f, " line~{}", line)?;
146            }
147        }
148        write!(f, ": {}", self.message)
149    }
150}
151
/// Validation options
///
/// Configuration consumed by [`Validator`].
#[derive(Debug, Clone)]
pub struct ValidateOptions {
    /// Path of the dump file to validate; it is opened once per validation pass.
    pub path: PathBuf,
    /// SQL dialect to parse with. `Validator::new` falls back to
    /// `SqlDialect::MySql` when this is `None`.
    pub dialect: Option<SqlDialect>,
    /// NOTE(review): not read anywhere in this module; presumably a CLI flag
    /// controlling progress display — confirm against the caller.
    pub progress: bool,
    /// NOTE(review): not read anywhere in this module; presumably makes
    /// warnings fatal at the CLI level — confirm against the caller.
    pub strict: bool,
    /// NOTE(review): not read anywhere in this module; presumably selects JSON
    /// output at the CLI level — confirm against the caller.
    pub json: bool,
    /// Per-table row cap. Once a table has more rows than this, its PK
    /// tracking is dropped and its remaining rows are not checked.
    pub max_rows_per_table: usize,
    /// Gates schema finalization and the second (data) pass, i.e. both the
    /// PK-duplicate and FK checks. When `false`, the summary reports both
    /// checks as skipped.
    pub fk_checks_enabled: bool,
    /// Optional global cap on tracked PK/FK keys for memory safety.
    /// When exceeded, PK/FK checks are skipped for the remainder of the run.
    /// If None, no limit is enforced (default).
    pub max_pk_fk_keys: Option<usize>,
}
167
/// Validation summary with collected issues
///
/// Produced by `Validator::validate` once all passes have finished.
#[derive(Debug, Serialize)]
pub struct ValidationSummary {
    /// Display form of the dialect the dump was parsed with.
    pub dialect: String,
    /// All issues that were collected (collection stops at `MAX_ISSUES`).
    pub issues: Vec<ValidationIssue>,
    /// Aggregate counts over `issues` and the scanned input.
    pub summary: SummaryStats,
    /// Per-check outcome.
    pub checks: CheckResults,
}
176
/// Aggregate counters reported in a [`ValidationSummary`].
#[derive(Debug, Serialize)]
pub struct SummaryStats {
    /// Number of collected issues with `Severity::Error`.
    pub errors: usize,
    /// Number of collected issues with `Severity::Warning`.
    pub warnings: usize,
    /// Number of collected issues with `Severity::Info`.
    pub info: usize,
    /// Number of distinct table names seen in `CREATE TABLE` statements.
    pub tables_scanned: usize,
    /// Number of statements read during the first pass.
    pub statements_scanned: u64,
}
185
/// Outcome of each individual check, reported in a [`ValidationSummary`].
#[derive(Debug, Serialize)]
pub struct CheckResults {
    /// Parser errors (`SYNTAX` issues).
    pub syntax: CheckStatus,
    /// Statements containing invalid UTF-8 (`ENCODING` warnings).
    pub encoding: CheckStatus,
    /// INSERT/COPY statements targeting tables without a `CREATE TABLE`
    /// (`DDL_MISSING_TABLE` issues).
    pub ddl_dml_consistency: CheckStatus,
    /// Duplicate primary keys (`DUPLICATE_PK` issues).
    pub pk_duplicates: CheckStatus,
    /// Foreign keys referencing missing parent rows (`FK_MISSING_PARENT` issues).
    pub fk_integrity: CheckStatus,
}
194
195#[derive(Debug, Serialize)]
196#[serde(rename_all = "lowercase")]
197pub enum CheckStatus {
198    Ok,
199    Failed(usize),
200    Skipped(String),
201}
202
203impl fmt::Display for CheckStatus {
204    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
205        match self {
206            CheckStatus::Ok => write!(f, "OK"),
207            CheckStatus::Failed(n) => write!(f, "{} issues", n),
208            CheckStatus::Skipped(reason) => write!(f, "Skipped ({})", reason),
209        }
210    }
211}
212
213impl ValidationSummary {
214    pub fn has_errors(&self) -> bool {
215        self.summary.errors > 0
216    }
217
218    pub fn has_warnings(&self) -> bool {
219        self.summary.warnings > 0
220    }
221}
222
223/// Compact primary/foreign key representation for duplicate and FK checks.
224/// We use a 64-bit hash; collision risk is negligible for realistic dumps.
225type PkHash = u64;
226
227/// Hash a list of PK/FK values into a compact 64-bit hash.
228/// Uses AHash for fast, high-quality hashing.
229fn hash_pk_values(values: &smallvec::SmallVec<[mysql_insert::PkValue; 2]>) -> PkHash {
230    let mut hasher = ahash::AHasher::default();
231
232    // Include arity (number of columns) in the hash to distinguish (1) from (1, NULL)
233    (values.len() as u8).hash(&mut hasher);
234
235    for v in values {
236        match v {
237            mysql_insert::PkValue::Int(i) => {
238                0u8.hash(&mut hasher);
239                i.hash(&mut hasher);
240            }
241            mysql_insert::PkValue::BigInt(i) => {
242                1u8.hash(&mut hasher);
243                i.hash(&mut hasher);
244            }
245            mysql_insert::PkValue::Text(s) => {
246                2u8.hash(&mut hasher);
247                s.hash(&mut hasher);
248            }
249            mysql_insert::PkValue::Null => {
250                3u8.hash(&mut hasher);
251            }
252        }
253    }
254
255    hasher.finish()
256}
257
/// Pending FK check to be validated after all PKs are loaded.
/// Uses compact hash representation to minimize memory usage.
struct PendingFkCheck {
    /// Table containing the row that holds the foreign key.
    child_table_id: TableId,
    /// Table the foreign key references.
    parent_table_id: TableId,
    /// Hash of the foreign key values, computed with `hash_pk_values` so it
    /// can be looked up in the parent's set of PK hashes.
    fk_hash: PkHash,
    /// Index (within the data pass) of the statement the row came from.
    stmt_idx: u64,
}
266
267/// Per-table tracking state for data checks.
268/// Uses hashed PK values to minimize memory usage.
269struct TableState {
270    row_count: u64,
271    /// Set of hashed PKs for duplicate and FK parent existence checks.
272    /// When None, PK/FK checks for this table are skipped (due to row or memory limits).
273    pk_values: Option<AHashSet<PkHash>>,
274    pk_column_indices: Vec<usize>,
275    pk_duplicates: u64,
276    fk_missing_parents: u64,
277}
278
279impl TableState {
280    fn new() -> Self {
281        Self {
282            row_count: 0,
283            pk_values: Some(AHashSet::new()),
284            pk_column_indices: Vec::new(),
285            pk_duplicates: 0,
286            fk_missing_parents: 0,
287        }
288    }
289
290    fn with_pk_columns(mut self, indices: Vec<usize>) -> Self {
291        self.pk_column_indices = indices;
292        self
293    }
294}
295
/// SQL dump validator
///
/// Holds the options, the collected issues and all intermediate state for a
/// single validation run; consumed by `validate`.
pub struct Validator {
    options: ValidateOptions,
    // Collected issues; `add_issue` stops appending at `MAX_ISSUES`.
    issues: Vec<ValidationIssue>,
    // Resolved dialect (`options.dialect`, or MySQL when unset).
    dialect: SqlDialect,

    // DDL/DML tracking
    tables_from_ddl: AHashSet<String>,
    tables_from_dml: Vec<(String, u64)>, // (table_name, statement_index)

    // Schema built from CREATE TABLE / ALTER TABLE statements, used for the
    // PK/FK checks. `schema` is only populated when FK checks are enabled.
    schema_builder: SchemaBuilder,
    schema: Option<Schema>,

    // Per-table state for data checks
    table_states: AHashMap<TableId, TableState>,

    // Pending FK checks (deferred until all PKs are loaded)
    pending_fk_checks: Vec<PendingFkCheck>,

    // Progress callback for byte-based progress tracking (Arc for reuse across passes)
    progress_fn: Option<Arc<dyn Fn(u64) + Send + Sync>>,

    // Counters. `statement_count` counts statements read in pass 1; the others
    // count issues of the matching code that `add_issue` actually stored.
    statement_count: u64,
    syntax_errors: usize,
    encoding_warnings: usize,
    ddl_dml_errors: usize,
    pk_errors: usize,
    fk_errors: usize,

    // Memory tracking for PK/FK checks: unique PK hashes stored, FK checks
    // queued, and whether the `max_pk_fk_keys` budget has been exceeded.
    tracked_pk_count: usize,
    tracked_fk_count: usize,
    pk_fk_checks_disabled_due_to_memory: bool,
}
332
333impl Validator {
334    pub fn new(options: ValidateOptions) -> Self {
335        Self {
336            dialect: options.dialect.unwrap_or(SqlDialect::MySql),
337            options,
338            issues: Vec::new(),
339            tables_from_ddl: AHashSet::new(),
340            tables_from_dml: Vec::new(),
341            schema_builder: SchemaBuilder::new(),
342            schema: None,
343            table_states: AHashMap::new(),
344            pending_fk_checks: Vec::new(),
345            progress_fn: None,
346            statement_count: 0,
347            syntax_errors: 0,
348            encoding_warnings: 0,
349            ddl_dml_errors: 0,
350            pk_errors: 0,
351            fk_errors: 0,
352            tracked_pk_count: 0,
353            tracked_fk_count: 0,
354            pk_fk_checks_disabled_due_to_memory: false,
355        }
356    }
357
358    /// Set a progress callback for byte-based progress tracking.
359    /// The callback receives cumulative bytes read across both validation passes.
360    pub fn with_progress<F>(mut self, f: F) -> Self
361    where
362        F: Fn(u64) + Send + Sync + 'static,
363    {
364        self.progress_fn = Some(Arc::new(f));
365        self
366    }
367
368    fn add_issue(&mut self, issue: ValidationIssue) {
369        if self.issues.len() >= MAX_ISSUES {
370            return;
371        }
372
373        match issue.severity {
374            Severity::Error => match issue.code {
375                "SYNTAX" => self.syntax_errors += 1,
376                "DDL_MISSING_TABLE" => self.ddl_dml_errors += 1,
377                "DUPLICATE_PK" => self.pk_errors += 1,
378                "FK_MISSING_PARENT" => self.fk_errors += 1,
379                _ => {}
380            },
381            Severity::Warning => {
382                if issue.code == "ENCODING" {
383                    self.encoding_warnings += 1;
384                }
385            }
386            Severity::Info => {}
387        }
388
389        self.issues.push(issue);
390    }
391
392    /// Check if we've exceeded the memory budget for PK/FK tracking.
393    /// If so, disable further checks and free existing state.
394    fn enforce_pk_fk_memory_budget(&mut self) {
395        if self.pk_fk_checks_disabled_due_to_memory {
396            return;
397        }
398
399        let Some(limit) = self.options.max_pk_fk_keys else {
400            return;
401        };
402
403        let total_tracked = self.tracked_pk_count + self.tracked_fk_count;
404        if total_tracked > limit {
405            self.pk_fk_checks_disabled_due_to_memory = true;
406
407            // Drop existing state to free memory
408            for state in self.table_states.values_mut() {
409                state.pk_values = None;
410            }
411            self.pending_fk_checks.clear();
412            self.pending_fk_checks.shrink_to_fit();
413
414            self.add_issue(ValidationIssue::warning(
415                "PK_FK_CHECKS_SKIPPED_MEMORY",
416                format!(
417                    "Skipping PK/FK checks after tracking {} keys (memory limit of {} exceeded)",
418                    total_tracked, limit
419                ),
420            ));
421        }
422    }
423
424    pub fn validate(mut self) -> anyhow::Result<ValidationSummary> {
425        let file = File::open(&self.options.path)?;
426        let file_size = file.metadata()?.len();
427        let buffer_size = determine_buffer_size(file_size);
428
429        // Pass 1 reports bytes as 0 to file_size/2 (first half of progress bar)
430        let compression = Compression::from_path(&self.options.path);
431        let reader: Box<dyn Read> = if let Some(ref cb) = self.progress_fn {
432            let cb = Arc::clone(cb);
433            let progress_reader = ProgressReader::new(file, move |bytes| {
434                // Scale to first half: 0% to 50%
435                cb(bytes / 2)
436            });
437            compression.wrap_reader(Box::new(progress_reader))
438        } else {
439            compression.wrap_reader(Box::new(file))
440        };
441
442        let mut parser = Parser::with_dialect(reader, buffer_size, self.dialect);
443
444        // Pass 1: Build schema and check DDL/DML consistency
445        loop {
446            match parser.read_statement() {
447                Ok(Some(stmt)) => {
448                    self.statement_count += 1;
449                    self.process_statement(&stmt);
450                }
451                Ok(None) => break,
452                Err(e) => {
453                    self.add_issue(
454                        ValidationIssue::error("SYNTAX", format!("Parser error: {}", e))
455                            .with_location(
456                                Location::new().with_statement(self.statement_count + 1),
457                            ),
458                    );
459                    break;
460                }
461            }
462        }
463
464        // Check for DML referencing missing tables - collect issues first, then add them
465        let missing_table_issues: Vec<_> = self
466            .tables_from_dml
467            .iter()
468            .filter(|(table, _)| {
469                let table_lower = table.to_lowercase();
470                !self
471                    .tables_from_ddl
472                    .iter()
473                    .any(|t| t.to_lowercase() == table_lower)
474            })
475            .map(|(table, stmt_idx)| {
476                ValidationIssue::error(
477                    "DDL_MISSING_TABLE",
478                    format!(
479                        "INSERT/COPY references table '{}' with no CREATE TABLE",
480                        table
481                    ),
482                )
483                .with_location(Location::new().with_table(table).with_statement(*stmt_idx))
484            })
485            .collect();
486
487        for issue in missing_table_issues {
488            self.add_issue(issue);
489        }
490
491        // Finalize schema and resolve FK references for data checks (all dialects)
492        if self.options.fk_checks_enabled {
493            self.schema = Some(self.schema_builder.build());
494            self.schema_builder = SchemaBuilder::new(); // Reset to avoid double use
495            self.initialize_table_states();
496        }
497
498        // Pass 2: Data checks (PK + collect FK refs) - requires re-reading the file
499        let schema_not_empty = self.schema.as_ref().is_some_and(|s| !s.is_empty());
500        if self.options.fk_checks_enabled && schema_not_empty {
501            self.run_data_checks()?;
502            // Now that all PKs are loaded, validate the collected FK references
503            self.validate_pending_fk_checks();
504        }
505
506        Ok(self.build_summary())
507    }
508
509    fn process_statement(&mut self, stmt: &[u8]) {
510        // Check encoding
511        if std::str::from_utf8(stmt).is_err() {
512            self.add_issue(
513                ValidationIssue::warning("ENCODING", "Statement contains invalid UTF-8 bytes")
514                    .with_location(Location::new().with_statement(self.statement_count)),
515            );
516        }
517
518        let (stmt_type, table_name) =
519            Parser::<&[u8]>::parse_statement_with_dialect(stmt, self.dialect);
520
521        match stmt_type {
522            StatementType::CreateTable => {
523                if !table_name.is_empty() {
524                    self.tables_from_ddl.insert(table_name.clone());
525
526                    // Parse CREATE TABLE for schema info (all dialects supported)
527                    if let Ok(stmt_str) = std::str::from_utf8(stmt) {
528                        self.schema_builder.parse_create_table(stmt_str);
529                    }
530                }
531            }
532            StatementType::AlterTable => {
533                // Parse ALTER TABLE for FK constraints (all dialects supported)
534                if let Ok(stmt_str) = std::str::from_utf8(stmt) {
535                    self.schema_builder.parse_alter_table(stmt_str);
536                }
537            }
538            StatementType::Insert | StatementType::Copy => {
539                if !table_name.is_empty() {
540                    self.tables_from_dml
541                        .push((table_name, self.statement_count));
542                }
543            }
544            StatementType::Unknown => {
545                // Could be a session command or comment - not an error
546            }
547            _ => {}
548        }
549    }
550
551    fn initialize_table_states(&mut self) {
552        let schema = match &self.schema {
553            Some(s) => s,
554            None => return,
555        };
556
557        for table_schema in schema.iter() {
558            let pk_indices: Vec<usize> = table_schema
559                .primary_key
560                .iter()
561                .map(|col_id| col_id.0 as usize)
562                .collect();
563
564            let state = TableState::new().with_pk_columns(pk_indices);
565            self.table_states.insert(table_schema.id, state);
566        }
567    }
568
569    fn run_data_checks(&mut self) -> anyhow::Result<()> {
570        let file = File::open(&self.options.path)?;
571        let file_size = file.metadata()?.len();
572        let buffer_size = determine_buffer_size(file_size);
573
574        // Pass 2 reports bytes as file_size/2 to file_size (second half of progress bar)
575        let compression = Compression::from_path(&self.options.path);
576        let reader: Box<dyn Read> = if let Some(ref cb) = self.progress_fn {
577            let cb = Arc::clone(cb);
578            let progress_reader = ProgressReader::new(file, move |bytes| {
579                // Scale to second half: 50% to 100%
580                cb(file_size / 2 + bytes / 2)
581            });
582            compression.wrap_reader(Box::new(progress_reader))
583        } else {
584            compression.wrap_reader(Box::new(file))
585        };
586
587        let mut parser = Parser::with_dialect(reader, buffer_size, self.dialect);
588        let mut stmt_count: u64 = 0;
589
590        while let Some(stmt) = parser.read_statement()? {
591            stmt_count += 1;
592
593            let (stmt_type, table_name) =
594                Parser::<&[u8]>::parse_statement_with_dialect(&stmt, self.dialect);
595
596            // Get table_id without holding a borrow on self.schema
597            let table_id = match &self.schema {
598                Some(s) => match s.get_table_id(&table_name) {
599                    Some(id) => id,
600                    None => continue,
601                },
602                None => continue,
603            };
604
605            match stmt_type {
606                StatementType::Insert => {
607                    // MySQL and SQLite use INSERT VALUES syntax
608                    self.check_insert_statement(&stmt, table_id, &table_name, stmt_count);
609                }
610                StatementType::Copy => {
611                    // PostgreSQL uses COPY ... FROM stdin format
612                    self.check_copy_statement(&stmt, table_id, &table_name, stmt_count);
613                }
614                _ => continue,
615            }
616        }
617
618        Ok(())
619    }
620
621    /// Check rows from a MySQL/SQLite INSERT statement
622    fn check_insert_statement(
623        &mut self,
624        stmt: &[u8],
625        table_id: TableId,
626        table_name: &str,
627        stmt_count: u64,
628    ) {
629        let table_schema = match &self.schema {
630            Some(s) => match s.table(table_id) {
631                Some(ts) => ts,
632                None => return,
633            },
634            None => return,
635        };
636
637        // Parse rows from INSERT using the schema (works for MySQL and SQLite)
638        let rows = match mysql_insert::parse_mysql_insert_rows(stmt, table_schema) {
639            Ok(r) => r,
640            Err(_) => return,
641        };
642
643        for row in rows {
644            self.check_mysql_row(table_id, table_name, &row, stmt_count);
645        }
646    }
647
648    /// Check rows from a PostgreSQL COPY statement
649    fn check_copy_statement(
650        &mut self,
651        stmt: &[u8],
652        table_id: TableId,
653        table_name: &str,
654        stmt_count: u64,
655    ) {
656        // Find the COPY header line and the data section
657        let stmt_str = match std::str::from_utf8(stmt) {
658            Ok(s) => s,
659            Err(_) => return,
660        };
661
662        // Find the data section (after the header line ending with "FROM stdin;")
663        let data_start = if let Some(pos) = stmt_str.find("FROM stdin;") {
664            pos + "FROM stdin;".len()
665        } else if let Some(pos) = stmt_str.find("from stdin;") {
666            pos + "from stdin;".len()
667        } else {
668            return;
669        };
670
671        // Skip any whitespace/newlines after the header
672        let data_section = stmt_str[data_start..].trim_start();
673        if data_section.is_empty() {
674            return;
675        }
676
677        // Parse column list from the header
678        let header = &stmt_str[..data_start];
679        let column_order = postgres_copy::parse_copy_columns(header);
680
681        // Get table schema
682        let table_schema = match &self.schema {
683            Some(s) => match s.table(table_id) {
684                Some(ts) => ts,
685                None => return,
686            },
687            None => return,
688        };
689
690        // Parse the COPY data rows
691        let rows = match postgres_copy::parse_postgres_copy_rows(
692            data_section.as_bytes(),
693            table_schema,
694            column_order,
695        ) {
696            Ok(r) => r,
697            Err(_) => return,
698        };
699
700        for row in rows {
701            self.check_copy_row(table_id, table_name, &row, stmt_count);
702        }
703    }
704
705    /// Check a row from MySQL INSERT or SQLite INSERT
706    fn check_mysql_row(
707        &mut self,
708        table_id: TableId,
709        table_name: &str,
710        row: &mysql_insert::ParsedRow,
711        stmt_idx: u64,
712    ) {
713        self.check_row_common(
714            table_id,
715            table_name,
716            row.pk.as_ref(),
717            &row.fk_values,
718            stmt_idx,
719        );
720    }
721
722    /// Check a row from PostgreSQL COPY
723    fn check_copy_row(
724        &mut self,
725        table_id: TableId,
726        table_name: &str,
727        row: &postgres_copy::ParsedCopyRow,
728        stmt_idx: u64,
729    ) {
730        self.check_row_common(
731            table_id,
732            table_name,
733            row.pk.as_ref(),
734            &row.fk_values,
735            stmt_idx,
736        );
737    }
738
739    /// Common row checking logic for all dialects
740    fn check_row_common(
741        &mut self,
742        table_id: TableId,
743        table_name: &str,
744        pk: Option<&smallvec::SmallVec<[mysql_insert::PkValue; 2]>>,
745        fk_values: &[(mysql_insert::FkRef, smallvec::SmallVec<[mysql_insert::PkValue; 2]>)],
746        stmt_idx: u64,
747    ) {
748        // Skip if memory budget exceeded
749        if self.pk_fk_checks_disabled_due_to_memory {
750            return;
751        }
752
753        let max_rows = self.options.max_rows_per_table as u64;
754
755        let state = match self.table_states.get_mut(&table_id) {
756            Some(s) => s,
757            None => return,
758        };
759
760        state.row_count += 1;
761
762        // Check if we've exceeded max rows for this table
763        if state.row_count > max_rows {
764            if state.pk_values.is_some() {
765                state.pk_values = None;
766                self.add_issue(
767                    ValidationIssue::warning(
768                        "PK_CHECK_SKIPPED",
769                        format!(
770                            "Skipping PK/FK checks for table '{}' after {} rows (increase --max-rows-per-table)",
771                            table_name, max_rows
772                        ),
773                    )
774                    .with_location(Location::new().with_table(table_name)),
775                );
776            }
777            return;
778        }
779
780        // PK duplicate check using hash-based storage (8 bytes per key instead of full values)
781        if let Some(pk_values) = pk {
782            if let Some(ref mut pk_set) = state.pk_values {
783                let pk_hash = hash_pk_values(pk_values);
784
785                if pk_set.insert(pk_hash) {
786                    // Only count unique keys
787                    self.tracked_pk_count += 1;
788                    self.enforce_pk_fk_memory_budget();
789                } else {
790                    // Duplicate detected
791                    state.pk_duplicates += 1;
792
793                    // Build human-readable display on demand (duplicates are rare)
794                    let pk_display: String = pk_values
795                        .iter()
796                        .map(|v| match v {
797                            mysql_insert::PkValue::Int(i) => i.to_string(),
798                            mysql_insert::PkValue::BigInt(i) => i.to_string(),
799                            mysql_insert::PkValue::Text(s) => s.to_string(),
800                            mysql_insert::PkValue::Null => "NULL".to_string(),
801                        })
802                        .collect::<Vec<_>>()
803                        .join(", ");
804
805                    self.add_issue(
806                        ValidationIssue::error(
807                            "DUPLICATE_PK",
808                            format!(
809                                "Duplicate primary key in table '{}': ({})",
810                                table_name, pk_display
811                            ),
812                        )
813                        .with_location(
814                            Location::new()
815                                .with_table(table_name)
816                                .with_statement(stmt_idx),
817                        ),
818                    );
819                }
820            }
821        }
822
823        // Skip FK collection if checks are disabled
824        if self.pk_fk_checks_disabled_due_to_memory {
825            return;
826        }
827
828        // Collect FK references for deferred validation (after all PKs are loaded)
829        // First, gather the FK checks into a temp vec to avoid borrow issues
830        let new_fk_checks: Vec<PendingFkCheck> = {
831            let schema = match &self.schema {
832                Some(s) => s,
833                None => return,
834            };
835
836            let table_schema = match schema.table(table_id) {
837                Some(t) => t,
838                None => return,
839            };
840
841            fk_values
842                .iter()
843                .filter_map(|(fk_ref, fk_vals)| {
844                    // Skip if all FK values are NULL (nullable FK)
845                    if fk_vals.iter().all(|v| v.is_null()) {
846                        return None;
847                    }
848
849                    let fk_def = table_schema.foreign_keys.get(fk_ref.fk_index as usize)?;
850                    let parent_table_id = fk_def.referenced_table_id?;
851
852                    // Store only the hash, not full values - saves significant memory
853                    let fk_hash = hash_pk_values(fk_vals);
854
855                    Some(PendingFkCheck {
856                        child_table_id: table_id,
857                        parent_table_id,
858                        fk_hash,
859                        stmt_idx,
860                    })
861                })
862                .collect()
863        };
864
865        // Now add the FK checks and update memory tracking
866        let new_count = new_fk_checks.len();
867        self.pending_fk_checks.extend(new_fk_checks);
868        self.tracked_fk_count += new_count;
869
870        if new_count > 0 {
871            self.enforce_pk_fk_memory_budget();
872        }
873    }
874
875    /// Validate all collected FK references after all PKs are loaded
876    fn validate_pending_fk_checks(&mut self) {
877        for check in std::mem::take(&mut self.pending_fk_checks) {
878            let parent_has_pk = self
879                .table_states
880                .get(&check.parent_table_id)
881                .and_then(|s| s.pk_values.as_ref())
882                .is_some_and(|set| set.contains(&check.fk_hash));
883
884            if !parent_has_pk {
885                let state = match self.table_states.get_mut(&check.child_table_id) {
886                    Some(s) => s,
887                    None => continue,
888                };
889                state.fk_missing_parents += 1;
890
891                // Only add issue for first few violations per table
892                if state.fk_missing_parents <= 5 {
893                    // Derive table names from the schema (not stored per FK to save memory)
894                    let (child_name, parent_name) = if let Some(schema) = &self.schema {
895                        let child = schema
896                            .table(check.child_table_id)
897                            .map(|t| t.name.clone())
898                            .unwrap_or_else(|| "<unknown>".to_string());
899                        let parent = schema
900                            .table(check.parent_table_id)
901                            .map(|t| t.name.clone())
902                            .unwrap_or_else(|| "<unknown>".to_string());
903                        (child, parent)
904                    } else {
905                        ("<unknown>".to_string(), "<unknown>".to_string())
906                    };
907
908                    self.add_issue(
909                        ValidationIssue::error(
910                            "FK_MISSING_PARENT",
911                            format!(
912                                "FK violation in '{}': references missing row in '{}'",
913                                child_name, parent_name
914                            ),
915                        )
916                        .with_location(
917                            Location::new()
918                                .with_table(child_name)
919                                .with_statement(check.stmt_idx),
920                        ),
921                    );
922                }
923            }
924        }
925    }
926
927    fn build_summary(&self) -> ValidationSummary {
928        let errors = self
929            .issues
930            .iter()
931            .filter(|i| matches!(i.severity, Severity::Error))
932            .count();
933        let warnings = self
934            .issues
935            .iter()
936            .filter(|i| matches!(i.severity, Severity::Warning))
937            .count();
938        let info = self
939            .issues
940            .iter()
941            .filter(|i| matches!(i.severity, Severity::Info))
942            .count();
943
944        let syntax_status = if self.syntax_errors > 0 {
945            CheckStatus::Failed(self.syntax_errors)
946        } else {
947            CheckStatus::Ok
948        };
949
950        let encoding_status = if self.encoding_warnings > 0 {
951            CheckStatus::Failed(self.encoding_warnings)
952        } else {
953            CheckStatus::Ok
954        };
955
956        let ddl_dml_status = if self.ddl_dml_errors > 0 {
957            CheckStatus::Failed(self.ddl_dml_errors)
958        } else {
959            CheckStatus::Ok
960        };
961
962        let pk_status = if !self.options.fk_checks_enabled {
963            CheckStatus::Skipped("--no-fk-checks".to_string())
964        } else if self.pk_fk_checks_disabled_due_to_memory {
965            CheckStatus::Skipped("memory limit exceeded".to_string())
966        } else if self.pk_errors > 0 {
967            CheckStatus::Failed(self.pk_errors)
968        } else {
969            CheckStatus::Ok
970        };
971
972        let fk_status = if !self.options.fk_checks_enabled {
973            CheckStatus::Skipped("--no-fk-checks".to_string())
974        } else if self.pk_fk_checks_disabled_due_to_memory {
975            CheckStatus::Skipped("memory limit exceeded".to_string())
976        } else if self.fk_errors > 0 {
977            CheckStatus::Failed(self.fk_errors)
978        } else {
979            CheckStatus::Ok
980        };
981
982        ValidationSummary {
983            dialect: self.dialect.to_string(),
984            issues: self.issues.clone(),
985            summary: SummaryStats {
986                errors,
987                warnings,
988                info,
989                tables_scanned: self.tables_from_ddl.len(),
990                statements_scanned: self.statement_count,
991            },
992            checks: CheckResults {
993                syntax: syntax_status,
994                encoding: encoding_status,
995                ddl_dml_consistency: ddl_dml_status,
996                pk_duplicates: pk_status,
997                fk_integrity: fk_status,
998            },
999        }
1000    }
1001}