scirs2_io/validation/
formats.rs

1//! Format-specific validation utilities
2//!
3//! This module provides validators for specific file formats used in scientific computing.
4
5use std::fs::File;
6use std::io::{BufReader, Read};
7use std::path::Path;
8
9use super::{FormatValidatorRegistry, ValidationSource};
10use crate::error::{IoError, Result};
11
/// File formats commonly encountered in scientific computing
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataFormat {
    /// Comma-separated values text
    CSV,
    /// Tab-separated values text
    TSV,
    /// JavaScript Object Notation document
    JSON,
    /// MATLAB `.mat` data file
    MATLAB,
    /// Attribute-Relation File Format (ARFF)
    ARFF,
    /// Hierarchical Data Format, version 5
    HDF5,
    /// Network Common Data Form
    NetCDF,
    /// Portable Network Graphics image
    PNG,
    /// JPEG image
    JPEG,
    /// Tagged Image File Format image
    TIFF,
    /// Waveform audio
    WAV,
}

impl DataFormat {
    /// Canonical name of the format, e.g. `"CSV"` or `"NetCDF"`
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::CSV => "CSV",
            Self::TSV => "TSV",
            Self::JSON => "JSON",
            Self::MATLAB => "MATLAB",
            Self::ARFF => "ARFF",
            Self::HDF5 => "HDF5",
            Self::NetCDF => "NetCDF",
            Self::PNG => "PNG",
            Self::JPEG => "JPEG",
            Self::TIFF => "TIFF",
            Self::WAV => "WAV",
        }
    }

    /// Look up a format by its name or a common alias, ignoring letter case
    ///
    /// Returns `None` when `name` matches no known format. The input is not trimmed, so
    /// surrounding whitespace prevents a match.
    pub fn from_str(name: &str) -> Option<Self> {
        // Upper-case spellings accepted for each format; the input is upper-cased to match.
        const ALIASES: &[(&str, DataFormat)] = &[
            ("CSV", DataFormat::CSV),
            ("TSV", DataFormat::TSV),
            ("JSON", DataFormat::JSON),
            ("MAT", DataFormat::MATLAB),
            ("MATLAB", DataFormat::MATLAB),
            ("ARFF", DataFormat::ARFF),
            ("HDF5", DataFormat::HDF5),
            ("H5", DataFormat::HDF5),
            ("NETCDF", DataFormat::NetCDF),
            ("NC", DataFormat::NetCDF),
            ("PNG", DataFormat::PNG),
            ("JPEG", DataFormat::JPEG),
            ("JPG", DataFormat::JPEG),
            ("TIFF", DataFormat::TIFF),
            ("TIF", DataFormat::TIFF),
            ("WAV", DataFormat::WAV),
        ];

        let wanted = name.to_uppercase();
        ALIASES
            .iter()
            .find(|(alias, _)| *alias == wanted.as_str())
            .map(|&(_, format)| format)
    }
}
75
76/// Get a registry with all scientific data format validators
77#[allow(dead_code)]
78pub fn get_scientific_format_validators() -> FormatValidatorRegistry {
79    let mut registry = FormatValidatorRegistry::new();
80
81    // Add format validators
82
83    // PNG validator
84    registry.add_validator("PNG", |data| {
85        data.len() >= 8 && data[0..8] == [137, 80, 78, 71, 13, 10, 26, 10]
86    });
87
88    // JPEG validator
89    registry.add_validator("JPEG", |data| {
90        data.len() >= 3 && data[0..3] == [0xFF, 0xD8, 0xFF]
91    });
92
93    // TIFF validator
94    registry.add_validator("TIFF", |data| {
95        data.len() >= 4
96            && (
97                data[0..4] == [0x49, 0x49, 0x2A, 0x00] || // Little endian
98            data[0..4] == [0x4D, 0x4D, 0x00, 0x2A]
99                // Big endian
100            )
101    });
102
103    // WAV validator
104    registry.add_validator("WAV", |data| {
105        data.len() >= 12 && &data[0..4] == b"RIFF" && &data[8..12] == b"WAVE"
106    });
107
108    // JSON validator
109    registry.add_validator("JSON", |data| {
110        if data.is_empty() {
111            return false;
112        }
113
114        // Find the first non-whitespace character
115        for (i, &byte) in data.iter().enumerate() {
116            if !byte.is_ascii_whitespace() {
117                // Check if it's { or [
118                return byte == b'{' || byte == b'[' ||
119                       // Or allow "key": value format in case it's a fragment
120                       (byte == b'"' && data.len() > i + 2 && data[i+1..].contains(&b':'));
121            }
122        }
123
124        false
125    });
126
127    // CSV validator
128    registry.add_validator("CSV", |data| {
129        // Basic validation: contains commas and has consistent structure
130        if data.is_empty() || !data.contains(&b',') {
131            return false;
132        }
133
134        // Check for newlines (files should have more than one line)
135        if !data.contains(&b'\n') && !data.contains(&b'\r') {
136            return false;
137        }
138
139        // Check for structure consistency by counting commas in first few lines
140        let mut lines = data.split(|&b| b == b'\n');
141
142        // Get the first line (skipping empty lines)
143        let first_line = lines.find(|line| !line.is_empty()).unwrap_or(&[]);
144
145        // Count commas in first line
146        let comma_count = first_line.iter().filter(|&&b| b == b',').count();
147
148        // Check that other lines have similar comma counts
149        // (allow some variation for quoted fields)
150        for line in lines.take(5) {
151            if line.is_empty() {
152                continue;
153            }
154
155            let line_comma_count = line.iter().filter(|&&b| b == b',').count();
156
157            // Allow some variation, but not too much
158            if (line_comma_count as isize - comma_count as isize).abs() > 2 {
159                return false;
160            }
161        }
162
163        true
164    });
165
166    // TSV validator
167    registry.add_validator("TSV", |data| {
168        // Similar to CSV but with tabs
169        if data.is_empty() || !data.contains(&b'\t') {
170            return false;
171        }
172
173        if !data.contains(&b'\n') && !data.contains(&b'\r') {
174            return false;
175        }
176
177        // Check for structure consistency
178        let mut lines = data.split(|&b| b == b'\n');
179
180        let first_line = lines.find(|line| !line.is_empty()).unwrap_or(&[]);
181
182        let tab_count = first_line.iter().filter(|&&b| b == b'\t').count();
183
184        for line in lines.take(5) {
185            if line.is_empty() {
186                continue;
187            }
188
189            let line_tab_count = line.iter().filter(|&&b| b == b'\t').count();
190
191            if (line_tab_count as isize - tab_count as isize).abs() > 2 {
192                return false;
193            }
194        }
195
196        true
197    });
198
199    // MATLAB .mat file validator
200    registry.add_validator("MATLAB", |data| {
201        // Check for MATLAB Level 5 MAT-file format
202        if data.len() >= 128
203            && (data[0..4] == [0x00, 0x01, 0x00, 0x00] || // Header for MATLAB versions < 7.3
204            data[0..4] == [0x00, 0x01, 0x4D, 0x49])
205        // Header for compressed MATLAB >= 7.3
206        {
207            // Check for "MATLAB" text in header
208            return data[124..128].windows(6).any(|window| window == b"MATLAB");
209        }
210
211        false
212    });
213
214    // ARFF validator
215    registry.add_validator("ARFF", |data| {
216        if data.is_empty() {
217            return false;
218        }
219
220        // Convert to string for easier parsing
221        let mut buffer = Vec::new();
222        buffer.extend_from_slice(data);
223
224        // Try to parse as UTF-8, fall back to Latin-1
225        let content = String::from_utf8(buffer).unwrap_or_else(|_| {
226            // Fall back to Latin-1 encoding
227            data.iter().map(|&b| b as char).collect()
228        });
229
230        // Check for ARFF header
231        content.to_uppercase().contains("@RELATION")
232            && content.to_uppercase().contains("@ATTRIBUTE")
233            && content.to_uppercase().contains("@DATA")
234    });
235
236    // HDF5 validator
237    registry.add_validator("HDF5", |data| {
238        data.len() >= 8 && data[0..8] == [137, 72, 68, 70, 13, 10, 26, 10]
239    });
240
241    // NetCDF validator (basic signature check)
242    registry.add_validator("NetCDF", |data| {
243        data.len() >= 4 && &data[0..4] == b"CDF\x01" || &data[0..4] == b"CDF\x02"
244    });
245
246    registry
247}
248
249/// Validate a file against a specific format
250#[allow(dead_code)]
251pub fn validate_format<P: AsRef<Path>>(path: P, format: DataFormat) -> Result<bool> {
252    let _path = path.as_ref();
253
254    // Open file
255    let file =
256        File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
257
258    // Read first 8192 bytes for format detection
259    let mut buffer = Vec::with_capacity(8192);
260    file.take(8192)
261        .read_to_end(&mut buffer)
262        .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
263
264    // Get validators
265    let registry = get_scientific_format_validators();
266
267    // Find validator for the format
268    for validator in registry.validators {
269        if validator.format_name.eq_ignore_ascii_case(format.as_str()) {
270            return Ok(validator.validate(&buffer));
271        }
272    }
273
274    Err(IoError::ValidationError(format!(
275        "No validator found for format: {}",
276        format.as_str()
277    )))
278}
279
280/// Detect the format of a file
281#[allow(dead_code)]
282pub fn detect_file_format<P: AsRef<Path>>(path: P) -> Result<Option<String>> {
283    let _path = path.as_ref();
284
285    // Use registry to validate format
286    let registry = get_scientific_format_validators();
287    registry.validate_format(ValidationSource::FilePath(_path))
288}
289
/// Outcome of validating one file against one format
///
/// Results compare equal when all four fields are equal, which makes them convenient to
/// assert on in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FormatValidationResult {
    /// `true` when the file passed validation
    pub valid: bool,
    /// Name of the format the file was checked against (e.g. `"CSV"`)
    pub format: String,
    /// Path of the validated file, converted lossily to a string
    pub file_path: String,
    /// Human-readable explanation: a failure reason or a summary of what was found;
    /// `None` when there is nothing to add
    pub details: Option<String>,
}
302
303/// Perform comprehensive format validation on a file
304///
305/// This function performs format-specific validation beyond
306/// just the basic format detection.
307#[allow(dead_code)]
308pub fn validate_file_format<P: AsRef<Path>>(
309    path: P,
310    format: DataFormat,
311) -> Result<FormatValidationResult> {
312    let path = path.as_ref();
313
314    // First check basic format signature
315    let basic_valid = validate_format(path, format)?;
316
317    if !basic_valid {
318        return Ok(FormatValidationResult {
319            valid: false,
320            format: format.as_str().to_string(),
321            file_path: path.to_string_lossy().to_string(),
322            details: Some("File does not have the correct format signature".to_string()),
323        });
324    }
325
326    // For some formats, perform more detailed validation
327    match format {
328        DataFormat::CSV => validate_csv_format(path),
329        DataFormat::JSON => validate_json_format(path),
330        DataFormat::ARFF => validate_arff_format(path),
331        DataFormat::WAV => validate_wav_format(path),
332        _ => {
333            // For other formats, basic validation is sufficient for now
334            Ok(FormatValidationResult {
335                valid: true,
336                format: format.as_str().to_string(),
337                file_path: path.to_string_lossy().to_string(),
338                details: None,
339            })
340        }
341    }
342}
343
344/// Validate CSV file structure in detail
345#[allow(dead_code)]
346fn validate_csv_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
347    let _path = path.as_ref();
348
349    // Open file
350    let file =
351        File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
352
353    let mut reader = BufReader::new(file);
354    let mut content = Vec::new();
355    reader
356        .read_to_end(&mut content)
357        .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
358
359    if content.is_empty() {
360        return Ok(FormatValidationResult {
361            valid: false,
362            format: "CSV".to_string(),
363            file_path: path.as_ref().to_string_lossy().to_string(),
364            details: Some("File is empty".to_string()),
365        });
366    }
367
368    // Check for consistent number of fields
369    let mut lines = content
370        .split(|&b| b == b'\n' || b == b'\r')
371        .filter(|line| !line.is_empty());
372
373    // Get field count from first line
374    let first_line = match lines.next() {
375        Some(line) => line,
376        None => {
377            return Ok(FormatValidationResult {
378                valid: false,
379                format: "CSV".to_string(),
380                file_path: path.as_ref().to_string_lossy().to_string(),
381                details: Some("File has no content".to_string()),
382            });
383        }
384    };
385
386    // Count fields in first line (accounting for quoted fields)
387    let first_field_count = count_csv_fields(first_line);
388
389    // Check remaining lines for consistency
390    let mut line_number = 2;
391    let mut inconsistent_lines = Vec::new();
392
393    for line in lines {
394        let field_count = count_csv_fields(line);
395
396        if field_count != first_field_count {
397            inconsistent_lines.push(line_number);
398        }
399
400        line_number += 1;
401    }
402
403    if inconsistent_lines.is_empty() {
404        Ok(FormatValidationResult {
405            valid: true,
406            format: "CSV".to_string(),
407            file_path: path.as_ref().to_string_lossy().to_string(),
408            details: Some(format!(
409                "CSV file with {} fields per line",
410                first_field_count
411            )),
412        })
413    } else {
414        // Report up to 5 inconsistent lines
415        let inconsistent_report = if inconsistent_lines.len() <= 5 {
416            format!(
417                "Lines with inconsistent field counts: {}",
418                inconsistent_lines
419                    .iter()
420                    .map(|n| n.to_string())
421                    .collect::<Vec<_>>()
422                    .join(", ")
423            )
424        } else {
425            format!(
426                "Lines with inconsistent field counts: {} (and {} more)",
427                inconsistent_lines
428                    .iter()
429                    .take(5)
430                    .map(|n| n.to_string())
431                    .collect::<Vec<_>>()
432                    .join(", "),
433                inconsistent_lines.len() - 5
434            )
435        };
436
437        Ok(FormatValidationResult {
438            valid: false,
439            format: "CSV".to_string(),
440            file_path: path.as_ref().to_string_lossy().to_string(),
441            details: Some(format!(
442                "Inconsistent field counts. First line has {} fields. {}",
443                first_field_count, inconsistent_report
444            )),
445        })
446    }
447}
448
/// Number of fields in one CSV line
///
/// Commas inside double-quoted sections do not separate fields. Every `"` flips the quoted
/// state, so a doubled quote (`""`) inside a quoted field leaves the state unchanged. An
/// empty line counts as one (empty) field.
#[allow(dead_code)]
fn count_csv_fields(line: &[u8]) -> usize {
    let (separators, _quoted) =
        line.iter()
            .fold((0usize, false), |(seen, quoted), &byte| match byte {
                b'"' => (seen, !quoted),
                b',' if !quoted => (seen + 1, quoted),
                _ => (seen, quoted),
            });

    // n separators delimit n + 1 fields
    separators + 1
}
473
474/// Validate JSON file structure in detail
475#[allow(dead_code)]
476fn validate_json_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
477    let _path = path.as_ref();
478
479    // Open and attempt to parse as JSON
480    let file =
481        File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
482
483    let reader = BufReader::new(file);
484
485    match serde_json::from_reader::<_, serde_json::Value>(reader) {
486        Ok(_) => Ok(FormatValidationResult {
487            valid: true,
488            format: "JSON".to_string(),
489            file_path: path.as_ref().to_string_lossy().to_string(),
490            details: Some("Valid JSON structure".to_string()),
491        }),
492        Err(e) => Ok(FormatValidationResult {
493            valid: false,
494            format: "JSON".to_string(),
495            file_path: path.as_ref().to_string_lossy().to_string(),
496            details: Some(format!("Invalid JSON: {}", e)),
497        }),
498    }
499}
500
501/// Validate ARFF file structure in detail
502#[allow(dead_code)]
503fn validate_arff_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
504    let _path = path.as_ref();
505
506    // Open file
507    let file =
508        File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
509
510    let mut reader = BufReader::new(file);
511    let mut content = String::new();
512    reader
513        .read_to_string(&mut content)
514        .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
515
516    // Check for required sections
517    let has_relation = content.to_uppercase().contains("@RELATION");
518    let has_attribute = content.to_uppercase().contains("@ATTRIBUTE");
519    let has_data = content.to_uppercase().contains("@DATA");
520
521    let mut details = Vec::new();
522
523    if !has_relation {
524        details.push("Missing @RELATION section".to_string());
525    }
526
527    if !has_attribute {
528        details.push("Missing @ATTRIBUTE section".to_string());
529    }
530
531    if !has_data {
532        details.push("Missing @DATA section".to_string());
533    }
534
535    if details.is_empty() {
536        // Count attributes
537        let attribute_count = content
538            .to_uppercase()
539            .lines()
540            .filter(|line| line.trim().starts_with("@ATTRIBUTE"))
541            .count();
542
543        Ok(FormatValidationResult {
544            valid: true,
545            format: "ARFF".to_string(),
546            file_path: path.as_ref().to_string_lossy().to_string(),
547            details: Some(format!(
548                "Valid ARFF file with {} attributes",
549                attribute_count
550            )),
551        })
552    } else {
553        Ok(FormatValidationResult {
554            valid: false,
555            format: "ARFF".to_string(),
556            file_path: path.as_ref().to_string_lossy().to_string(),
557            details: Some(details.join(", ")),
558        })
559    }
560}
561
562/// Validate WAV file structure in detail
563#[allow(dead_code)]
564fn validate_wav_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
565    let _path = path.as_ref();
566
567    // Open file
568    let file =
569        File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
570
571    let mut reader = BufReader::new(file);
572    let mut header = [0u8; 44]; // Standard WAV header size
573
574    // Try to read header
575    if let Err(e) = reader.read_exact(&mut header) {
576        return Ok(FormatValidationResult {
577            valid: false,
578            format: "WAV".to_string(),
579            file_path: path.as_ref().to_string_lossy().to_string(),
580            details: Some(format!("Failed to read WAV header: {}", e)),
581        });
582    }
583
584    // Check for RIFF header
585    if &header[0..4] != b"RIFF" {
586        return Ok(FormatValidationResult {
587            valid: false,
588            format: "WAV".to_string(),
589            file_path: path.as_ref().to_string_lossy().to_string(),
590            details: Some("Missing RIFF header".to_string()),
591        });
592    }
593
594    // Check for WAVE format
595    if &header[8..12] != b"WAVE" {
596        return Ok(FormatValidationResult {
597            valid: false,
598            format: "WAV".to_string(),
599            file_path: path.as_ref().to_string_lossy().to_string(),
600            details: Some("Missing WAVE format identifier".to_string()),
601        });
602    }
603
604    // Check for fmt chunk
605    if &header[12..16] != b"fmt " {
606        return Ok(FormatValidationResult {
607            valid: false,
608            format: "WAV".to_string(),
609            file_path: path.as_ref().to_string_lossy().to_string(),
610            details: Some("Missing fmt chunk".to_string()),
611        });
612    }
613
614    // Extract audio format (PCM = 1)
615    let audio_format = header[20] as u16 | ((header[21] as u16) << 8);
616    let channels = header[22] as u16 | ((header[23] as u16) << 8);
617    let sample_rate = header[24] as u32
618        | ((header[25] as u32) << 8)
619        | ((header[26] as u32) << 16)
620        | ((header[27] as u32) << 24);
621    let bits_per_sample = header[34] as u16 | ((header[35] as u16) << 8);
622
623    Ok(FormatValidationResult {
624        valid: true,
625        format: "WAV".to_string(),
626        file_path: _path.to_string_lossy().to_string(),
627        details: Some(format!(
628            "Valid WAV file: {} channels, {}Hz, {}-bit, {}",
629            channels,
630            sample_rate,
631            bits_per_sample,
632            if audio_format == 1 { "PCM" } else { "non-PCM" }
633        )),
634    })
635}