1use std::fs::File;
6use std::io::{BufReader, Read};
7use std::path::Path;
8
9use super::{FormatValidatorRegistry, ValidationSource};
10use crate::error::{IoError, Result};
11
/// File formats this module knows how to validate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataFormat {
    /// Comma-separated values.
    CSV,
    /// Tab-separated values.
    TSV,
    /// JSON document.
    JSON,
    /// MATLAB data file (also addressable as `"MAT"`).
    MATLAB,
    /// ARFF data file.
    ARFF,
    /// HDF5 container (also addressable as `"H5"`).
    HDF5,
    /// NetCDF file (also addressable as `"NC"`).
    NetCDF,
    /// PNG image.
    PNG,
    /// JPEG image (also addressable as `"JPG"`).
    JPEG,
    /// TIFF image (also addressable as `"TIF"`).
    TIFF,
    /// WAV audio.
    WAV,
}

impl DataFormat {
    /// Returns the canonical display name of the format.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::CSV => "CSV",
            Self::TSV => "TSV",
            Self::JSON => "JSON",
            Self::MATLAB => "MATLAB",
            Self::ARFF => "ARFF",
            Self::HDF5 => "HDF5",
            Self::NetCDF => "NetCDF",
            Self::PNG => "PNG",
            Self::JPEG => "JPEG",
            Self::TIFF => "TIFF",
            Self::WAV => "WAV",
        }
    }

    /// Looks up a format by name or common alias, ignoring letter case.
    ///
    /// Returns `None` when `name` matches no known format.
    pub fn from_str(name: &str) -> Option<Self> {
        // Upper-cased spellings accepted for each format.
        const ALIASES: &[(&str, DataFormat)] = &[
            ("CSV", DataFormat::CSV),
            ("TSV", DataFormat::TSV),
            ("JSON", DataFormat::JSON),
            ("MAT", DataFormat::MATLAB),
            ("MATLAB", DataFormat::MATLAB),
            ("ARFF", DataFormat::ARFF),
            ("HDF5", DataFormat::HDF5),
            ("H5", DataFormat::HDF5),
            ("NETCDF", DataFormat::NetCDF),
            ("NC", DataFormat::NetCDF),
            ("PNG", DataFormat::PNG),
            ("JPEG", DataFormat::JPEG),
            ("JPG", DataFormat::JPEG),
            ("TIFF", DataFormat::TIFF),
            ("TIF", DataFormat::TIFF),
            ("WAV", DataFormat::WAV),
        ];

        let wanted = name.to_uppercase();
        ALIASES
            .iter()
            .find(|(alias, _)| *alias == wanted.as_str())
            .map(|&(_, format)| format)
    }
}
75
76#[allow(dead_code)]
78pub fn get_scientific_format_validators() -> FormatValidatorRegistry {
79 let mut registry = FormatValidatorRegistry::new();
80
81 registry.add_validator("PNG", |data| {
85 data.len() >= 8 && data[0..8] == [137, 80, 78, 71, 13, 10, 26, 10]
86 });
87
88 registry.add_validator("JPEG", |data| {
90 data.len() >= 3 && data[0..3] == [0xFF, 0xD8, 0xFF]
91 });
92
93 registry.add_validator("TIFF", |data| {
95 data.len() >= 4
96 && (
97 data[0..4] == [0x49, 0x49, 0x2A, 0x00] || data[0..4] == [0x4D, 0x4D, 0x00, 0x2A]
99 )
101 });
102
103 registry.add_validator("WAV", |data| {
105 data.len() >= 12 && &data[0..4] == b"RIFF" && &data[8..12] == b"WAVE"
106 });
107
108 registry.add_validator("JSON", |data| {
110 if data.is_empty() {
111 return false;
112 }
113
114 for (i, &byte) in data.iter().enumerate() {
116 if !byte.is_ascii_whitespace() {
117 return byte == b'{' || byte == b'[' ||
119 (byte == b'"' && data.len() > i + 2 && data[i+1..].contains(&b':'));
121 }
122 }
123
124 false
125 });
126
127 registry.add_validator("CSV", |data| {
129 if data.is_empty() || !data.contains(&b',') {
131 return false;
132 }
133
134 if !data.contains(&b'\n') && !data.contains(&b'\r') {
136 return false;
137 }
138
139 let mut lines = data.split(|&b| b == b'\n');
141
142 let first_line = lines.find(|line| !line.is_empty()).unwrap_or(&[]);
144
145 let comma_count = first_line.iter().filter(|&&b| b == b',').count();
147
148 for line in lines.take(5) {
151 if line.is_empty() {
152 continue;
153 }
154
155 let line_comma_count = line.iter().filter(|&&b| b == b',').count();
156
157 if (line_comma_count as isize - comma_count as isize).abs() > 2 {
159 return false;
160 }
161 }
162
163 true
164 });
165
166 registry.add_validator("TSV", |data| {
168 if data.is_empty() || !data.contains(&b'\t') {
170 return false;
171 }
172
173 if !data.contains(&b'\n') && !data.contains(&b'\r') {
174 return false;
175 }
176
177 let mut lines = data.split(|&b| b == b'\n');
179
180 let first_line = lines.find(|line| !line.is_empty()).unwrap_or(&[]);
181
182 let tab_count = first_line.iter().filter(|&&b| b == b'\t').count();
183
184 for line in lines.take(5) {
185 if line.is_empty() {
186 continue;
187 }
188
189 let line_tab_count = line.iter().filter(|&&b| b == b'\t').count();
190
191 if (line_tab_count as isize - tab_count as isize).abs() > 2 {
192 return false;
193 }
194 }
195
196 true
197 });
198
199 registry.add_validator("MATLAB", |data| {
201 if data.len() >= 128
203 && (data[0..4] == [0x00, 0x01, 0x00, 0x00] || data[0..4] == [0x00, 0x01, 0x4D, 0x49])
205 {
207 return data[124..128].windows(6).any(|window| window == b"MATLAB");
209 }
210
211 false
212 });
213
214 registry.add_validator("ARFF", |data| {
216 if data.is_empty() {
217 return false;
218 }
219
220 let mut buffer = Vec::new();
222 buffer.extend_from_slice(data);
223
224 let content = String::from_utf8(buffer).unwrap_or_else(|_| {
226 data.iter().map(|&b| b as char).collect()
228 });
229
230 content.to_uppercase().contains("@RELATION")
232 && content.to_uppercase().contains("@ATTRIBUTE")
233 && content.to_uppercase().contains("@DATA")
234 });
235
236 registry.add_validator("HDF5", |data| {
238 data.len() >= 8 && data[0..8] == [137, 72, 68, 70, 13, 10, 26, 10]
239 });
240
241 registry.add_validator("NetCDF", |data| {
243 data.len() >= 4 && &data[0..4] == b"CDF\x01" || &data[0..4] == b"CDF\x02"
244 });
245
246 registry
247}
248
249#[allow(dead_code)]
251pub fn validate_format<P: AsRef<Path>>(path: P, format: DataFormat) -> Result<bool> {
252 let _path = path.as_ref();
253
254 let file =
256 File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
257
258 let mut buffer = Vec::with_capacity(8192);
260 file.take(8192)
261 .read_to_end(&mut buffer)
262 .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
263
264 let registry = get_scientific_format_validators();
266
267 for validator in registry.validators {
269 if validator.format_name.eq_ignore_ascii_case(format.as_str()) {
270 return Ok(validator.validate(&buffer));
271 }
272 }
273
274 Err(IoError::ValidationError(format!(
275 "No validator found for format: {}",
276 format.as_str()
277 )))
278}
279
280#[allow(dead_code)]
282pub fn detect_file_format<P: AsRef<Path>>(path: P) -> Result<Option<String>> {
283 let _path = path.as_ref();
284
285 let registry = get_scientific_format_validators();
287 registry.validate_format(ValidationSource::FilePath(_path))
288}
289
/// Outcome of validating one file against one data format.
#[derive(Debug, Clone)]
pub struct FormatValidationResult {
    /// Whether the file passed validation for `format`.
    pub valid: bool,
    /// Name of the format that was checked (for example `"CSV"`).
    pub format: String,
    /// Path of the validated file, rendered lossily as a string.
    pub file_path: String,
    /// Optional human-readable summary or failure description.
    pub details: Option<String>,
}
302
303#[allow(dead_code)]
308pub fn validate_file_format<P: AsRef<Path>>(
309 path: P,
310 format: DataFormat,
311) -> Result<FormatValidationResult> {
312 let path = path.as_ref();
313
314 let basic_valid = validate_format(path, format)?;
316
317 if !basic_valid {
318 return Ok(FormatValidationResult {
319 valid: false,
320 format: format.as_str().to_string(),
321 file_path: path.to_string_lossy().to_string(),
322 details: Some("File does not have the correct format signature".to_string()),
323 });
324 }
325
326 match format {
328 DataFormat::CSV => validate_csv_format(path),
329 DataFormat::JSON => validate_json_format(path),
330 DataFormat::ARFF => validate_arff_format(path),
331 DataFormat::WAV => validate_wav_format(path),
332 _ => {
333 Ok(FormatValidationResult {
335 valid: true,
336 format: format.as_str().to_string(),
337 file_path: path.to_string_lossy().to_string(),
338 details: None,
339 })
340 }
341 }
342}
343
344#[allow(dead_code)]
346fn validate_csv_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
347 let _path = path.as_ref();
348
349 let file =
351 File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
352
353 let mut reader = BufReader::new(file);
354 let mut content = Vec::new();
355 reader
356 .read_to_end(&mut content)
357 .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
358
359 if content.is_empty() {
360 return Ok(FormatValidationResult {
361 valid: false,
362 format: "CSV".to_string(),
363 file_path: path.as_ref().to_string_lossy().to_string(),
364 details: Some("File is empty".to_string()),
365 });
366 }
367
368 let mut lines = content
370 .split(|&b| b == b'\n' || b == b'\r')
371 .filter(|line| !line.is_empty());
372
373 let first_line = match lines.next() {
375 Some(line) => line,
376 None => {
377 return Ok(FormatValidationResult {
378 valid: false,
379 format: "CSV".to_string(),
380 file_path: path.as_ref().to_string_lossy().to_string(),
381 details: Some("File has no content".to_string()),
382 });
383 }
384 };
385
386 let first_field_count = count_csv_fields(first_line);
388
389 let mut line_number = 2;
391 let mut inconsistent_lines = Vec::new();
392
393 for line in lines {
394 let field_count = count_csv_fields(line);
395
396 if field_count != first_field_count {
397 inconsistent_lines.push(line_number);
398 }
399
400 line_number += 1;
401 }
402
403 if inconsistent_lines.is_empty() {
404 Ok(FormatValidationResult {
405 valid: true,
406 format: "CSV".to_string(),
407 file_path: path.as_ref().to_string_lossy().to_string(),
408 details: Some(format!(
409 "CSV file with {} fields per line",
410 first_field_count
411 )),
412 })
413 } else {
414 let inconsistent_report = if inconsistent_lines.len() <= 5 {
416 format!(
417 "Lines with inconsistent field counts: {}",
418 inconsistent_lines
419 .iter()
420 .map(|n| n.to_string())
421 .collect::<Vec<_>>()
422 .join(", ")
423 )
424 } else {
425 format!(
426 "Lines with inconsistent field counts: {} (and {} more)",
427 inconsistent_lines
428 .iter()
429 .take(5)
430 .map(|n| n.to_string())
431 .collect::<Vec<_>>()
432 .join(", "),
433 inconsistent_lines.len() - 5
434 )
435 };
436
437 Ok(FormatValidationResult {
438 valid: false,
439 format: "CSV".to_string(),
440 file_path: path.as_ref().to_string_lossy().to_string(),
441 details: Some(format!(
442 "Inconsistent field counts. First line has {} fields. {}",
443 first_field_count, inconsistent_report
444 )),
445 })
446 }
447}
448
/// Counts the comma-separated fields in a single CSV line.
///
/// Commas between double quotes do not separate fields. Every `"` toggles the
/// quoted state, so a doubled quote (`""`) inside a quoted field leaves the
/// state unchanged. An empty line counts as one field.
#[allow(dead_code)]
fn count_csv_fields(line: &[u8]) -> usize {
    let (separators, _still_quoted) =
        line.iter()
            .fold((0usize, false), |(seen, quoted), &byte| match byte {
                b'"' => (seen, !quoted),
                b',' if !quoted => (seen + 1, quoted),
                _ => (seen, quoted),
            });

    // N separators delimit N + 1 fields.
    separators + 1
}
473
474#[allow(dead_code)]
476fn validate_json_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
477 let _path = path.as_ref();
478
479 let file =
481 File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
482
483 let reader = BufReader::new(file);
484
485 match serde_json::from_reader::<_, serde_json::Value>(reader) {
486 Ok(_) => Ok(FormatValidationResult {
487 valid: true,
488 format: "JSON".to_string(),
489 file_path: path.as_ref().to_string_lossy().to_string(),
490 details: Some("Valid JSON structure".to_string()),
491 }),
492 Err(e) => Ok(FormatValidationResult {
493 valid: false,
494 format: "JSON".to_string(),
495 file_path: path.as_ref().to_string_lossy().to_string(),
496 details: Some(format!("Invalid JSON: {}", e)),
497 }),
498 }
499}
500
501#[allow(dead_code)]
503fn validate_arff_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
504 let _path = path.as_ref();
505
506 let file =
508 File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
509
510 let mut reader = BufReader::new(file);
511 let mut content = String::new();
512 reader
513 .read_to_string(&mut content)
514 .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
515
516 let has_relation = content.to_uppercase().contains("@RELATION");
518 let has_attribute = content.to_uppercase().contains("@ATTRIBUTE");
519 let has_data = content.to_uppercase().contains("@DATA");
520
521 let mut details = Vec::new();
522
523 if !has_relation {
524 details.push("Missing @RELATION section".to_string());
525 }
526
527 if !has_attribute {
528 details.push("Missing @ATTRIBUTE section".to_string());
529 }
530
531 if !has_data {
532 details.push("Missing @DATA section".to_string());
533 }
534
535 if details.is_empty() {
536 let attribute_count = content
538 .to_uppercase()
539 .lines()
540 .filter(|line| line.trim().starts_with("@ATTRIBUTE"))
541 .count();
542
543 Ok(FormatValidationResult {
544 valid: true,
545 format: "ARFF".to_string(),
546 file_path: path.as_ref().to_string_lossy().to_string(),
547 details: Some(format!(
548 "Valid ARFF file with {} attributes",
549 attribute_count
550 )),
551 })
552 } else {
553 Ok(FormatValidationResult {
554 valid: false,
555 format: "ARFF".to_string(),
556 file_path: path.as_ref().to_string_lossy().to_string(),
557 details: Some(details.join(", ")),
558 })
559 }
560}
561
562#[allow(dead_code)]
564fn validate_wav_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
565 let _path = path.as_ref();
566
567 let file =
569 File::open(_path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
570
571 let mut reader = BufReader::new(file);
572 let mut header = [0u8; 44]; if let Err(e) = reader.read_exact(&mut header) {
576 return Ok(FormatValidationResult {
577 valid: false,
578 format: "WAV".to_string(),
579 file_path: path.as_ref().to_string_lossy().to_string(),
580 details: Some(format!("Failed to read WAV header: {}", e)),
581 });
582 }
583
584 if &header[0..4] != b"RIFF" {
586 return Ok(FormatValidationResult {
587 valid: false,
588 format: "WAV".to_string(),
589 file_path: path.as_ref().to_string_lossy().to_string(),
590 details: Some("Missing RIFF header".to_string()),
591 });
592 }
593
594 if &header[8..12] != b"WAVE" {
596 return Ok(FormatValidationResult {
597 valid: false,
598 format: "WAV".to_string(),
599 file_path: path.as_ref().to_string_lossy().to_string(),
600 details: Some("Missing WAVE format identifier".to_string()),
601 });
602 }
603
604 if &header[12..16] != b"fmt " {
606 return Ok(FormatValidationResult {
607 valid: false,
608 format: "WAV".to_string(),
609 file_path: path.as_ref().to_string_lossy().to_string(),
610 details: Some("Missing fmt chunk".to_string()),
611 });
612 }
613
614 let audio_format = header[20] as u16 | ((header[21] as u16) << 8);
616 let channels = header[22] as u16 | ((header[23] as u16) << 8);
617 let sample_rate = header[24] as u32
618 | ((header[25] as u32) << 8)
619 | ((header[26] as u32) << 16)
620 | ((header[27] as u32) << 24);
621 let bits_per_sample = header[34] as u16 | ((header[35] as u16) << 8);
622
623 Ok(FormatValidationResult {
624 valid: true,
625 format: "WAV".to_string(),
626 file_path: _path.to_string_lossy().to_string(),
627 details: Some(format!(
628 "Valid WAV file: {} channels, {}Hz, {}-bit, {}",
629 channels,
630 sample_rate,
631 bits_per_sample,
632 if audio_format == 1 { "PCM" } else { "non-PCM" }
633 )),
634 })
635}