use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;
use super::{FormatValidatorRegistry, ValidationSource};
use crate::error::{IoError, Result};
/// Data formats recognised by the scientific-format validators.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataFormat {
    CSV,
    TSV,
    JSON,
    MATLAB,
    ARFF,
    HDF5,
    NetCDF,
    PNG,
    JPEG,
    TIFF,
    WAV,
}

impl DataFormat {
    /// Canonical display name for this format.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::CSV => "CSV",
            Self::TSV => "TSV",
            Self::JSON => "JSON",
            Self::MATLAB => "MATLAB",
            Self::ARFF => "ARFF",
            Self::HDF5 => "HDF5",
            Self::NetCDF => "NetCDF",
            Self::PNG => "PNG",
            Self::JPEG => "JPEG",
            Self::TIFF => "TIFF",
            Self::WAV => "WAV",
        }
    }

    /// Parses a case-insensitive format name or common alias (e.g. "jpg",
    /// "h5", "nc"). Returns `None` for unrecognised names.
    pub fn from_str(name: &str) -> Option<Self> {
        let upper = name.to_uppercase();
        match upper.as_str() {
            "CSV" => Some(Self::CSV),
            "TSV" => Some(Self::TSV),
            "JSON" => Some(Self::JSON),
            "MAT" | "MATLAB" => Some(Self::MATLAB),
            "ARFF" => Some(Self::ARFF),
            "HDF5" | "H5" => Some(Self::HDF5),
            "NETCDF" | "NC" => Some(Self::NetCDF),
            "PNG" => Some(Self::PNG),
            "JPEG" | "JPG" => Some(Self::JPEG),
            "TIFF" | "TIF" => Some(Self::TIFF),
            "WAV" => Some(Self::WAV),
            _ => None,
        }
    }
}
/// Builds the registry of content-based validators for the supported data
/// formats. Each validator inspects the leading bytes of a file and reports
/// whether they plausibly match the format.
#[allow(dead_code)]
pub fn get_scientific_format_validators() -> FormatValidatorRegistry {
    /// Shared CSV/TSV heuristic: `sep` must appear, the data must span more
    /// than one line, and each of the first few lines must have a separator
    /// count within 2 of the first non-empty line's count.
    fn looks_like_delimited_text(data: &[u8], sep: u8) -> bool {
        if data.is_empty() || !data.contains(&sep) {
            return false;
        }
        if !data.contains(&b'\n') && !data.contains(&b'\r') {
            return false;
        }
        let mut lines = data.split(|&b| b == b'\n');
        let first_line = lines.find(|line| !line.is_empty()).unwrap_or(&[]);
        let expected = first_line.iter().filter(|&&b| b == sep).count() as isize;
        lines.take(5).all(|line| {
            if line.is_empty() {
                return true;
            }
            let count = line.iter().filter(|&&b| b == sep).count() as isize;
            (count - expected).abs() <= 2
        })
    }

    let mut registry = FormatValidatorRegistry::new();
    // PNG: fixed 8-byte signature.
    registry.add_validator("PNG", |data| {
        data.len() >= 8 && data[0..8] == [137, 80, 78, 71, 13, 10, 26, 10]
    });
    // JPEG: SOI marker (FF D8) followed by the start of another marker.
    registry.add_validator("JPEG", |data| {
        data.len() >= 3 && data[0..3] == [0xFF, 0xD8, 0xFF]
    });
    // TIFF: little-endian ("II*\0") or big-endian ("MM\0*") byte-order mark.
    registry.add_validator("TIFF", |data| {
        data.len() >= 4
            && (data[0..4] == [0x49, 0x49, 0x2A, 0x00] || data[0..4] == [0x4D, 0x4D, 0x00, 0x2A])
    });
    // WAV: RIFF container whose form type is WAVE.
    registry.add_validator("WAV", |data| {
        data.len() >= 12 && &data[0..4] == b"RIFF" && &data[8..12] == b"WAVE"
    });
    // JSON: first non-whitespace byte opens an object/array, or is a quoted
    // key with a ':' somewhere after it.
    registry.add_validator("JSON", |data| {
        if data.is_empty() {
            return false;
        }
        for (i, &byte) in data.iter().enumerate() {
            if !byte.is_ascii_whitespace() {
                return byte == b'{'
                    || byte == b'['
                    || (byte == b'"' && data.len() > i + 2 && data[i + 1..].contains(&b':'));
            }
        }
        false
    });
    // CSV / TSV: same heuristic, different separator (previously two
    // copy-pasted closures).
    registry.add_validator("CSV", |data| looks_like_delimited_text(data, b','));
    registry.add_validator("TSV", |data| looks_like_delimited_text(data, b'\t'));
    // MATLAB MAT-file (Level 5): 116-byte descriptive text header beginning
    // with "MATLAB", version field at offset 124 and the endianness
    // indicator "IM"/"MI" at offset 126.
    // BUG FIX: the old code searched for "MATLAB" via 6-byte windows over
    // the 4-byte slice data[124..128] — that iterator is always empty, so
    // the validator could never return true.
    registry.add_validator("MATLAB", |data| {
        data.len() >= 128
            && data.starts_with(b"MATLAB")
            && (&data[126..128] == b"IM" || &data[126..128] == b"MI")
    });
    // ARFF: textual format that must declare @RELATION, @ATTRIBUTE, @DATA.
    registry.add_validator("ARFF", |data| {
        if data.is_empty() {
            return false;
        }
        // Uppercase once instead of once per keyword; lossy decoding is fine
        // because the keywords searched for are pure ASCII. This also drops
        // the old needless copy of `data` into a temporary Vec.
        let content = String::from_utf8_lossy(data).to_uppercase();
        content.contains("@RELATION") && content.contains("@ATTRIBUTE") && content.contains("@DATA")
    });
    // HDF5: fixed 8-byte signature ("\x89HDF\r\n\x1a\n").
    registry.add_validator("HDF5", |data| {
        data.len() >= 8 && data[0..8] == [137, 72, 68, 70, 13, 10, 26, 10]
    });
    // NetCDF classic: "CDF" followed by version byte 1 or 2.
    // BUG FIX: the old expression was `len >= 4 && a || b` — `||` binds
    // looser than `&&`, so `b` sliced data[0..4] even when fewer than four
    // bytes were available, panicking on short input.
    registry.add_validator("NetCDF", |data| {
        data.len() >= 4 && (&data[0..4] == b"CDF\x01" || &data[0..4] == b"CDF\x02")
    });
    registry
}
/// Checks whether the file at `path` matches `format` by running the
/// registered validator for that format over the file's leading bytes.
///
/// Returns an error if the file cannot be opened/read or if no validator
/// is registered for `format`.
#[allow(dead_code)]
pub fn validate_format<P: AsRef<Path>>(path: P, format: DataFormat) -> Result<bool> {
    let path = path.as_ref();
    let file =
        File::open(path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
    // Signature checks only need the first 8 KiB of the file.
    let mut buffer = Vec::with_capacity(8192);
    file.take(8192)
        .read_to_end(&mut buffer)
        .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
    let registry = get_scientific_format_validators();
    let wanted = format.as_str();
    match registry
        .validators
        .into_iter()
        .find(|v| v.format_name.eq_ignore_ascii_case(wanted))
    {
        Some(validator) => Ok(validator.validate(&buffer)),
        None => Err(IoError::ValidationError(format!(
            "No validator found for format: {}",
            format.as_str()
        ))),
    }
}
/// Delegates format detection for `path` to the validator registry
/// (see `FormatValidatorRegistry::validate_format`).
#[allow(dead_code)]
pub fn detect_file_format<P: AsRef<Path>>(path: P) -> Result<Option<String>> {
    let registry = get_scientific_format_validators();
    registry.validate_format(ValidationSource::FilePath(path.as_ref()))
}
/// Outcome of a single file-format validation run.
#[derive(Debug, Clone)]
pub struct FormatValidationResult {
    // Whether the file passed validation for the requested format.
    pub valid: bool,
    // Name of the format that was checked (e.g. "CSV", "WAV").
    pub format: String,
    // The validated file's path, lossily converted to a string.
    pub file_path: String,
    // Optional human-readable explanation of the verdict.
    pub details: Option<String>,
}
/// Validates `path` against `format`: first a cheap signature check, then a
/// deeper structural check for the formats that support one.
#[allow(dead_code)]
pub fn validate_file_format<P: AsRef<Path>>(
    path: P,
    format: DataFormat,
) -> Result<FormatValidationResult> {
    let path = path.as_ref();
    // Guard clause: bail out early when the signature does not match.
    if !validate_format(path, format)? {
        return Ok(FormatValidationResult {
            valid: false,
            format: format.as_str().to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some("File does not have the correct format signature".to_string()),
        });
    }
    // Structural validation where available; otherwise the signature check
    // alone is sufficient.
    match format {
        DataFormat::CSV => validate_csv_format(path),
        DataFormat::JSON => validate_json_format(path),
        DataFormat::ARFF => validate_arff_format(path),
        DataFormat::WAV => validate_wav_format(path),
        _ => Ok(FormatValidationResult {
            valid: true,
            format: format.as_str().to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: None,
        }),
    }
}
/// Structural CSV check: every non-empty line must have the same
/// quote-aware field count as the first non-empty line.
///
/// BUG FIX: the old code split on both '\n' and '\r' and filtered out empty
/// lines *before* numbering them, so the reported line numbers drifted in
/// CRLF files and in files containing blank lines. Lines are now split on
/// '\n' with a trailing '\r' stripped, and enumerated before filtering so
/// the report carries real 1-based line numbers.
/// NOTE(review): classic-Mac bare-'\r' line endings are no longer treated
/// as separators — assumed not to occur in practice; confirm if needed.
#[allow(dead_code)]
fn validate_csv_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
    let path = path.as_ref();
    let file =
        File::open(path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
    let mut reader = BufReader::new(file);
    let mut content = Vec::new();
    reader
        .read_to_end(&mut content)
        .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
    if content.is_empty() {
        return Ok(FormatValidationResult {
            valid: false,
            format: "CSV".to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some("File is empty".to_string()),
        });
    }
    // Enumerate first so each surviving line keeps its real position, then
    // drop empty lines.
    let mut lines = content
        .split(|&b| b == b'\n')
        .map(|line| line.strip_suffix(b"\r").unwrap_or(line))
        .enumerate()
        .filter(|(_, line)| !line.is_empty());
    let first_field_count = match lines.next() {
        Some((_, first_line)) => count_csv_fields(first_line),
        None => {
            return Ok(FormatValidationResult {
                valid: false,
                format: "CSV".to_string(),
                file_path: path.to_string_lossy().to_string(),
                details: Some("File has no content".to_string()),
            });
        }
    };
    // Collect the 1-based numbers of lines whose field count disagrees.
    let inconsistent_lines: Vec<usize> = lines
        .filter(|(_, line)| count_csv_fields(line) != first_field_count)
        .map(|(idx, _)| idx + 1)
        .collect();
    if inconsistent_lines.is_empty() {
        Ok(FormatValidationResult {
            valid: true,
            format: "CSV".to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some(format!(
                "CSV file with {} fields per line",
                first_field_count
            )),
        })
    } else {
        // Report at most the first five offending line numbers.
        let shown = inconsistent_lines
            .iter()
            .take(5)
            .map(|n| n.to_string())
            .collect::<Vec<_>>()
            .join(", ");
        let inconsistent_report = if inconsistent_lines.len() <= 5 {
            format!("Lines with inconsistent field counts: {}", shown)
        } else {
            format!(
                "Lines with inconsistent field counts: {} (and {} more)",
                shown,
                inconsistent_lines.len() - 5
            )
        };
        Ok(FormatValidationResult {
            valid: false,
            format: "CSV".to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some(format!(
                "Inconsistent field counts. First line has {} fields. {}",
                first_field_count, inconsistent_report
            )),
        })
    }
}
/// Counts the CSV fields on one line: one more than the number of commas
/// that appear outside double quotes (a quote character toggles the
/// in-quotes state; commas inside quotes are literal text).
#[allow(dead_code)]
fn count_csv_fields(line: &[u8]) -> usize {
    let mut in_quotes = false;
    let separators = line
        .iter()
        .filter(|&&b| {
            if b == b'"' {
                in_quotes = !in_quotes;
            }
            b == b',' && !in_quotes
        })
        .count();
    separators + 1
}
/// Structural JSON check: attempts a full parse with serde_json and reports
/// success or the parser's error message.
#[allow(dead_code)]
fn validate_json_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
    let path = path.as_ref();
    let file =
        File::open(path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
    let parse = serde_json::from_reader::<_, serde_json::Value>(BufReader::new(file));
    let (valid, details) = match parse {
        Ok(_) => (true, "Valid JSON structure".to_string()),
        Err(e) => (false, format!("Invalid JSON: {}", e)),
    };
    Ok(FormatValidationResult {
        valid,
        format: "JSON".to_string(),
        file_path: path.to_string_lossy().to_string(),
        details: Some(details),
    })
}
/// Structural ARFF check: the file must contain @RELATION, @ATTRIBUTE and
/// @DATA declarations (case-insensitive). On success the number of
/// @ATTRIBUTE lines is reported.
#[allow(dead_code)]
fn validate_arff_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
    let path = path.as_ref();
    let file =
        File::open(path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
    let mut reader = BufReader::new(file);
    let mut content = String::new();
    reader
        .read_to_string(&mut content)
        .map_err(|e| IoError::FileError(format!("Failed to read file: {e}")))?;
    // Uppercase once; the old code re-uppercased the entire file for every
    // keyword lookup (four full-file allocations).
    let upper = content.to_uppercase();
    let mut details = Vec::new();
    if !upper.contains("@RELATION") {
        details.push("Missing @RELATION section".to_string());
    }
    if !upper.contains("@ATTRIBUTE") {
        details.push("Missing @ATTRIBUTE section".to_string());
    }
    if !upper.contains("@DATA") {
        details.push("Missing @DATA section".to_string());
    }
    if details.is_empty() {
        let attribute_count = upper
            .lines()
            .filter(|line| line.trim().starts_with("@ATTRIBUTE"))
            .count();
        Ok(FormatValidationResult {
            valid: true,
            format: "ARFF".to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some(format!(
                "Valid ARFF file with {} attributes",
                attribute_count
            )),
        })
    } else {
        Ok(FormatValidationResult {
            valid: false,
            format: "ARFF".to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some(details.join(", ")),
        })
    }
}
/// Structural WAV check: verifies the RIFF/WAVE/"fmt " header layout and,
/// on success, reports channel count, sample rate, bit depth and whether
/// the audio format code is PCM (1).
///
/// NOTE(review): this assumes the canonical 44-byte header with the "fmt "
/// chunk immediately after the RIFF header; valid WAV files that carry
/// other chunks first are rejected — confirm whether such files matter.
#[allow(dead_code)]
fn validate_wav_format<P: AsRef<Path>>(path: P) -> Result<FormatValidationResult> {
    let path = path.as_ref();
    let file =
        File::open(path).map_err(|e| IoError::FileError(format!("Failed to open file: {e}")))?;
    let mut reader = BufReader::new(file);
    let mut header = [0u8; 44];
    if let Err(e) = reader.read_exact(&mut header) {
        return Ok(FormatValidationResult {
            valid: false,
            format: "WAV".to_string(),
            file_path: path.to_string_lossy().to_string(),
            details: Some(format!("Failed to read WAV header: {}", e)),
        });
    }
    // Local helper: builds a failure result without repeating boilerplate.
    let fail = |details: &str| FormatValidationResult {
        valid: false,
        format: "WAV".to_string(),
        file_path: path.to_string_lossy().to_string(),
        details: Some(details.to_string()),
    };
    if &header[0..4] != b"RIFF" {
        return Ok(fail("Missing RIFF header"));
    }
    if &header[8..12] != b"WAVE" {
        return Ok(fail("Missing WAVE format identifier"));
    }
    if &header[12..16] != b"fmt " {
        return Ok(fail("Missing fmt chunk"));
    }
    // fmt-chunk fields are little-endian; decode with from_le_bytes instead
    // of hand-rolled shifting (clearer and equivalent).
    let audio_format = u16::from_le_bytes([header[20], header[21]]);
    let channels = u16::from_le_bytes([header[22], header[23]]);
    let sample_rate = u32::from_le_bytes([header[24], header[25], header[26], header[27]]);
    let bits_per_sample = u16::from_le_bytes([header[34], header[35]]);
    Ok(FormatValidationResult {
        valid: true,
        format: "WAV".to_string(),
        file_path: path.to_string_lossy().to_string(),
        details: Some(format!(
            "Valid WAV file: {} channels, {}Hz, {}-bit, {}",
            channels,
            sample_rate,
            bits_per_sample,
            if audio_format == 1 { "PCM" } else { "non-PCM" }
        )),
    })
}