use std::{
collections::HashMap,
fs::File,
path::{Path, PathBuf},
};
use walkdir::WalkDir;
use crate::Error;
/// File-name suffixes (without the leading dot) that are appended to a sample
/// key when looking a file up in the container index.
///
/// `find_matching_file` tries the entries in the order listed here and returns
/// on the first hit, so earlier suffixes take precedence over later ones.
/// The `camera.*` entries are compound suffixes such as `camera.jpeg`.
pub const IMAGE_EXTENSIONS: &[&str] = &[
"jpg",
"jpeg",
"png",
"camera.jpeg",
"camera.png",
"camera.jpg",
];
/// Reads an Arrow IPC file and maps each distinct sample name to the relative
/// path its image is expected to have.
///
/// The file must contain a string column called `name`. An optional `frame`
/// column is read as `u64`, falling back to `u32`; any other type (or a null
/// value) is treated as "no frame".
///
/// * With a frame, the path is `<name>/<name>_<frame padded to 3 digits>.camera.jpeg`.
/// * Without a frame, the path is `<name>.camera.jpeg`.
///
/// Rows whose name is null are skipped, and only the first row seen for a
/// given name contributes an entry.
///
/// # Errors
///
/// Returns `Error::InvalidParameters` when the file cannot be opened or
/// parsed, or when the `name` column is missing or not a string column.
#[cfg(feature = "polars")]
pub fn resolve_arrow_files(arrow_path: &Path) -> Result<HashMap<String, PathBuf>, Error> {
    use polars::prelude::*;

    let mut reader = File::open(arrow_path).map_err(|e| {
        Error::InvalidParameters(format!("Cannot open Arrow file {:?}: {}", arrow_path, e))
    })?;
    let table = IpcReader::new(&mut reader).finish().map_err(|e| {
        Error::InvalidParameters(format!("Failed to read Arrow file {:?}: {}", arrow_path, e))
    })?;

    let name_column = table
        .column("name")
        .map_err(|e| Error::InvalidParameters(format!("Missing 'name' column: {}", e)))?;
    let sample_names = name_column
        .str()
        .map_err(|e| Error::InvalidParameters(format!("Invalid 'name' column type: {}", e)))?;
    let frame_column = table.column("frame").ok();

    // Frame number stored at `row`, if the column exists and holds a non-null
    // u64 (preferred) or u32 value there.
    let frame_at = |row: usize| -> Option<u64> {
        let column = frame_column?;
        match column.u64().ok().and_then(|values| values.get(row)) {
            Some(value) => Some(value),
            None => column
                .u32()
                .ok()
                .and_then(|values| values.get(row))
                .map(u64::from),
        }
    };

    let mut paths_by_name: HashMap<String, PathBuf> = HashMap::new();
    for row in 0..table.height() {
        let Some(sample) = sample_names.get(row) else {
            continue;
        };
        // First occurrence of a name wins; later rows are ignored.
        if paths_by_name.contains_key(sample) {
            continue;
        }
        let relative = match frame_at(row) {
            Some(number) => {
                PathBuf::from(sample).join(format!("{}_{:03}.camera.jpeg", sample, number))
            }
            None => PathBuf::from(format!("{}.camera.jpeg", sample)),
        };
        paths_by_name.insert(sample.to_string(), relative);
    }
    Ok(paths_by_name)
}
/// One sample listed in an Arrow file, paired with the result of looking it
/// up inside a sensor container directory.
#[derive(Debug, Clone)]
pub struct ResolvedFile {
/// Sample name as read from the `name` column.
pub name: String,
/// Frame number from the `frame` column, when one was present for the row.
pub frame: Option<u64>,
/// File found in the container for this sample, or `None` if no match was found.
pub path: Option<PathBuf>,
/// Relative path the sample is expected to have by naming convention
/// (`<name>/<name>_<frame>.camera.jpeg` or `<name>.camera.jpeg`).
pub expected_path: PathBuf,
}
/// Reads the samples listed in an Arrow IPC file and tries to locate each one
/// inside `sensor_container`.
///
/// The Arrow file must contain a string `name` column. An optional `frame`
/// column is read as `u64`, falling back to `u32`; other types and null values
/// are treated as "no frame". Rows with a null name are skipped.
///
/// Each distinct `(name, frame)` pair produces exactly one [`ResolvedFile`],
/// in first-seen order. `ResolvedFile::path` is `None` when no file matching
/// the sample was found in the container.
///
/// # Errors
///
/// Returns `Error::InvalidParameters` when the Arrow file cannot be opened or
/// parsed, or when the `name` column is missing or not a string column.
/// Errors from indexing the container are propagated unchanged.
#[cfg(feature = "polars")]
pub fn resolve_files_with_container(
    arrow_path: &Path,
    sensor_container: &Path,
) -> Result<Vec<ResolvedFile>, Error> {
    use polars::prelude::*;

    let mut file = File::open(arrow_path).map_err(|e| {
        Error::InvalidParameters(format!("Cannot open Arrow file {:?}: {}", arrow_path, e))
    })?;
    let df = IpcReader::new(&mut file).finish().map_err(|e| {
        Error::InvalidParameters(format!("Failed to read Arrow file {:?}: {}", arrow_path, e))
    })?;
    let file_index = build_file_index(sensor_container)?;

    let names = df
        .column("name")
        .map_err(|e| Error::InvalidParameters(format!("Missing 'name' column: {}", e)))?
        .str()
        .map_err(|e| Error::InvalidParameters(format!("Invalid 'name' column type: {}", e)))?;

    // Downcast the optional frame column once instead of once per row.
    // At most one of the two views is `Some`.
    let frames = df.column("frame").ok();
    let frames_u64 = frames.and_then(|col| col.u64().ok());
    let frames_u32 = frames.and_then(|col| col.u32().ok());

    let mut result = Vec::new();
    // Keyed on the (name, frame) pair rather than a joined string: a joined
    // "name_frame" key would make the frameless sample "a_1" collide with
    // frame 1 of sample "a" and silently drop one of them.
    let mut seen_samples: std::collections::HashSet<(String, Option<u64>)> =
        std::collections::HashSet::new();

    for idx in 0..df.height() {
        let Some(name) = names.get(idx) else {
            continue;
        };
        let frame = frames_u64
            .and_then(|values| values.get(idx))
            .or_else(|| frames_u32.and_then(|values| values.get(idx)).map(u64::from));

        // `insert` returns false when the pair was already present.
        if !seen_samples.insert((name.to_string(), frame)) {
            continue;
        }

        let expected_path = match frame {
            Some(frame_num) => {
                PathBuf::from(name).join(format!("{}_{:03}.camera.jpeg", name, frame_num))
            }
            None => PathBuf::from(format!("{}.camera.jpeg", name)),
        };
        let actual_path = find_matching_file(&file_index, name, frame);
        result.push(ResolvedFile {
            name: name.to_string(),
            frame,
            path: actual_path,
            expected_path,
        });
    }
    Ok(result)
}
/// Walks `root` recursively and builds a case-insensitive lookup table from
/// file names to the paths they were found at.
///
/// Every regular file with a UTF-8 name contributes up to two lowercase keys:
///
/// * its full file name — a later file with the same name replaces an earlier one;
/// * its stem with a trailing `.camera` removed — only the first file to
///   claim a stem is kept.
///
/// Directory entries that fail to read are skipped. A `root` that does not
/// exist yields an empty index rather than an error.
fn build_file_index(root: &Path) -> Result<HashMap<String, PathBuf>, Error> {
    let mut lookup = HashMap::new();
    if root.exists() {
        let files = WalkDir::new(root)
            .into_iter()
            .filter_map(Result::ok)
            .filter(|entry| entry.file_type().is_file());
        for entry in files {
            let file_path = entry.path();
            // Files whose name is not valid UTF-8 cannot be keyed; skip them.
            let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) else {
                continue;
            };
            lookup.insert(file_name.to_lowercase(), file_path.to_path_buf());

            let Some(stem) = file_path.file_stem().and_then(|s| s.to_str()) else {
                continue;
            };
            let bare_stem = stem.strip_suffix(".camera").unwrap_or(stem).to_lowercase();
            lookup
                .entry(bare_stem)
                .or_insert_with(|| file_path.to_path_buf());
        }
    }
    Ok(lookup)
}
/// Looks up the file belonging to a sample in a lowercase-keyed file index.
///
/// The base key is `name` lowercased, or `<name>_<frame padded to 3 digits>`
/// lowercased when a frame is given. The base key is first tried with each
/// suffix in `IMAGE_EXTENSIONS` (in order), then on its own. The first hit is
/// returned as an owned path; `None` means nothing matched.
fn find_matching_file(
    index: &HashMap<String, PathBuf>,
    name: &str,
    frame: Option<u64>,
) -> Option<PathBuf> {
    let base_key = if let Some(number) = frame {
        format!("{}_{:03}", name, number).to_lowercase()
    } else {
        name.to_lowercase()
    };

    IMAGE_EXTENSIONS
        .iter()
        .find_map(|suffix| index.get(&format!("{}.{}", base_key, suffix)))
        // No suffixed entry: fall back to the bare key.
        .or_else(|| index.get(&base_key))
        .cloned()
}
/// A single problem found while checking a dataset directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationIssue {
    /// The Arrow file was not found at `expected`.
    MissingArrowFile { expected: PathBuf },
    /// The sensor container directory was not found at `expected`.
    MissingSensorContainer { expected: PathBuf },
    /// Sample `name` has no matching file; `expected` is its conventional path.
    MissingFile { name: String, expected: PathBuf },
    /// A file at `path` in the container is not referenced by any sample.
    UnreferencedFile { path: PathBuf },
    /// Any other structural problem, described by `message`.
    InvalidStructure { message: String },
}

/// Human-readable one-line description of each issue; paths are rendered
/// with their `Debug` formatting (quoted).
impl std::fmt::Display for ValidationIssue {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::MissingArrowFile { expected: target } => {
                write!(f, "Missing Arrow file: {:?}", target)
            }
            Self::MissingSensorContainer { expected: target } => {
                write!(f, "Missing sensor container directory: {:?}", target)
            }
            Self::MissingFile {
                name: sample,
                expected: target,
            } => write!(f, "Missing file for sample '{}': {:?}", sample, target),
            Self::UnreferencedFile { path: stray } => {
                write!(f, "Unreferenced file in container: {:?}", stray)
            }
            Self::InvalidStructure { message: detail } => {
                write!(f, "Invalid structure: {}", detail)
            }
        }
    }
}
/// Checks that a dataset directory follows the expected on-disk layout and
/// returns every problem found.
///
/// For a directory named `<name>` the layout is `<name>/<name>.arrow` next to
/// a container directory `<name>/<name>/`. The checks run in this order:
///
/// 1. If the Arrow file is missing, a single `MissingArrowFile` issue is
///    returned and nothing else is checked.
/// 2. If the container is missing, a single `MissingSensorContainer` issue is
///    returned and nothing else is checked.
/// 3. Every sample from the Arrow file that has no matching file yields a
///    `MissingFile` issue.
/// 4. Every file in the container with a `jpg`, `jpeg`, `png`, `pcd` or `bin`
///    extension (case-insensitive) that no sample resolved to yields an
///    `UnreferencedFile` issue.
///
/// # Errors
///
/// Returns `Error::InvalidParameters` when `dataset_dir` has no UTF-8 final
/// component, and propagates errors from reading the Arrow file.
#[cfg(feature = "polars")]
pub fn validate_dataset_structure(dataset_dir: &Path) -> Result<Vec<ValidationIssue>, Error> {
    const TRACKED_EXTENSIONS: [&str; 5] = ["jpg", "jpeg", "png", "pcd", "bin"];

    let Some(dataset_name) = dataset_dir.file_name().and_then(|n| n.to_str()) else {
        return Err(Error::InvalidParameters(
            "Invalid dataset directory path".to_owned(),
        ));
    };

    let arrow_path = dataset_dir.join(format!("{}.arrow", dataset_name));
    if !arrow_path.exists() {
        return Ok(vec![ValidationIssue::MissingArrowFile {
            expected: arrow_path,
        }]);
    }

    let container_path = dataset_dir.join(dataset_name);
    if !container_path.exists() {
        return Ok(vec![ValidationIssue::MissingSensorContainer {
            expected: container_path,
        }]);
    }

    let mut issues = Vec::new();
    let mut referenced: std::collections::HashSet<PathBuf> = std::collections::HashSet::new();
    for sample in resolve_files_with_container(&arrow_path, &container_path)? {
        match sample.path {
            Some(found) => {
                referenced.insert(found);
            }
            None => issues.push(ValidationIssue::MissingFile {
                name: sample.name,
                expected: sample.expected_path,
            }),
        }
    }

    // Second pass: anything on disk with a tracked extension that no sample
    // resolved to is reported as unreferenced.
    for entry in WalkDir::new(&container_path)
        .into_iter()
        .filter_map(Result::ok)
    {
        if !entry.file_type().is_file() {
            continue;
        }
        let candidate = entry.path();
        let tracked = candidate
            .extension()
            .and_then(|ext| ext.to_str())
            .map_or(false, |ext| {
                TRACKED_EXTENSIONS.contains(&ext.to_lowercase().as_str())
            });
        if tracked && !referenced.contains(candidate) {
            issues.push(ValidationIssue::UnreferencedFile {
                path: candidate.to_path_buf(),
            });
        }
    }
    Ok(issues)
}
/// Scans `folder` recursively and writes an Arrow IPC file at `output`
/// listing every sensor file found.
///
/// Files are selected by extension (`jpg`, `jpeg`, `png`, `pcd`, `bin`,
/// case-insensitive). Each selected file becomes one row with two columns:
/// `name` (string) and `frame` (optional `u64`), as produced by
/// `parse_image_filename`. When `detect_sequences` is `false` every `frame`
/// is null.
///
/// Rows are written in sorted path order so that the output does not depend
/// on the order in which the filesystem enumerates entries. Missing parent
/// directories of `output` are created.
///
/// Returns the number of rows written.
///
/// # Errors
///
/// Returns `Error::InvalidParameters` when no matching file is found or the
/// Arrow data cannot be written. I/O and data-frame construction errors are
/// propagated through `Error`'s existing conversions.
#[cfg(feature = "polars")]
pub fn generate_arrow_from_folder(
    folder: &Path,
    output: &Path,
    detect_sequences: bool,
) -> Result<usize, Error> {
    use polars::prelude::*;
    use std::io::{BufWriter, Write as _};

    let mut image_files: Vec<PathBuf> = WalkDir::new(folder)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
        .filter(|e| {
            e.path()
                .extension()
                .and_then(|ext| ext.to_str())
                .map(|ext| {
                    matches!(
                        ext.to_lowercase().as_str(),
                        "jpg" | "jpeg" | "png" | "pcd" | "bin"
                    )
                })
                .unwrap_or(false)
        })
        .map(|e| e.path().to_path_buf())
        .collect();
    if image_files.is_empty() {
        return Err(Error::InvalidParameters(
            "No image files found in folder".to_owned(),
        ));
    }
    // Directory enumeration order is platform- and filesystem-dependent;
    // sorting makes repeated runs over the same folder produce the same file.
    image_files.sort();

    let (names, frames): (Vec<String>, Vec<Option<u64>>) = image_files
        .iter()
        .map(|path| parse_image_filename(path, folder, detect_sequences))
        .unzip();

    let name_series = Series::new("name".into(), &names);
    let frame_series = Series::new("frame".into(), &frames);
    let mut df = DataFrame::new_infer_height(vec![name_series.into(), frame_series.into()])?;

    if let Some(parent) = output.parent() {
        std::fs::create_dir_all(parent)?;
    }
    let mut writer = BufWriter::new(File::create(output)?);
    // Lend the writer to polars so it can be flushed explicitly afterwards.
    IpcWriter::new(&mut writer)
        .finish(&mut df)
        .map_err(|e| Error::InvalidParameters(format!("Failed to write Arrow file: {}", e)))?;
    // Dropping a BufWriter flushes it but discards any error; flush here so a
    // failed final write is reported instead of leaving a truncated file.
    writer.flush()?;

    Ok(image_files.len())
}
/// Derives a sample name and optional frame number from a file path.
///
/// The file stem is taken from `path` (falling back to `"unknown"` when it is
/// absent or not valid UTF-8) and a trailing `.camera` is removed, so
/// `sample.camera.jpeg` yields the stem `sample`.
///
/// When `detect_sequences` is `true` and the stem has the form
/// `<name>_<digits>` with a non-empty `<name>`, the result is
/// `(<name>, Some(<digits as u64>))`. In every other case — including a
/// digit run too large for `u64` — the whole stem is returned with no frame.
///
/// `root` is accepted for signature compatibility and is not consulted: the
/// location of the file relative to the root does not affect the result.
fn parse_image_filename(path: &Path, root: &Path, detect_sequences: bool) -> (String, Option<u64>) {
    // Previously a path relative to `root` was computed here, but both
    // outcomes of the check on it returned the same value.
    let _ = root;

    let stem = path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("unknown");
    let clean_stem = stem.strip_suffix(".camera").unwrap_or(stem);

    if detect_sequences {
        if let Some((name_part, frame_part)) = clean_stem.rsplit_once('_') {
            // Only plain decimal digits count as a frame: `str::parse::<u64>`
            // alone would also accept a leading '+', e.g. "seq_+5".
            let is_plain_number =
                !frame_part.is_empty() && frame_part.bytes().all(|b| b.is_ascii_digit());
            // A stem such as "_001" has no name; treat it as a standalone file
            // rather than producing a sample with an empty name.
            if !name_part.is_empty() && is_plain_number {
                if let Ok(frame) = frame_part.parse::<u64>() {
                    return (name_part.to_string(), Some(frame));
                }
            }
        }
    }
    (clean_stem.to_string(), None)
}
/// Returns the sensor container path for a dataset directory:
/// `<dataset_dir>/<final component of dataset_dir>`.
///
/// Returns `None` when `dataset_dir` has no final component or that component
/// is not valid UTF-8. The filesystem is not touched.
pub fn get_sensor_container_path(dataset_dir: &Path) -> Option<PathBuf> {
    dataset_dir
        .file_name()
        .and_then(|component| component.to_str())
        .map(|dataset_name| dataset_dir.join(dataset_name))
}
/// Returns the Arrow file path for a dataset directory:
/// `<dataset_dir>/<final component of dataset_dir>.arrow`.
///
/// Returns `None` when `dataset_dir` has no final component or that component
/// is not valid UTF-8. The filesystem is not touched.
pub fn get_arrow_path(dataset_dir: &Path) -> Option<PathBuf> {
    dataset_dir
        .file_name()
        .and_then(|component| component.to_str())
        .map(|dataset_name| dataset_dir.join(format!("{}.arrow", dataset_name)))
}
// Unit tests for path helpers, filename parsing, container indexing and
// (behind the `polars` feature) Arrow generation, resolution and validation.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::TempDir;
/// Writes a small hard-coded JPEG byte sequence (starts with the FF D8
/// marker, ends with FF D9) to `path`, creating parent directories first.
/// Panics on any I/O failure, which is acceptable in test setup.
fn create_test_image(path: &Path) {
let jpeg_data: &[u8] = &[
0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00,
0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0xFF, 0xDB, 0x00, 0x43, 0x00, 0x08, 0x06, 0x06,
0x07, 0x06, 0x05, 0x08, 0x07, 0x07, 0x07, 0x09, 0x09, 0x08, 0x0A, 0x0C, 0x14, 0x0D,
0x0C, 0x0B, 0x0B, 0x0C, 0x19, 0x12, 0x13, 0x0F, 0x14, 0x1D, 0x1A, 0x1F, 0x1E, 0x1D,
0x1A, 0x1C, 0x1C, 0x20, 0x24, 0x2E, 0x27, 0x20, 0x22, 0x2C, 0x23, 0x1C, 0x1C, 0x28,
0x37, 0x29, 0x2C, 0x30, 0x31, 0x34, 0x34, 0x34, 0x1F, 0x27, 0x39, 0x3D, 0x38, 0x32,
0x3C, 0x2E, 0x33, 0x34, 0x32, 0xFF, 0xC0, 0x00, 0x0B, 0x08, 0x00, 0x01, 0x00, 0x01,
0x01, 0x01, 0x11, 0x00, 0xFF, 0xC4, 0x00, 0x1F, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02,
0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xC4, 0x00, 0xB5, 0x10,
0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03, 0x05, 0x05, 0x04, 0x04, 0x00, 0x00,
0x01, 0x7D, 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, 0x23, 0x42,
0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x53, 0x54, 0x55,
0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x73,
0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA,
0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFF, 0xDA, 0x00, 0x08,
0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0xFB, 0xD5, 0xDB, 0x20, 0xA8, 0xF1, 0x4D, 0x9E,
0xBA, 0x79, 0xC5, 0x14, 0x51, 0x40, 0xFF, 0xD9,
];
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).unwrap();
}
let mut file = File::create(path).unwrap();
file.write_all(jpeg_data).unwrap();
}
// The Arrow file lives inside the dataset directory and is named after it.
#[test]
fn test_get_arrow_path() {
let dir = Path::new("/data/my_dataset");
let arrow = get_arrow_path(dir).unwrap();
assert_eq!(arrow, PathBuf::from("/data/my_dataset/my_dataset.arrow"));
}
// The sensor container is a same-named directory nested in the dataset directory.
#[test]
fn test_get_sensor_container_path() {
let dir = Path::new("/data/my_dataset");
let container = get_sensor_container_path(dir).unwrap();
assert_eq!(container, PathBuf::from("/data/my_dataset/my_dataset"));
}
// A stem without a numeric `_N` suffix is a standalone sample with no frame.
#[test]
fn test_parse_image_filename_standalone() {
let root = Path::new("/data");
let path = Path::new("/data/image.jpg");
let (name, frame) = parse_image_filename(path, root, true);
assert_eq!(name, "image");
assert_eq!(frame, None);
}
// The `.camera` part of a `.camera.jpeg` file is stripped from the name.
#[test]
fn test_parse_image_filename_camera_extension() {
let root = Path::new("/data");
let path = Path::new("/data/sample.camera.jpeg");
let (name, frame) = parse_image_filename(path, root, true);
assert_eq!(name, "sample");
assert_eq!(frame, None);
}
// With sequence detection on, `seq_001` splits into name `seq` and frame 1.
#[test]
fn test_parse_image_filename_sequence() {
let root = Path::new("/data");
let path = Path::new("/data/seq/seq_001.camera.jpeg");
let (name, frame) = parse_image_filename(path, root, true);
assert_eq!(name, "seq");
assert_eq!(frame, Some(1));
}
// With sequence detection off, the numeric suffix stays part of the name.
#[test]
fn test_parse_image_filename_no_sequence_detection() {
let root = Path::new("/data");
let path = Path::new("/data/seq/seq_001.camera.jpeg");
let (name, frame) = parse_image_filename(path, root, false);
assert_eq!(name, "seq_001");
assert_eq!(frame, None);
}
// The index holds both the full file name and the cleaned stem for each
// file, including files in subdirectories.
#[test]
fn test_build_file_index() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
create_test_image(&root.join("image1.jpg"));
create_test_image(&root.join("sub/image2.camera.jpeg"));
let index = build_file_index(root).unwrap();
assert!(index.contains_key("image1.jpg"));
assert!(index.contains_key("image2.camera.jpeg"));
assert!(index.contains_key("image1"));
assert!(index.contains_key("image2"));
}
// Lookups succeed for a standalone sample and a sequence frame, and fail
// for a name that is not on disk.
#[test]
fn test_find_matching_file() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
create_test_image(&root.join("sample.camera.jpeg"));
create_test_image(&root.join("seq/seq_001.camera.jpeg"));
let index = build_file_index(root).unwrap();
let found = find_matching_file(&index, "sample", None);
assert!(found.is_some());
let found = find_matching_file(&index, "seq", Some(1));
assert!(found.is_some());
let found = find_matching_file(&index, "nonexistent", None);
assert!(found.is_none());
}
// Four files on disk produce a four-row, two-column (`name`, `frame`) Arrow file.
#[cfg(feature = "polars")]
#[test]
fn test_generate_arrow_from_folder() {
use polars::prelude::*;
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
let images_dir = root.join("images");
create_test_image(&images_dir.join("photo1.jpg"));
create_test_image(&images_dir.join("photo2.png"));
create_test_image(&images_dir.join("seq/seq_001.camera.jpeg"));
create_test_image(&images_dir.join("seq/seq_002.camera.jpeg"));
let arrow_path = root.join("output.arrow");
let count = generate_arrow_from_folder(&images_dir, &arrow_path, true).unwrap();
assert_eq!(count, 4);
assert!(arrow_path.exists());
// Read the file back to check its shape and column names.
let mut file = File::open(&arrow_path).unwrap();
let df = IpcReader::new(&mut file).finish().unwrap();
assert_eq!(df.height(), 4);
assert_eq!(df.width(), 2); assert!(df.column("name").is_ok());
assert!(df.column("frame").is_ok());
}
// Each distinct name in the Arrow file gets one entry in the resolved map.
#[cfg(feature = "polars")]
#[test]
fn test_resolve_arrow_files() {
use polars::prelude::*;
use std::io::BufWriter;
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
let names = Series::new("name".into(), &["sample1", "sample2", "seq"]);
let frames: Vec<Option<u64>> = vec![None, None, Some(1)];
let frame_series = Series::new("frame".into(), &frames);
let mut df = DataFrame::new_infer_height(vec![names.into(), frame_series.into()]).unwrap();
let arrow_path = root.join("test.arrow");
let file = File::create(&arrow_path).unwrap();
let writer = BufWriter::new(file);
IpcWriter::new(writer).finish(&mut df).unwrap();
let resolved = resolve_arrow_files(&arrow_path).unwrap();
assert_eq!(resolved.len(), 3);
assert!(resolved.contains_key("sample1"));
assert!(resolved.contains_key("sample2"));
assert!(resolved.contains_key("seq"));
}
// A dataset whose only sample has a matching file reports no missing files.
// Other issue kinds are not asserted on here.
#[cfg(feature = "polars")]
#[test]
fn test_validate_dataset_structure_valid() {
use polars::prelude::*;
use std::io::BufWriter;
let temp_dir = TempDir::new().unwrap();
let dataset_dir = temp_dir.path().join("my_dataset");
std::fs::create_dir_all(&dataset_dir).unwrap();
let names = Series::new("name".into(), &["image1"]);
let frames: Vec<Option<u64>> = vec![None];
let frame_series = Series::new("frame".into(), &frames);
let mut df = DataFrame::new_infer_height(vec![names.into(), frame_series.into()]).unwrap();
let arrow_path = dataset_dir.join("my_dataset.arrow");
let file = File::create(&arrow_path).unwrap();
let writer = BufWriter::new(file);
IpcWriter::new(writer).finish(&mut df).unwrap();
let container = dataset_dir.join("my_dataset");
create_test_image(&container.join("image1.camera.jpeg"));
let issues = validate_dataset_structure(&dataset_dir).unwrap();
let missing_files: Vec<_> = issues
.iter()
.filter(|i| matches!(i, ValidationIssue::MissingFile { .. }))
.collect();
assert!(
missing_files.is_empty(),
"Unexpected missing files: {:?}",
missing_files
);
}
// An empty dataset directory yields exactly one issue: the missing Arrow file.
// Validation stops there, so the absent container is not reported as well.
#[cfg(feature = "polars")]
#[test]
fn test_validate_dataset_structure_missing_arrow() {
let temp_dir = TempDir::new().unwrap();
let dataset_dir = temp_dir.path().join("my_dataset");
std::fs::create_dir_all(&dataset_dir).unwrap();
let issues = validate_dataset_structure(&dataset_dir).unwrap();
assert_eq!(issues.len(), 1);
assert!(matches!(
&issues[0],
ValidationIssue::MissingArrowFile { .. }
));
}
// Spot-check that plain and compound suffixes are in the extension list.
#[test]
fn test_image_extensions() {
assert!(IMAGE_EXTENSIONS.contains(&"jpg"));
assert!(IMAGE_EXTENSIONS.contains(&"jpeg"));
assert!(IMAGE_EXTENSIONS.contains(&"png"));
assert!(IMAGE_EXTENSIONS.contains(&"camera.jpeg"));
}
// The Display output mentions both the sample name and the expected path.
#[test]
fn test_validation_issue_display() {
let issue = ValidationIssue::MissingFile {
name: "test".to_string(),
expected: PathBuf::from("test.jpg"),
};
let display = format!("{}", issue);
assert!(display.contains("test"));
assert!(display.contains("test.jpg"));
}
}