Skip to main content

edgefirst_client/
format.rs

1//! EdgeFirst Dataset Format utilities.
2//!
3//! This module provides tools for working with the EdgeFirst Dataset Format
4//! as documented in DATASET_FORMAT.md. It enables:
5//!
6//! - Reading and resolving file paths from Arrow annotation files
7//! - Generating Arrow files from folders of images (with null annotations)
8//! - Validating dataset directory structures
9//! - (Future) Converting from other formats (COCO, DarkNet, YOLO, etc.)
10//!
11//! # EdgeFirst Dataset Format
12//!
13//! A dataset in EdgeFirst format consists of:
14//! - An Arrow file (`{dataset_name}.arrow`) containing annotation metadata
15//! - A sensor container directory (`{dataset_name}/`) with image/sensor files
16//!
17//! ## Supported Structures
18//!
19//! **Sequence-based** (frame column is not null):
20//! ```text
21//! dataset_name/
22//! ├── dataset_name.arrow
23//! └── dataset_name/
24//!     └── sequence_name/
25//!         ├── sequence_name_001.camera.jpeg
26//!         └── sequence_name_002.camera.jpeg
27//! ```
28//!
29//! **Image-based** (frame column is null):
30//! ```text
31//! dataset_name/
32//! ├── dataset_name.arrow
33//! └── dataset_name/
34//!     ├── image1.jpg
35//!     └── image2.png
36//! ```
37//!
38//! # Example
39//!
40//! ```rust,no_run
41//! use edgefirst_client::format::{resolve_arrow_files, validate_dataset_structure};
42//! use std::path::Path;
43//!
44//! // Resolve all files referenced by an Arrow file
45//! let arrow_path = Path::new("my_dataset/my_dataset.arrow");
46//! let files = resolve_arrow_files(arrow_path)?;
47//! for (name, path) in &files {
48//!     println!("{}: {:?}", name, path);
49//! }
50//!
51//! // Validate the dataset structure
52//! let issues = validate_dataset_structure(Path::new("my_dataset"))?;
53//! if !issues.is_empty() {
54//!     for issue in &issues {
55//!         eprintln!("Warning: {}", issue);
56//!     }
57//! }
58//! # Ok::<(), edgefirst_client::Error>(())
59//! ```
60
61use std::{
62    collections::HashMap,
63    fs::File,
64    path::{Path, PathBuf},
65};
66
67use walkdir::WalkDir;
68
69use crate::Error;
70
/// Image file extensions supported by EdgeFirst.
///
/// Entries are lowercase and written without a leading dot. Besides the
/// plain extensions, the list contains compound `camera.*` suffixes so that
/// a file such as `sample.camera.jpeg` can be addressed by appending an
/// entry to the sample name. Code in this module that probes for a file
/// walks the list front to back, so earlier entries take precedence.
pub const IMAGE_EXTENSIONS: &[&str] = &[
    "jpg",
    "jpeg",
    "png",
    "camera.jpeg",
    "camera.png",
    "camera.jpg",
];
80
/// Resolve the file paths referenced by an Arrow annotation file.
///
/// Reads the Arrow IPC file, then walks the `name` column (and the `frame`
/// column, when present) to compute the relative path each sample is
/// expected to have inside the sensor container. No file system lookup is
/// performed; the paths are derived purely from the column values.
///
/// # Arguments
///
/// * `arrow_path` - Path to the Arrow annotation file
///
/// # Returns
///
/// A map keyed by the value of the `name` column. The value is the expected
/// relative path within the sensor container:
///
/// * frame present: `{name}/{name}_{frame:03}.camera.jpeg`
///   (e.g. `"deer"` with frame 1 -> `"deer/deer_001.camera.jpeg"`)
/// * frame absent or null: `{name}.camera.jpeg`
///
/// Rows with a null name are skipped. Only the first row seen for a given
/// name produces an entry, so a sequence whose rows share one name
/// contributes a single path (the one for its first listed frame).
///
/// The `frame` column is only read when it is stored as `u64` or `u32`;
/// any other type is treated as if the frame were null.
///
/// # Errors
///
/// Returns an error if:
/// * Arrow file cannot be opened or read
/// * Arrow file is missing the `name` column
/// * The `name` column is not a string column
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::resolve_arrow_files;
/// use std::path::Path;
///
/// let arrow_path = Path::new("dataset/dataset.arrow");
/// let files = resolve_arrow_files(arrow_path)?;
///
/// for (name, relative_path) in &files {
///     println!("Sample '{}' -> {:?}", name, relative_path);
/// }
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn resolve_arrow_files(arrow_path: &Path) -> Result<HashMap<String, PathBuf>, Error> {
    use polars::prelude::*;

    let mut reader = File::open(arrow_path).map_err(|e| {
        Error::InvalidParameters(format!("Cannot open Arrow file {:?}: {}", arrow_path, e))
    })?;

    let table = IpcReader::new(&mut reader).finish().map_err(|e| {
        Error::InvalidParameters(format!("Failed to read Arrow file {:?}: {}", arrow_path, e))
    })?;

    // The name column is mandatory and must hold strings.
    let name_column = table
        .column("name")
        .map_err(|e| Error::InvalidParameters(format!("Missing 'name' column: {}", e)))?
        .str()
        .map_err(|e| Error::InvalidParameters(format!("Invalid 'name' column type: {}", e)))?;

    // The frame column is optional. Downcast it once up front; at most one of
    // these two typed views is `Some`, depending on the stored integer width.
    let frame_column = table.column("frame").ok();
    let frames_u64 = frame_column.and_then(|col| col.u64().ok());
    let frames_u32 = frame_column.and_then(|col| col.u32().ok());

    let mut resolved: HashMap<String, PathBuf> = HashMap::new();

    for row in 0..table.height() {
        // Rows with a null name carry no sample to resolve.
        let sample = match name_column.get(row) {
            Some(value) => value,
            None => continue,
        };

        // Only the first row seen for a given name contributes an entry.
        if resolved.contains_key(sample) {
            continue;
        }

        // A non-null frame marks the row as part of a sequence.
        let frame = frames_u64
            .and_then(|values| values.get(row))
            .or_else(|| frames_u32.and_then(|values| values.get(row)).map(u64::from));

        let relative_path = match frame {
            // Sequence layout: the name doubles as the sequence directory.
            Some(frame_num) => {
                Path::new(sample).join(format!("{}_{:03}.camera.jpeg", sample, frame_num))
            }
            // Standalone layout: file sits directly in the container.
            None => PathBuf::from(format!("{}.camera.jpeg", sample)),
        };

        resolved.insert(sample.to_owned(), relative_path);
    }

    Ok(resolved)
}
177
/// Information about a resolved sample file.
///
/// One value describes a single `(name, frame)` pair taken from an Arrow
/// annotation file together with the outcome of looking for its file in a
/// sensor container. Values can be compared with `==`, which makes them
/// convenient to use in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ResolvedFile {
    /// Sample name from the Arrow file
    pub name: String,
    /// Frame number (None for standalone images)
    pub frame: Option<u64>,
    /// Actual file path on disk (`None` when no matching file was found)
    pub path: Option<PathBuf>,
    /// Expected relative path within sensor container
    pub expected_path: PathBuf,
}
190
/// Resolve Arrow file references against actual files in a sensor container.
///
/// This function reads an Arrow file, extracts sample references, and attempts
/// to match them against actual files in the sensor container directory.
///
/// # Arguments
///
/// * `arrow_path` - Path to the Arrow annotation file
/// * `sensor_container` - Path to the sensor container directory
///
/// # Returns
///
/// One [`ResolvedFile`] per distinct `(name, frame)` pair, in the order the
/// pairs first appear in the Arrow file. Rows with a null name are skipped.
/// `path` is `None` when no matching file was found in the container;
/// `expected_path` always holds the canonical relative location
/// (`{name}/{name}_{frame:03}.camera.jpeg` for sequence rows,
/// `{name}.camera.jpeg` otherwise).
///
/// The `frame` column is only read when it is stored as `u64` or `u32`;
/// any other type is treated as if the frame were null.
///
/// # Errors
///
/// Returns an error if:
/// * Arrow file cannot be opened or read
/// * Arrow file is missing the `name` column
/// * The `name` column is not a string column
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::resolve_files_with_container;
/// use std::path::Path;
///
/// let resolved = resolve_files_with_container(
///     Path::new("dataset/dataset.arrow"),
///     Path::new("dataset/dataset"),
/// )?;
///
/// for file in &resolved {
///     match &file.path {
///         Some(p) => println!("Found: {} -> {:?}", file.name, p),
///         None => println!("Missing: {} (expected {:?})", file.name, file.expected_path),
///     }
/// }
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn resolve_files_with_container(
    arrow_path: &Path,
    sensor_container: &Path,
) -> Result<Vec<ResolvedFile>, Error> {
    use polars::prelude::*;

    let mut file = File::open(arrow_path).map_err(|e| {
        Error::InvalidParameters(format!("Cannot open Arrow file {:?}: {}", arrow_path, e))
    })?;

    let df = IpcReader::new(&mut file).finish().map_err(|e| {
        Error::InvalidParameters(format!("Failed to read Arrow file {:?}: {}", arrow_path, e))
    })?;

    // Build an index of all files in the sensor container
    let file_index = build_file_index(sensor_container)?;

    // Get the name column (required)
    let names = df
        .column("name")
        .map_err(|e| Error::InvalidParameters(format!("Missing 'name' column: {}", e)))?
        .str()
        .map_err(|e| Error::InvalidParameters(format!("Invalid 'name' column type: {}", e)))?;

    // Get the frame column (optional). Downcast once instead of per row; at
    // most one of the two typed views is `Some`.
    let frames = df.column("frame").ok();
    let frames_u64 = frames.and_then(|col| col.u64().ok());
    let frames_u32 = frames.and_then(|col| col.u32().ok());

    let mut result = Vec::new();

    // Deduplicate on the structured (name, frame) pair. A flattened string
    // key such as "name_frame" would conflate the standalone sample "seq_1"
    // with frame 1 of the sequence "seq".
    let mut seen_samples: std::collections::HashSet<(String, Option<u64>)> =
        std::collections::HashSet::new();

    for idx in 0..df.height() {
        let name = match names.get(idx) {
            Some(n) => n.to_string(),
            None => continue,
        };

        let frame = frames_u64
            .and_then(|s| s.get(idx))
            .or_else(|| frames_u32.and_then(|s| s.get(idx)).map(u64::from));

        // `insert` returns false when the pair was already recorded.
        if !seen_samples.insert((name.clone(), frame)) {
            continue;
        }

        // Build expected path and try to find actual file
        let expected_path = match frame {
            Some(frame_num) => {
                PathBuf::from(&name).join(format!("{}_{:03}.camera.jpeg", name, frame_num))
            }
            None => PathBuf::from(format!("{}.camera.jpeg", name)),
        };

        // Try to find the actual file using flexible matching
        let actual_path = find_matching_file(&file_index, &name, frame);

        result.push(ResolvedFile {
            name,
            frame,
            path: actual_path,
            expected_path,
        });
    }

    Ok(result)
}
300
301/// Build an index of all files in a directory for fast lookup.
302fn build_file_index(root: &Path) -> Result<HashMap<String, PathBuf>, Error> {
303    let mut index = HashMap::new();
304
305    if !root.exists() {
306        return Ok(index);
307    }
308
309    for entry in WalkDir::new(root)
310        .into_iter()
311        .filter_map(|e| e.ok())
312        .filter(|e| e.file_type().is_file())
313    {
314        let path = entry.path().to_path_buf();
315        if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
316            // Index by full filename
317            index.insert(filename.to_lowercase(), path.clone());
318
319            // Also index by stem (without extension) for flexible matching
320            if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
321                // Handle double extensions like .camera.jpeg
322                let clean_stem = stem.strip_suffix(".camera").unwrap_or(stem).to_lowercase();
323                index.entry(clean_stem).or_insert_with(|| path.clone());
324            }
325        }
326    }
327
328    Ok(index)
329}
330
331/// Find a matching file in the index using flexible matching.
332fn find_matching_file(
333    index: &HashMap<String, PathBuf>,
334    name: &str,
335    frame: Option<u64>,
336) -> Option<PathBuf> {
337    let search_key = match frame {
338        Some(f) => format!("{}_{:03}", name, f).to_lowercase(),
339        None => name.to_lowercase(),
340    };
341
342    // Try exact filename match first
343    for ext in IMAGE_EXTENSIONS {
344        let key = format!("{}.{}", search_key, ext);
345        if let Some(path) = index.get(&key) {
346            return Some(path.clone());
347        }
348    }
349
350    // Try stem match
351    if let Some(path) = index.get(&search_key) {
352        return Some(path.clone());
353    }
354
355    None
356}
357
/// A single problem detected while checking a dataset directory.
///
/// The [`std::fmt::Display`] implementation renders each variant as a short
/// one-line message; paths are printed in their `Debug` (quoted) form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationIssue {
    /// The Arrow annotation file was not found; `expected` is where it was looked for.
    MissingArrowFile { expected: PathBuf },
    /// The sensor container directory was not found; `expected` is where it was looked for.
    MissingSensorContainer { expected: PathBuf },
    /// A sample named in the Arrow file has no matching file; `expected` is its expected path.
    MissingFile { name: String, expected: PathBuf },
    /// A file exists in the container that no sample refers to.
    UnreferencedFile { path: PathBuf },
    /// The directory layout is invalid; `message` describes the problem.
    InvalidStructure { message: String },
}

impl std::fmt::Display for ValidationIssue {
    /// Write the one-line, human-readable description of the issue.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::MissingArrowFile { expected } => write!(f, "Missing Arrow file: {:?}", expected),
            Self::MissingSensorContainer { expected } => {
                write!(f, "Missing sensor container directory: {:?}", expected)
            }
            Self::MissingFile { name, expected } => {
                write!(f, "Missing file for sample '{}': {:?}", name, expected)
            }
            Self::UnreferencedFile { path } => {
                write!(f, "Unreferenced file in container: {:?}", path)
            }
            Self::InvalidStructure { message } => write!(f, "Invalid structure: {}", message),
        }
    }
}
394
/// Validate the structure of a dataset directory.
///
/// Checks that the directory follows the EdgeFirst Dataset Format:
/// - Arrow file exists at `{dataset_dir}/{dataset_name}.arrow` and is a
///   regular file
/// - Sensor container exists at `{dataset_dir}/{dataset_name}/` and is a
///   directory
/// - All files referenced in Arrow file exist in container
/// - Reports any unreferenced sensor files (`jpg`, `jpeg`, `png`, `pcd`,
///   `bin`, matched case-insensitively on the last extension)
///
/// `dataset_name` is the final component of `dataset_dir`.
///
/// # Arguments
///
/// * `dataset_dir` - Path to the snapshot root directory
///
/// # Returns
///
/// A list of validation issues (empty if valid). When the Arrow file or the
/// sensor container is missing, the list contains only that single issue
/// because the remaining checks cannot run.
///
/// # Errors
///
/// Returns an error if `dataset_dir` has no final path component that is
/// valid UTF-8 (for example `..` or `/`), or if the Arrow file exists but
/// cannot be read.
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::validate_dataset_structure;
/// use std::path::Path;
///
/// let issues = validate_dataset_structure(Path::new("my_dataset"))?;
/// if issues.is_empty() {
///     println!("Dataset structure is valid!");
/// } else {
///     for issue in &issues {
///         eprintln!("Issue: {}", issue);
///     }
/// }
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn validate_dataset_structure(dataset_dir: &Path) -> Result<Vec<ValidationIssue>, Error> {
    let mut issues = Vec::new();

    // Get the dataset name from the directory name
    let dataset_name = dataset_dir
        .file_name()
        .and_then(|n| n.to_str())
        .ok_or_else(|| Error::InvalidParameters("Invalid dataset directory path".to_owned()))?;

    // Check for Arrow file. `is_file` (rather than `exists`) also rejects a
    // directory that merely carries the expected name.
    let arrow_path = dataset_dir.join(format!("{}.arrow", dataset_name));
    if !arrow_path.is_file() {
        issues.push(ValidationIssue::MissingArrowFile {
            expected: arrow_path,
        });
        // Can't continue validation without Arrow file
        return Ok(issues);
    }

    // Check for sensor container. `is_dir` (rather than `exists`) also
    // rejects a regular file sitting where the directory should be.
    let container_path = dataset_dir.join(dataset_name);
    if !container_path.is_dir() {
        issues.push(ValidationIssue::MissingSensorContainer {
            expected: container_path,
        });
        // Can't continue validation without container
        return Ok(issues);
    }

    // Resolve files and check for missing ones
    let resolved = resolve_files_with_container(&arrow_path, &container_path)?;

    // Track which files were referenced
    let mut referenced_files: std::collections::HashSet<PathBuf> =
        std::collections::HashSet::new();

    for file in &resolved {
        match &file.path {
            Some(path) => {
                referenced_files.insert(path.clone());
            }
            None => {
                issues.push(ValidationIssue::MissingFile {
                    name: file.name.clone(),
                    expected: file.expected_path.clone(),
                });
            }
        }
    }

    // Find unreferenced files in container
    for entry in WalkDir::new(&container_path)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
    {
        let path = entry.path();

        // Only sensor payload files are reported; anything else found in the
        // container is ignored.
        let is_sensor_file = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| {
                matches!(
                    e.to_lowercase().as_str(),
                    "jpg" | "jpeg" | "png" | "pcd" | "bin"
                )
            })
            .unwrap_or(false);

        if is_sensor_file && !referenced_files.contains(path) {
            issues.push(ValidationIssue::UnreferencedFile {
                path: path.to_path_buf(),
            });
        }
    }

    Ok(issues)
}
504
/// Generate an Arrow file from a folder of images.
///
/// Scans the folder recursively for sensor files and creates an Arrow
/// annotation file with one row per file and no annotation columns (for
/// unannotated datasets). This is useful for importing existing image
/// collections into EdgeFirst.
///
/// Files are selected by their last extension, case-insensitively: `jpg`,
/// `jpeg`, `png`, `pcd` and `bin`. Rows are written in sorted path order so
/// that the same folder always produces the same Arrow file, independent of
/// the order in which the file system lists entries.
///
/// # Arguments
///
/// * `folder` - Path to the folder containing images
/// * `output` - Path where the Arrow file should be written; missing parent
///   directories are created
/// * `detect_sequences` - If true, attempt to detect sequences from naming
///   patterns
///
/// # Returns
///
/// The number of samples (files) included in the Arrow file.
///
/// # Errors
///
/// Returns an error if no matching file is found in `folder`, if the output
/// directory or file cannot be created, or if writing the Arrow file fails.
///
/// # Sequence Detection
///
/// When `detect_sequences` is true, the function looks for patterns like:
/// - `{name}_{number}.{ext}` → sequence with frame number
/// - `{sequence}/{name}_{number}.{ext}` → sequence in subdirectory
///
/// In both cases the sample name is the `{name}` part of the file name.
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::generate_arrow_from_folder;
/// use std::path::Path;
///
/// // Generate Arrow file from images
/// let count = generate_arrow_from_folder(
///     Path::new("my_images"),
///     Path::new("my_dataset/my_dataset.arrow"),
///     true, // detect sequences
/// )?;
/// println!("Created Arrow file with {} samples", count);
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn generate_arrow_from_folder(
    folder: &Path,
    output: &Path,
    detect_sequences: bool,
) -> Result<usize, Error> {
    use polars::prelude::*;
    use std::io::BufWriter;

    // Collect all image files
    let mut image_files: Vec<PathBuf> = WalkDir::new(folder)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
        .filter(|e| {
            e.path()
                .extension()
                .and_then(|ext| ext.to_str())
                .map(|ext| {
                    matches!(
                        ext.to_lowercase().as_str(),
                        "jpg" | "jpeg" | "png" | "pcd" | "bin"
                    )
                })
                .unwrap_or(false)
        })
        .map(|e| e.path().to_path_buf())
        .collect();

    if image_files.is_empty() {
        return Err(Error::InvalidParameters(
            "No image files found in folder".to_owned(),
        ));
    }

    // Directory walk order depends on the file system; sort so the row order
    // of the generated file is reproducible.
    image_files.sort();

    // Parse each image file to extract name and frame
    let (names, frames): (Vec<String>, Vec<Option<u64>>) = image_files
        .iter()
        .map(|path| parse_image_filename(path, folder, detect_sequences))
        .unzip();

    // Build the DataFrame with the 2026.04 schema — only emit name and frame
    // columns (no null geometry columns; per the column-presence = data-intent
    // rule, absent columns mean no data of that type).
    let name_series = Series::new("name".into(), &names);
    let frame_series = Series::new("frame".into(), &frames);

    let mut df = DataFrame::new_infer_height(vec![name_series.into(), frame_series.into()])?;

    // Create output directory if needed
    if let Some(parent) = output.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Write the Arrow file
    let file = File::create(output)?;
    let writer = BufWriter::new(file);
    IpcWriter::new(writer)
        .finish(&mut df)
        .map_err(|e| Error::InvalidParameters(format!("Failed to write Arrow file: {}", e)))?;

    Ok(image_files.len())
}
610
/// Parse an image filename to extract sample name and frame number.
///
/// The file stem (file name without its last extension) is taken from
/// `path` and a trailing `.camera` is removed, so `seq_001.camera.jpeg`
/// yields the stem `seq_001`. A file name that is not valid UTF-8 is
/// reported under the name `"unknown"`.
///
/// When `detect_sequences` is true and the stem has the form
/// `{name}_{digits}` with a non-empty `{name}`, the result is
/// `(name, Some(frame))`. The frame part must consist of ASCII digits only
/// and fit in a `u64`; signs such as `+5` are not accepted. In every other
/// case the whole cleaned stem is returned with `None`.
///
/// `root` is accepted for call-site compatibility; files directly in the
/// root and files in subdirectories are treated alike, so it does not
/// influence the result.
fn parse_image_filename(path: &Path, root: &Path, detect_sequences: bool) -> (String, Option<u64>) {
    // Directory depth relative to `root` is not part of the heuristic.
    let _ = root;

    let stem = path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("unknown");

    // Remove .camera suffix if present
    let clean_stem = stem.strip_suffix(".camera").unwrap_or(stem);

    if detect_sequences {
        // Sequence pattern: a trailing number separated by the last underscore.
        if let Some((name_part, frame_str)) = clean_stem.rsplit_once('_') {
            // `u64::from_str` alone would accept a leading '+', so require
            // plain digits explicitly.
            let digits_only =
                !frame_str.is_empty() && frame_str.bytes().all(|b| b.is_ascii_digit());

            // An empty name ("_007") cannot identify a sequence.
            if digits_only && !name_part.is_empty() {
                // Parsing can still fail when the number overflows u64; such
                // stems fall through and are treated as standalone names.
                if let Ok(frame) = frame_str.parse::<u64>() {
                    return (name_part.to_string(), Some(frame));
                }
            }
        }
    }

    // No sequence detected
    (clean_stem.to_string(), None)
}
648
/// Compute where the sensor container of a dataset directory should be.
///
/// The container shares its name with the dataset directory, i.e. the
/// result is `{dataset_dir}/{dataset_name}` where `dataset_name` is the
/// final component of `dataset_dir`. The path is only computed; nothing is
/// checked on disk.
///
/// # Arguments
///
/// * `dataset_dir` - Path to the snapshot root directory
///
/// # Returns
///
/// The expected path to the sensor container directory, or `None` when
/// `dataset_dir` has no final component (e.g. `/` or `..`) or that
/// component is not valid UTF-8.
pub fn get_sensor_container_path(dataset_dir: &Path) -> Option<PathBuf> {
    dataset_dir
        .file_name()
        .and_then(|leaf| leaf.to_str())
        .map(|leaf| dataset_dir.join(leaf))
}
662
/// Compute where the Arrow annotation file of a dataset directory should be.
///
/// The file is named after the dataset directory, i.e. the result is
/// `{dataset_dir}/{dataset_name}.arrow` where `dataset_name` is the final
/// component of `dataset_dir`. The path is only computed; nothing is
/// checked on disk.
///
/// # Arguments
///
/// * `dataset_dir` - Path to the snapshot root directory
///
/// # Returns
///
/// The expected path to the Arrow annotation file, or `None` when
/// `dataset_dir` has no final component (e.g. `/` or `..`) or that
/// component is not valid UTF-8.
pub fn get_arrow_path(dataset_dir: &Path) -> Option<PathBuf> {
    let leaf = dataset_dir.file_name().and_then(|n| n.to_str())?;

    let mut arrow_file = dataset_dir.to_path_buf();
    arrow_file.push(format!("{}.arrow", leaf));
    Some(arrow_file)
}
676
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Write a small JPEG fixture to `path`, creating parent directories first.
    fn create_test_image(path: &Path) {
        // Fixed byte stream used as the content of every fixture image
        // (starts with the JPEG SOI marker FF D8 and ends with EOI FF D9).
        let jpeg_data: &[u8] = &[
            0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00,
            0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0xFF, 0xDB, 0x00, 0x43, 0x00, 0x08, 0x06, 0x06,
            0x07, 0x06, 0x05, 0x08, 0x07, 0x07, 0x07, 0x09, 0x09, 0x08, 0x0A, 0x0C, 0x14, 0x0D,
            0x0C, 0x0B, 0x0B, 0x0C, 0x19, 0x12, 0x13, 0x0F, 0x14, 0x1D, 0x1A, 0x1F, 0x1E, 0x1D,
            0x1A, 0x1C, 0x1C, 0x20, 0x24, 0x2E, 0x27, 0x20, 0x22, 0x2C, 0x23, 0x1C, 0x1C, 0x28,
            0x37, 0x29, 0x2C, 0x30, 0x31, 0x34, 0x34, 0x34, 0x1F, 0x27, 0x39, 0x3D, 0x38, 0x32,
            0x3C, 0x2E, 0x33, 0x34, 0x32, 0xFF, 0xC0, 0x00, 0x0B, 0x08, 0x00, 0x01, 0x00, 0x01,
            0x01, 0x01, 0x11, 0x00, 0xFF, 0xC4, 0x00, 0x1F, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01,
            0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02,
            0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xC4, 0x00, 0xB5, 0x10,
            0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03, 0x05, 0x05, 0x04, 0x04, 0x00, 0x00,
            0x01, 0x7D, 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
            0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, 0x23, 0x42,
            0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
            0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37,
            0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x53, 0x54, 0x55,
            0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x73,
            0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
            0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
            0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA,
            0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
            0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
            0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFF, 0xDA, 0x00, 0x08,
            0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0xFB, 0xD5, 0xDB, 0x20, 0xA8, 0xF1, 0x4D, 0x9E,
            0xBA, 0x79, 0xC5, 0x14, 0x51, 0x40, 0xFF, 0xD9,
        ];

        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir).unwrap();
        }
        File::create(path).unwrap().write_all(jpeg_data).unwrap();
    }

    /// The Arrow file is named after the dataset directory.
    #[test]
    fn test_get_arrow_path() {
        let expected = PathBuf::from("/data/my_dataset/my_dataset.arrow");
        let actual = get_arrow_path(Path::new("/data/my_dataset")).unwrap();
        assert_eq!(actual, expected);
    }

    /// The sensor container is a directory named after the dataset directory.
    #[test]
    fn test_get_sensor_container_path() {
        let expected = PathBuf::from("/data/my_dataset/my_dataset");
        let actual = get_sensor_container_path(Path::new("/data/my_dataset")).unwrap();
        assert_eq!(actual, expected);
    }

    /// A plain file name without a numeric suffix is a standalone sample.
    #[test]
    fn test_parse_image_filename_standalone() {
        let parsed =
            parse_image_filename(Path::new("/data/image.jpg"), Path::new("/data"), true);

        assert_eq!(parsed.0, "image");
        assert_eq!(parsed.1, None);
    }

    /// The `.camera` part of a compound extension is not part of the name.
    #[test]
    fn test_parse_image_filename_camera_extension() {
        let parsed = parse_image_filename(
            Path::new("/data/sample.camera.jpeg"),
            Path::new("/data"),
            true,
        );

        assert_eq!(parsed.0, "sample");
        assert_eq!(parsed.1, None);
    }

    /// A `_NNN` suffix is split off as the frame number when detection is on.
    #[test]
    fn test_parse_image_filename_sequence() {
        let parsed = parse_image_filename(
            Path::new("/data/seq/seq_001.camera.jpeg"),
            Path::new("/data"),
            true,
        );

        assert_eq!(parsed.0, "seq");
        assert_eq!(parsed.1, Some(1));
    }

    /// With detection off the numeric suffix stays part of the name.
    #[test]
    fn test_parse_image_filename_no_sequence_detection() {
        let parsed = parse_image_filename(
            Path::new("/data/seq/seq_001.camera.jpeg"),
            Path::new("/data"),
            false,
        );

        assert_eq!(parsed.0, "seq_001");
        assert_eq!(parsed.1, None);
    }

    /// Files are indexed both by full file name and by cleaned stem.
    #[test]
    fn test_build_file_index() {
        let tmp = TempDir::new().unwrap();
        let base = tmp.path();

        create_test_image(&base.join("image1.jpg"));
        create_test_image(&base.join("sub/image2.camera.jpeg"));

        let index = build_file_index(base).unwrap();

        // Full file names first, then stems.
        for key in ["image1.jpg", "image2.camera.jpeg", "image1", "image2"] {
            assert!(index.contains_key(key));
        }
    }

    /// Standalone and sequence samples are found; unknown names are not.
    #[test]
    fn test_find_matching_file() {
        let tmp = TempDir::new().unwrap();
        let base = tmp.path();

        create_test_image(&base.join("sample.camera.jpeg"));
        create_test_image(&base.join("seq/seq_001.camera.jpeg"));

        let index = build_file_index(base).unwrap();

        let standalone = find_matching_file(&index, "sample", None);
        assert!(standalone.is_some());

        let sequence_frame = find_matching_file(&index, "seq", Some(1));
        assert!(sequence_frame.is_some());

        let absent = find_matching_file(&index, "nonexistent", None);
        assert!(absent.is_none());
    }

    /// Generating from a folder writes one row per file with two columns.
    #[cfg(feature = "polars")]
    #[test]
    fn test_generate_arrow_from_folder() {
        use polars::prelude::*;

        let tmp = TempDir::new().unwrap();
        let base = tmp.path();

        // Two standalone images plus a two-frame sequence.
        let source = base.join("images");
        for relative in [
            "photo1.jpg",
            "photo2.png",
            "seq/seq_001.camera.jpeg",
            "seq/seq_002.camera.jpeg",
        ] {
            create_test_image(&source.join(relative));
        }

        let target = base.join("output.arrow");
        let written = generate_arrow_from_folder(&source, &target, true).unwrap();

        assert_eq!(written, 4);
        assert!(target.exists());

        // Read the file back and check its shape.
        let mut handle = File::open(&target).unwrap();
        let table = IpcReader::new(&mut handle).finish().unwrap();

        assert_eq!(table.height(), 4);
        assert_eq!(table.width(), 2); // 2026.04 schema: only name + frame
        assert!(table.column("name").is_ok());
        assert!(table.column("frame").is_ok());
    }

    /// Every distinct name in the Arrow file yields one resolved entry.
    #[cfg(feature = "polars")]
    #[test]
    fn test_resolve_arrow_files() {
        use polars::prelude::*;
        use std::io::BufWriter;

        let tmp = TempDir::new().unwrap();

        // Two standalone samples and one sequence frame.
        let name_series = Series::new("name".into(), &["sample1", "sample2", "seq"]);
        let frame_values: Vec<Option<u64>> = vec![None, None, Some(1)];
        let frame_series = Series::new("frame".into(), &frame_values);

        let mut table =
            DataFrame::new_infer_height(vec![name_series.into(), frame_series.into()]).unwrap();

        let arrow_file = tmp.path().join("test.arrow");
        let sink = BufWriter::new(File::create(&arrow_file).unwrap());
        IpcWriter::new(sink).finish(&mut table).unwrap();

        let resolved = resolve_arrow_files(&arrow_file).unwrap();

        assert_eq!(resolved.len(), 3);
        for key in ["sample1", "sample2", "seq"] {
            assert!(resolved.contains_key(key));
        }
    }

    /// A dataset whose only sample has a matching file reports no missing files.
    #[cfg(feature = "polars")]
    #[test]
    fn test_validate_dataset_structure_valid() {
        use polars::prelude::*;
        use std::io::BufWriter;

        let tmp = TempDir::new().unwrap();
        let dataset_root = tmp.path().join("my_dataset");
        std::fs::create_dir_all(&dataset_root).unwrap();

        // Arrow file with a single standalone sample.
        let name_series = Series::new("name".into(), &["image1"]);
        let frame_values: Vec<Option<u64>> = vec![None];
        let frame_series = Series::new("frame".into(), &frame_values);

        let mut table =
            DataFrame::new_infer_height(vec![name_series.into(), frame_series.into()]).unwrap();

        let arrow_file = dataset_root.join("my_dataset.arrow");
        let sink = BufWriter::new(File::create(&arrow_file).unwrap());
        IpcWriter::new(sink).finish(&mut table).unwrap();

        // Sensor container holding the file the sample refers to.
        let sensor_dir = dataset_root.join("my_dataset");
        create_test_image(&sensor_dir.join("image1.camera.jpeg"));

        let issues = validate_dataset_structure(&dataset_root).unwrap();

        // Should have no missing file issues
        let missing_files: Vec<_> = issues
            .iter()
            .filter(|issue| matches!(issue, ValidationIssue::MissingFile { .. }))
            .collect();
        assert!(
            missing_files.is_empty(),
            "Unexpected missing files: {:?}",
            missing_files
        );
    }

    /// Without an Arrow file, validation stops after reporting exactly that.
    #[cfg(feature = "polars")]
    #[test]
    fn test_validate_dataset_structure_missing_arrow() {
        let tmp = TempDir::new().unwrap();
        let dataset_root = tmp.path().join("my_dataset");
        std::fs::create_dir_all(&dataset_root).unwrap();

        let issues = validate_dataset_structure(&dataset_root).unwrap();

        assert_eq!(issues.len(), 1);
        assert!(matches!(
            &issues[0],
            ValidationIssue::MissingArrowFile { .. }
        ));
    }

    /// The extension list contains the common plain and compound entries.
    #[test]
    fn test_image_extensions() {
        for ext in ["jpg", "jpeg", "png", "camera.jpeg"] {
            assert!(IMAGE_EXTENSIONS.contains(&ext));
        }
    }

    /// The rendered message mentions both the sample name and the path.
    #[test]
    fn test_validation_issue_display() {
        let rendered = ValidationIssue::MissingFile {
            name: "test".to_string(),
            expected: PathBuf::from("test.jpg"),
        }
        .to_string();

        assert!(rendered.contains("test"));
        assert!(rendered.contains("test.jpg"));
    }
}