edgefirst_client/
format.rs

1//! EdgeFirst Dataset Format utilities.
2//!
3//! This module provides tools for working with the EdgeFirst Dataset Format
4//! as documented in DATASET_FORMAT.md. It enables:
5//!
6//! - Reading and resolving file paths from Arrow annotation files
7//! - Generating Arrow files from folders of images (with null annotations)
8//! - Validating dataset directory structures
9//! - (Future) Converting from other formats (COCO, DarkNet, YOLO, etc.)
10//!
11//! # EdgeFirst Dataset Format
12//!
13//! A dataset in EdgeFirst format consists of:
14//! - An Arrow file (`{dataset_name}.arrow`) containing annotation metadata
15//! - A sensor container directory (`{dataset_name}/`) with image/sensor files
16//!
17//! ## Supported Structures
18//!
19//! **Sequence-based** (frame column is not null):
20//! ```text
21//! dataset_name/
22//! ├── dataset_name.arrow
23//! └── dataset_name/
24//!     └── sequence_name/
25//!         ├── sequence_name_001.camera.jpeg
26//!         └── sequence_name_002.camera.jpeg
27//! ```
28//!
29//! **Image-based** (frame column is null):
30//! ```text
31//! dataset_name/
32//! ├── dataset_name.arrow
33//! └── dataset_name/
34//!     ├── image1.jpg
35//!     └── image2.png
36//! ```
37//!
38//! # Example
39//!
40//! ```rust,no_run
41//! use edgefirst_client::format::{resolve_arrow_files, validate_dataset_structure};
42//! use std::path::Path;
43//!
44//! // Resolve all files referenced by an Arrow file
45//! let arrow_path = Path::new("my_dataset/my_dataset.arrow");
46//! let files = resolve_arrow_files(arrow_path)?;
47//! for (name, path) in &files {
48//!     println!("{}: {:?}", name, path);
49//! }
50//!
51//! // Validate the dataset structure
52//! let issues = validate_dataset_structure(Path::new("my_dataset"))?;
53//! if !issues.is_empty() {
54//!     for issue in &issues {
55//!         eprintln!("Warning: {}", issue);
56//!     }
57//! }
58//! # Ok::<(), edgefirst_client::Error>(())
59//! ```
60
61use std::{
62    collections::HashMap,
63    fs::File,
64    path::{Path, PathBuf},
65};
66
67use walkdir::WalkDir;
68
69use crate::Error;
70
/// File-name suffixes (without the leading dot) recognised as camera images.
///
/// The list contains both plain extensions (`jpg`) and the double extensions
/// used by EdgeFirst sensor files (`camera.jpeg`). File lookups in this module
/// try the entries in the order they are listed here.
pub const IMAGE_EXTENSIONS: &[&str] = &[
    "jpg", "jpeg", "png",
    "camera.jpeg", "camera.png", "camera.jpg",
];
80
/// Resolve the file paths referenced by an Arrow annotation file.
///
/// Reads the Arrow file and uses the `name` column (required) and the `frame`
/// column (optional) to derive the relative path at which each sample's image
/// is expected inside the sensor container. No file system lookup of the
/// images themselves is performed.
///
/// # Arguments
///
/// * `arrow_path` - Path to the Arrow annotation file
///
/// # Returns
///
/// A map keyed by the value of the `name` column. The value is the expected
/// relative path:
///
/// * `name/name_NNN.camera.jpeg` when the row has a non-null `frame`
///   (frame number zero-padded to three digits), or
/// * `name.camera.jpeg` when the frame is null or the column is absent.
///
/// Rows with a null name are skipped. Only the first row seen for each name is
/// kept, so a sequence with several frames contributes a single entry (the
/// one for its first row).
///
/// # Errors
///
/// Returns an error if:
/// * Arrow file cannot be opened or read
/// * Arrow file is missing the `name` column
/// * The `name` column is not a string column
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::resolve_arrow_files;
/// use std::path::Path;
///
/// let arrow_path = Path::new("dataset/dataset.arrow");
/// let files = resolve_arrow_files(arrow_path)?;
///
/// for (name, relative_path) in &files {
///     println!("Sample '{}' -> {:?}", name, relative_path);
/// }
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn resolve_arrow_files(arrow_path: &Path) -> Result<HashMap<String, PathBuf>, Error> {
    use polars::prelude::*;

    let mut file = File::open(arrow_path).map_err(|e| {
        Error::InvalidParameters(format!("Cannot open Arrow file {:?}: {}", arrow_path, e))
    })?;

    let df = IpcReader::new(&mut file).finish().map_err(|e| {
        Error::InvalidParameters(format!("Failed to read Arrow file {:?}: {}", arrow_path, e))
    })?;

    // Get the name column (required)
    let names = df
        .column("name")
        .map_err(|e| Error::InvalidParameters(format!("Missing 'name' column: {}", e)))?
        .str()
        .map_err(|e| Error::InvalidParameters(format!("Invalid 'name' column type: {}", e)))?;

    // Get the frame column (optional - determines sequence vs standalone).
    // The downcast is loop-invariant, so do it once: u64 is tried first, then
    // u32. Any other dtype leaves both views as `None` and every row is
    // treated as standalone.
    let frame_column = df.column("frame").ok();
    let frames_u64 = frame_column.and_then(|col| col.u64().ok());
    let frames_u32 = frame_column.and_then(|col| col.u32().ok());

    let mut result: HashMap<String, PathBuf> = HashMap::new();

    for idx in 0..df.height() {
        // Skip rows whose name is null
        let name = match names.get(idx) {
            Some(n) => n,
            None => continue,
        };

        // Skip if this name was already processed. Checking with the borrowed
        // `&str` avoids allocating a `String` for every duplicate row.
        if result.contains_key(name) {
            continue;
        }

        // A non-null frame marks the row as part of a sequence
        let frame = frames_u64
            .and_then(|s| s.get(idx))
            .or_else(|| frames_u32.and_then(|s| s.get(idx).map(u64::from)));

        let relative_path = match frame {
            // Sequence: name/name_frame.camera.jpeg (the name column holds
            // the sequence name)
            Some(frame_num) => {
                PathBuf::from(name).join(format!("{}_{:03}.camera.jpeg", name, frame_num))
            }
            // Standalone: the canonical extension is assumed here; the real
            // extension on disk may differ
            None => PathBuf::from(format!("{}.camera.jpeg", name)),
        };

        result.insert(name.to_string(), relative_path);
    }

    Ok(result)
}
177
/// Outcome of matching one Arrow sample against the sensor container.
#[derive(Clone, Debug)]
pub struct ResolvedFile {
    /// Value of the `name` column for this sample.
    pub name: String,
    /// Frame number for sequence samples; `None` for standalone images.
    pub frame: Option<u64>,
    /// Path of the matching file on disk, or `None` when nothing matched.
    pub path: Option<PathBuf>,
    /// Relative path inside the sensor container where the file was expected.
    pub expected_path: PathBuf,
}
190
/// Resolve Arrow file references against actual files in a sensor container.
///
/// Reads the Arrow file, extracts one entry per distinct `(name, frame)` pair
/// and tries to match each of them against the files found (recursively)
/// under the sensor container directory. Matching is case-insensitive and
/// tolerant of the image extension used on disk.
///
/// # Arguments
///
/// * `arrow_path` - Path to the Arrow annotation file
/// * `sensor_container` - Path to the sensor container directory
///
/// # Returns
///
/// One [`ResolvedFile`] per distinct `(name, frame)` pair, in the order of
/// first appearance in the Arrow file. `path` is `None` when no matching file
/// was found. Rows with a null name are skipped. A sensor container that does
/// not exist is treated as empty, so every entry is reported as unmatched.
///
/// # Errors
///
/// Returns an error if the Arrow file cannot be opened or read, if the `name`
/// column is missing, or if it is not a string column.
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::resolve_files_with_container;
/// use std::path::Path;
///
/// let resolved = resolve_files_with_container(
///     Path::new("dataset/dataset.arrow"),
///     Path::new("dataset/dataset"),
/// )?;
///
/// for file in &resolved {
///     match &file.path {
///         Some(p) => println!("Found: {} -> {:?}", file.name, p),
///         None => println!("Missing: {} (expected {:?})", file.name, file.expected_path),
///     }
/// }
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn resolve_files_with_container(
    arrow_path: &Path,
    sensor_container: &Path,
) -> Result<Vec<ResolvedFile>, Error> {
    use polars::prelude::*;

    let mut file = File::open(arrow_path).map_err(|e| {
        Error::InvalidParameters(format!("Cannot open Arrow file {:?}: {}", arrow_path, e))
    })?;

    let df = IpcReader::new(&mut file).finish().map_err(|e| {
        Error::InvalidParameters(format!("Failed to read Arrow file {:?}: {}", arrow_path, e))
    })?;

    // Build an index of all files in the sensor container
    let file_index = build_file_index(sensor_container)?;

    // Get the name column (required)
    let names = df
        .column("name")
        .map_err(|e| Error::InvalidParameters(format!("Missing 'name' column: {}", e)))?
        .str()
        .map_err(|e| Error::InvalidParameters(format!("Invalid 'name' column type: {}", e)))?;

    // Get the frame column (optional). The downcast does not depend on the
    // row, so it is done once: u64 is tried first, then u32.
    let frame_column = df.column("frame").ok();
    let frames_u64 = frame_column.and_then(|col| col.u64().ok());
    let frames_u32 = frame_column.and_then(|col| col.u32().ok());

    let mut result = Vec::new();
    // Deduplicate on the structured (name, frame) pair. A flattened
    // "name_frame" string would make the standalone sample `a_1` collide
    // with frame 1 of sequence `a`, silently dropping one of them.
    let mut seen_samples: std::collections::HashSet<(String, Option<u64>)> =
        std::collections::HashSet::new();

    for idx in 0..df.height() {
        let name = match names.get(idx) {
            Some(n) => n.to_string(),
            None => continue,
        };

        let frame = frames_u64
            .and_then(|s| s.get(idx))
            .or_else(|| frames_u32.and_then(|s| s.get(idx).map(u64::from)));

        // `insert` returns false when the pair was already present
        if !seen_samples.insert((name.clone(), frame)) {
            continue;
        }

        // Canonical location of the sample inside the container
        let expected_path = match frame {
            Some(frame_num) => {
                PathBuf::from(&name).join(format!("{}_{:03}.camera.jpeg", name, frame_num))
            }
            None => PathBuf::from(format!("{}.camera.jpeg", name)),
        };

        // Try to find the actual file using flexible matching
        let actual_path = find_matching_file(&file_index, &name, frame);

        result.push(ResolvedFile {
            name,
            frame,
            path: actual_path,
            expected_path,
        });
    }

    Ok(result)
}
300
301/// Build an index of all files in a directory for fast lookup.
302fn build_file_index(root: &Path) -> Result<HashMap<String, PathBuf>, Error> {
303    let mut index = HashMap::new();
304
305    if !root.exists() {
306        return Ok(index);
307    }
308
309    for entry in WalkDir::new(root)
310        .into_iter()
311        .filter_map(|e| e.ok())
312        .filter(|e| e.file_type().is_file())
313    {
314        let path = entry.path().to_path_buf();
315        if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
316            // Index by full filename
317            index.insert(filename.to_lowercase(), path.clone());
318
319            // Also index by stem (without extension) for flexible matching
320            if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
321                // Handle double extensions like .camera.jpeg
322                let clean_stem = stem.strip_suffix(".camera").unwrap_or(stem).to_lowercase();
323                index.entry(clean_stem).or_insert_with(|| path.clone());
324            }
325        }
326    }
327
328    Ok(index)
329}
330
331/// Find a matching file in the index using flexible matching.
332fn find_matching_file(
333    index: &HashMap<String, PathBuf>,
334    name: &str,
335    frame: Option<u64>,
336) -> Option<PathBuf> {
337    let search_key = match frame {
338        Some(f) => format!("{}_{:03}", name, f).to_lowercase(),
339        None => name.to_lowercase(),
340    };
341
342    // Try exact filename match first
343    for ext in IMAGE_EXTENSIONS {
344        let key = format!("{}.{}", search_key, ext);
345        if let Some(path) = index.get(&key) {
346            return Some(path.clone());
347        }
348    }
349
350    // Try stem match
351    if let Some(path) = index.get(&search_key) {
352        return Some(path.clone());
353    }
354
355    None
356}
357
/// A single problem detected while validating a dataset directory.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ValidationIssue {
    /// The Arrow annotation file was not found at `expected`.
    MissingArrowFile { expected: PathBuf },
    /// The sensor container directory was not found at `expected`.
    MissingSensorContainer { expected: PathBuf },
    /// The sample `name` is referenced by the Arrow file but no matching
    /// file was found; `expected` is its expected relative path.
    MissingFile { name: String, expected: PathBuf },
    /// The file at `path` is in the container but is not referenced.
    UnreferencedFile { path: PathBuf },
    /// The directory layout is invalid; `message` describes the problem.
    InvalidStructure { message: String },
}
372
373impl std::fmt::Display for ValidationIssue {
374    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
375        match self {
376            ValidationIssue::MissingArrowFile { expected } => {
377                write!(f, "Missing Arrow file: {:?}", expected)
378            }
379            ValidationIssue::MissingSensorContainer { expected } => {
380                write!(f, "Missing sensor container directory: {:?}", expected)
381            }
382            ValidationIssue::MissingFile { name, expected } => {
383                write!(f, "Missing file for sample '{}': {:?}", name, expected)
384            }
385            ValidationIssue::UnreferencedFile { path } => {
386                write!(f, "Unreferenced file in container: {:?}", path)
387            }
388            ValidationIssue::InvalidStructure { message } => {
389                write!(f, "Invalid structure: {}", message)
390            }
391        }
392    }
393}
394
/// Validate the structure of a dataset directory.
///
/// Checks that the directory follows the EdgeFirst Dataset Format:
/// - `{dataset_dir}/{name}.arrow` exists and is a file
/// - `{dataset_dir}/{name}/` exists and is a directory (the sensor container)
/// - Every sample referenced in the Arrow file has a matching file in the
///   container
/// - Reports any sensor file in the container that is not referenced
///
/// where `{name}` is the final component of `dataset_dir`.
///
/// # Arguments
///
/// * `dataset_dir` - Path to the snapshot root directory
///
/// # Returns
///
/// A list of validation issues (empty if valid). When the Arrow file or the
/// sensor container is missing, that single issue is returned and no further
/// checks are run.
///
/// # Errors
///
/// Returns an error if `dataset_dir` has no final component or it is not
/// valid UTF-8, or if the Arrow file cannot be read.
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::validate_dataset_structure;
/// use std::path::Path;
///
/// let issues = validate_dataset_structure(Path::new("my_dataset"))?;
/// if issues.is_empty() {
///     println!("Dataset structure is valid!");
/// } else {
///     for issue in &issues {
///         eprintln!("Issue: {}", issue);
///     }
/// }
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn validate_dataset_structure(dataset_dir: &Path) -> Result<Vec<ValidationIssue>, Error> {
    use std::collections::HashSet;

    // Get the dataset name from the directory name
    let dataset_name = dataset_dir
        .file_name()
        .and_then(|n| n.to_str())
        .ok_or_else(|| Error::InvalidParameters("Invalid dataset directory path".to_owned()))?;

    // The Arrow file must be a regular file; a directory with that name does
    // not count. Can't continue validation without it.
    let arrow_path = dataset_dir.join(format!("{}.arrow", dataset_name));
    if !arrow_path.is_file() {
        return Ok(vec![ValidationIssue::MissingArrowFile {
            expected: arrow_path,
        }]);
    }

    // The sensor container must be a directory; a plain file with that name
    // would otherwise be walked as if it were the container.
    let container_path = dataset_dir.join(dataset_name);
    if !container_path.is_dir() {
        return Ok(vec![ValidationIssue::MissingSensorContainer {
            expected: container_path,
        }]);
    }

    // Resolve files and check for missing ones
    let resolved = resolve_files_with_container(&arrow_path, &container_path)?;

    let mut issues = Vec::new();
    // Paths on disk that were matched to a sample
    let mut referenced_files: HashSet<PathBuf> = HashSet::new();

    for file in resolved {
        match file.path {
            Some(path) => {
                referenced_files.insert(path);
            }
            None => issues.push(ValidationIssue::MissingFile {
                name: file.name,
                expected: file.expected_path,
            }),
        }
    }

    // Find unreferenced files in container
    for entry in WalkDir::new(&container_path)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
    {
        let path = entry.path();

        // Only sensor data is considered: images plus `pcd` / `bin` files
        let is_sensor_file = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|e| {
                matches!(
                    e.to_lowercase().as_str(),
                    "jpg" | "jpeg" | "png" | "pcd" | "bin"
                )
            })
            .unwrap_or(false);

        if is_sensor_file && !referenced_files.contains(path) {
            issues.push(ValidationIssue::UnreferencedFile {
                path: path.to_path_buf(),
            });
        }
    }

    Ok(issues)
}
504
/// Generate an Arrow file from a folder of images.
///
/// Scans the folder recursively for sensor files and creates an Arrow
/// annotation file with one row per file and null annotations (for
/// unannotated datasets). This is useful for importing existing image
/// collections into EdgeFirst.
///
/// Files are selected by extension (`jpg`, `jpeg`, `png`, `pcd`, `bin`,
/// case-insensitive) and processed in sorted path order, so the same folder
/// always yields the same row order.
///
/// # Arguments
///
/// * `folder` - Path to the folder containing images
/// * `output` - Path where the Arrow file should be written; missing parent
///   directories are created
/// * `detect_sequences` - If true, attempt to detect sequences from naming
///   patterns
///
/// # Returns
///
/// The number of samples (files) included in the Arrow file.
///
/// # Errors
///
/// Returns an error if no matching file is found in `folder`, if the output
/// directory or file cannot be created, or if the Arrow data cannot be built
/// or written.
///
/// # Sequence Detection
///
/// When `detect_sequences` is true, the function looks for patterns like:
/// - `{name}_{number}.{ext}` → sequence with frame number
/// - `{sequence}/{name}_{number}.{ext}` → sequence in subdirectory
///
/// # Example
///
/// ```rust,no_run
/// use edgefirst_client::format::generate_arrow_from_folder;
/// use std::path::Path;
///
/// // Generate Arrow file from images
/// let count = generate_arrow_from_folder(
///     Path::new("my_images"),
///     Path::new("my_dataset/my_dataset.arrow"),
///     true, // detect sequences
/// )?;
/// println!("Created Arrow file with {} samples", count);
/// # Ok::<(), edgefirst_client::Error>(())
/// ```
#[cfg(feature = "polars")]
pub fn generate_arrow_from_folder(
    folder: &Path,
    output: &Path,
    detect_sequences: bool,
) -> Result<usize, Error> {
    use polars::prelude::*;
    use std::io::{BufWriter, Write};

    // Collect all image files
    let mut image_files: Vec<PathBuf> = WalkDir::new(folder)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
        .filter(|e| {
            e.path()
                .extension()
                .and_then(|ext| ext.to_str())
                .map(|ext| {
                    matches!(
                        ext.to_lowercase().as_str(),
                        "jpg" | "jpeg" | "png" | "pcd" | "bin"
                    )
                })
                .unwrap_or(false)
        })
        .map(|e| e.path().to_path_buf())
        .collect();

    if image_files.is_empty() {
        return Err(Error::InvalidParameters(
            "No image files found in folder".to_owned(),
        ));
    }

    // Directory traversal order is file-system dependent; sort so the
    // generated file is reproducible.
    image_files.sort();

    // Parse each image file to extract name and frame
    let (names, frames): (Vec<String>, Vec<Option<u64>>) = image_files
        .iter()
        .map(|path| parse_image_filename(path, folder, detect_sequences))
        .unzip();

    // Build the DataFrame with the 2025.10 schema
    let name_series = Series::new("name".into(), &names);
    let frame_series = Series::new("frame".into(), &frames);

    // Create null columns for annotations
    let null_strings: Vec<Option<&str>> = vec![None; names.len()];
    let null_u64s: Vec<Option<u64>> = vec![None; names.len()];

    let object_id_series = Series::new("object_id".into(), &null_strings);
    let label_series = Series::new("label".into(), &null_strings);
    let label_index_series = Series::new("label_index".into(), &null_u64s);
    let group_series = Series::new("group".into(), &null_strings);

    // Null geometry columns - use Option<Series> like annotations_dataframe does
    let null_series_vec: Vec<Option<Series>> = vec![None; names.len()];

    let mask_series = Series::new("mask".into(), null_series_vec.clone())
        .cast(&DataType::List(Box::new(DataType::Float32)))?;

    let box2d_series = Series::new("box2d".into(), null_series_vec.clone())
        .cast(&DataType::Array(Box::new(DataType::Float32), 4))?;

    let box3d_series = Series::new("box3d".into(), null_series_vec)
        .cast(&DataType::Array(Box::new(DataType::Float32), 6))?;

    let mut df = DataFrame::new(vec![
        name_series.into(),
        frame_series.into(),
        object_id_series.into(),
        label_series.into(),
        label_index_series.into(),
        group_series.into(),
        mask_series.into(),
        box2d_series.into(),
        box3d_series.into(),
    ])?;

    // Create output directory if needed
    if let Some(parent) = output.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Write the Arrow file. The writer is lent to `IpcWriter` so it can be
    // flushed explicitly afterwards: a flush that only happens when the
    // `BufWriter` is dropped silently discards any I/O error.
    let mut writer = BufWriter::new(File::create(output)?);
    IpcWriter::new(&mut writer)
        .finish(&mut df)
        .map_err(|e| Error::InvalidParameters(format!("Failed to write Arrow file: {}", e)))?;
    writer.flush()?;

    Ok(image_files.len())
}
639
/// Parse an image filename to extract sample name and frame number.
///
/// The file stem is taken from `path` (falling back to `"unknown"` when it is
/// missing or not valid UTF-8) and a trailing `.camera` is removed, so
/// `seq_001.camera.jpeg` is treated as `seq_001`.
///
/// When `detect_sequences` is true and the stem has the form
/// `{prefix}_{digits}` with a non-empty prefix and a suffix made only of
/// ASCII digits that fits in a `u64`, the result is `(prefix, Some(frame))`.
/// This applies both to files directly in the scanned folder and to files in
/// subdirectories. In every other case the whole stem is returned as the name
/// with no frame.
///
/// `_root` (the scanned folder) does not influence the result; it is kept so
/// existing call sites stay unchanged.
fn parse_image_filename(path: &Path, _root: &Path, detect_sequences: bool) -> (String, Option<u64>) {
    let stem = path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("unknown");

    // Remove .camera suffix if present
    let clean_stem = stem.strip_suffix(".camera").unwrap_or(stem);

    if detect_sequences {
        // Sequence pattern: name_frame, split at the last underscore
        if let Some((name_part, frame_str)) = clean_stem.rsplit_once('_') {
            // An empty prefix would produce an empty sample name, and
            // `u64::from_str` accepts a leading '+', so require a real prefix
            // and digits only before parsing.
            let is_frame_suffix = !name_part.is_empty()
                && !frame_str.is_empty()
                && frame_str.bytes().all(|b| b.is_ascii_digit());

            if is_frame_suffix {
                // Parsing can still fail when the number overflows u64; such
                // files are treated as standalone.
                if let Ok(frame) = frame_str.parse::<u64>() {
                    return (name_part.to_string(), Some(frame));
                }
            }
        }
    }

    // No sequence detected
    (clean_stem.to_string(), None)
}
677
/// Compute where the sensor container of a dataset directory should be.
///
/// The container is the subdirectory that carries the same name as the
/// dataset directory itself, i.e. `{dataset_dir}/{dataset_name}`.
///
/// # Arguments
///
/// * `dataset_dir` - Path to the snapshot root directory
///
/// # Returns
///
/// The expected path to the sensor container directory, or `None` when
/// `dataset_dir` has no final component or that component is not valid UTF-8.
pub fn get_sensor_container_path(dataset_dir: &Path) -> Option<PathBuf> {
    dataset_dir
        .file_name()
        .and_then(|component| component.to_str())
        .map(|name| dataset_dir.join(name))
}
691
/// Compute where the Arrow annotation file of a dataset directory should be.
///
/// The file is named after the dataset directory itself, i.e.
/// `{dataset_dir}/{dataset_name}.arrow`.
///
/// # Arguments
///
/// * `dataset_dir` - Path to the snapshot root directory
///
/// # Returns
///
/// The expected path to the Arrow annotation file, or `None` when
/// `dataset_dir` has no final component or that component is not valid UTF-8.
pub fn get_arrow_path(dataset_dir: &Path) -> Option<PathBuf> {
    dataset_dir
        .file_name()
        .and_then(|component| component.to_str())
        .map(|name| dataset_dir.join(format!("{}.arrow", name)))
}
705
706#[cfg(test)]
707mod tests {
708    use super::*;
709    use std::io::Write;
710    use tempfile::TempDir;
711
712    /// Create a test image file (minimal JPEG).
713    fn create_test_image(path: &Path) {
714        // Minimal valid JPEG (smallest possible)
715        let jpeg_data: &[u8] = &[
716            0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00,
717            0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0xFF, 0xDB, 0x00, 0x43, 0x00, 0x08, 0x06, 0x06,
718            0x07, 0x06, 0x05, 0x08, 0x07, 0x07, 0x07, 0x09, 0x09, 0x08, 0x0A, 0x0C, 0x14, 0x0D,
719            0x0C, 0x0B, 0x0B, 0x0C, 0x19, 0x12, 0x13, 0x0F, 0x14, 0x1D, 0x1A, 0x1F, 0x1E, 0x1D,
720            0x1A, 0x1C, 0x1C, 0x20, 0x24, 0x2E, 0x27, 0x20, 0x22, 0x2C, 0x23, 0x1C, 0x1C, 0x28,
721            0x37, 0x29, 0x2C, 0x30, 0x31, 0x34, 0x34, 0x34, 0x1F, 0x27, 0x39, 0x3D, 0x38, 0x32,
722            0x3C, 0x2E, 0x33, 0x34, 0x32, 0xFF, 0xC0, 0x00, 0x0B, 0x08, 0x00, 0x01, 0x00, 0x01,
723            0x01, 0x01, 0x11, 0x00, 0xFF, 0xC4, 0x00, 0x1F, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01,
724            0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02,
725            0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xFF, 0xC4, 0x00, 0xB5, 0x10,
726            0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03, 0x05, 0x05, 0x04, 0x04, 0x00, 0x00,
727            0x01, 0x7D, 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
728            0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08, 0x23, 0x42,
729            0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
730            0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x34, 0x35, 0x36, 0x37,
731            0x38, 0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x53, 0x54, 0x55,
732            0x56, 0x57, 0x58, 0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x73,
733            0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
734            0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
735            0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA,
736            0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
737            0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
738            0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFF, 0xDA, 0x00, 0x08,
739            0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0xFB, 0xD5, 0xDB, 0x20, 0xA8, 0xF1, 0x4D, 0x9E,
740            0xBA, 0x79, 0xC5, 0x14, 0x51, 0x40, 0xFF, 0xD9,
741        ];
742
743        if let Some(parent) = path.parent() {
744            std::fs::create_dir_all(parent).unwrap();
745        }
746        let mut file = File::create(path).unwrap();
747        file.write_all(jpeg_data).unwrap();
748    }
749
750    #[test]
751    fn test_get_arrow_path() {
752        let dir = Path::new("/data/my_dataset");
753        let arrow = get_arrow_path(dir).unwrap();
754        assert_eq!(arrow, PathBuf::from("/data/my_dataset/my_dataset.arrow"));
755    }
756
757    #[test]
758    fn test_get_sensor_container_path() {
759        let dir = Path::new("/data/my_dataset");
760        let container = get_sensor_container_path(dir).unwrap();
761        assert_eq!(container, PathBuf::from("/data/my_dataset/my_dataset"));
762    }
763
764    #[test]
765    fn test_parse_image_filename_standalone() {
766        let root = Path::new("/data");
767        let path = Path::new("/data/image.jpg");
768
769        let (name, frame) = parse_image_filename(path, root, true);
770        assert_eq!(name, "image");
771        assert_eq!(frame, None);
772    }
773
774    #[test]
775    fn test_parse_image_filename_camera_extension() {
776        let root = Path::new("/data");
777        let path = Path::new("/data/sample.camera.jpeg");
778
779        let (name, frame) = parse_image_filename(path, root, true);
780        assert_eq!(name, "sample");
781        assert_eq!(frame, None);
782    }
783
784    #[test]
785    fn test_parse_image_filename_sequence() {
786        let root = Path::new("/data");
787        let path = Path::new("/data/seq/seq_001.camera.jpeg");
788
789        let (name, frame) = parse_image_filename(path, root, true);
790        assert_eq!(name, "seq");
791        assert_eq!(frame, Some(1));
792    }
793
794    #[test]
795    fn test_parse_image_filename_no_sequence_detection() {
796        let root = Path::new("/data");
797        let path = Path::new("/data/seq/seq_001.camera.jpeg");
798
799        let (name, frame) = parse_image_filename(path, root, false);
800        assert_eq!(name, "seq_001");
801        assert_eq!(frame, None);
802    }
803
804    #[test]
805    fn test_build_file_index() {
806        let temp_dir = TempDir::new().unwrap();
807        let root = temp_dir.path();
808
809        // Create test files
810        create_test_image(&root.join("image1.jpg"));
811        create_test_image(&root.join("sub/image2.camera.jpeg"));
812
813        let index = build_file_index(root).unwrap();
814
815        // Check that files are indexed
816        assert!(index.contains_key("image1.jpg"));
817        assert!(index.contains_key("image2.camera.jpeg"));
818
819        // Check stem indexing
820        assert!(index.contains_key("image1"));
821        assert!(index.contains_key("image2"));
822    }
823
824    #[test]
825    fn test_find_matching_file() {
826        let temp_dir = TempDir::new().unwrap();
827        let root = temp_dir.path();
828
829        // Create test files
830        create_test_image(&root.join("sample.camera.jpeg"));
831        create_test_image(&root.join("seq/seq_001.camera.jpeg"));
832
833        let index = build_file_index(root).unwrap();
834
835        // Find standalone file
836        let found = find_matching_file(&index, "sample", None);
837        assert!(found.is_some());
838
839        // Find sequence file
840        let found = find_matching_file(&index, "seq", Some(1));
841        assert!(found.is_some());
842
843        // Missing file
844        let found = find_matching_file(&index, "nonexistent", None);
845        assert!(found.is_none());
846    }
847
/// Generates an Arrow file from a folder holding four test images and checks
/// both the reported count and the contents read back from disk.
#[cfg(feature = "polars")]
#[test]
fn test_generate_arrow_from_folder() {
    use polars::prelude::*;

    let scratch = TempDir::new().unwrap();
    let base = scratch.path();
    let images_dir = base.join("images");

    // Two images directly in the folder and two inside a `seq/` subfolder.
    for relative in [
        "photo1.jpg",
        "photo2.png",
        "seq/seq_001.camera.jpeg",
        "seq/seq_002.camera.jpeg",
    ] {
        create_test_image(&images_dir.join(relative));
    }

    // Write the Arrow file next to (not inside) the image folder.
    let arrow_path = base.join("output.arrow");
    let written = generate_arrow_from_folder(&images_dir, &arrow_path, true).unwrap();

    // One entry per created image, and the output file must exist.
    assert_eq!(written, 4);
    assert!(arrow_path.exists());

    // Read the file back and confirm row count and expected columns.
    let mut reader = File::open(&arrow_path).unwrap();
    let table = IpcReader::new(&mut reader).finish().unwrap();

    assert_eq!(table.height(), 4);
    for column in ["name", "frame", "label"] {
        assert!(table.column(column).is_ok());
    }
}
879
/// Writes a minimal Arrow file with `name` and `frame` columns and checks
/// that `resolve_arrow_files` returns one entry per distinct name.
#[cfg(feature = "polars")]
#[test]
fn test_resolve_arrow_files() {
    use polars::prelude::*;
    use std::io::BufWriter;

    let scratch = TempDir::new().unwrap();
    let base = scratch.path();

    // Three rows: two with a null frame and one with frame number 1.
    let frame_values: Vec<Option<u64>> = vec![None, None, Some(1)];
    let name_column = Series::new("name".into(), &["sample1", "sample2", "seq"]);
    let frame_column = Series::new("frame".into(), &frame_values);

    let mut table = DataFrame::new(vec![name_column.into(), frame_column.into()]).unwrap();

    // Persist the table in Arrow IPC format through a buffered writer.
    let arrow_path = base.join("test.arrow");
    let sink = BufWriter::new(File::create(&arrow_path).unwrap());
    IpcWriter::new(sink).finish(&mut table).unwrap();

    // Resolve and verify that every name from the table is present.
    let resolved = resolve_arrow_files(&arrow_path).unwrap();

    assert_eq!(resolved.len(), 3);
    for key in ["sample1", "sample2", "seq"] {
        assert!(resolved.contains_key(key));
    }
}
909
/// Lays out a dataset directory with an Arrow file and a sensor container
/// holding the referenced image, then checks that validation reports no
/// `MissingFile` issues.
#[cfg(feature = "polars")]
#[test]
fn test_validate_dataset_structure_valid() {
    use polars::prelude::*;
    use std::io::BufWriter;

    let scratch = TempDir::new().unwrap();
    let dataset_dir = scratch.path().join("my_dataset");
    std::fs::create_dir_all(&dataset_dir).unwrap();

    // Single-row table: one sample named "image1" with a null frame.
    let frame_values: Vec<Option<u64>> = vec![None];
    let name_column = Series::new("name".into(), &["image1"]);
    let frame_column = Series::new("frame".into(), &frame_values);

    let mut table = DataFrame::new(vec![name_column.into(), frame_column.into()]).unwrap();

    // The Arrow file is named after the dataset directory.
    let arrow_path = dataset_dir.join("my_dataset.arrow");
    let sink = BufWriter::new(File::create(&arrow_path).unwrap());
    IpcWriter::new(sink).finish(&mut table).unwrap();

    // Sensor container: a subdirectory with the same name as the dataset,
    // containing the image for the sample listed in the Arrow file.
    let container = dataset_dir.join("my_dataset");
    create_test_image(&container.join("image1.camera.jpeg"));

    let issues = validate_dataset_structure(&dataset_dir).unwrap();

    // Only `MissingFile` issues matter here; other issue kinds are ignored.
    let mut missing_files = Vec::new();
    for issue in issues.iter() {
        if let ValidationIssue::MissingFile { .. } = issue {
            missing_files.push(issue);
        }
    }
    assert!(
        missing_files.is_empty(),
        "Unexpected missing files: {:?}",
        missing_files
    );
}
950
/// Validates an empty dataset directory and expects exactly one issue, of
/// the `MissingArrowFile` kind.
#[cfg(feature = "polars")]
#[test]
fn test_validate_dataset_structure_missing_arrow() {
    let scratch = TempDir::new().unwrap();
    let empty_dataset = scratch.path().join("my_dataset");
    std::fs::create_dir_all(&empty_dataset).unwrap();

    // The directory exists but contains nothing, so no Arrow file is present.
    let issues = validate_dataset_structure(&empty_dataset).unwrap();

    assert_eq!(issues.len(), 1);
    let only_issue = &issues[0];
    assert!(matches!(
        only_issue,
        ValidationIssue::MissingArrowFile { .. }
    ));
}
966
/// Checks that `IMAGE_EXTENSIONS` lists the plain extensions (`jpg`, `jpeg`,
/// `png`) as well as the compound `camera.jpeg` extension.
#[test]
fn test_image_extensions() {
    for expected in ["jpg", "jpeg", "png", "camera.jpeg"] {
        assert!(IMAGE_EXTENSIONS.contains(&expected));
    }
}
974
/// Renders a `ValidationIssue::MissingFile` through its `Display`
/// implementation and checks that the output mentions the sample name and
/// the expected path.
#[test]
fn test_validation_issue_display() {
    let rendered = ValidationIssue::MissingFile {
        name: String::from("test"),
        expected: PathBuf::from("test.jpg"),
    }
    .to_string();

    // NOTE(review): "test" is a substring of "test.jpg", so the first check
    // is implied by the second; distinct values would test the name
    // independently, if the Display output is meant to include it.
    for fragment in ["test", "test.jpg"] {
        assert!(rendered.contains(fragment));
    }
}
985}