Skip to main content

bids_core/
file.rs

1//! BIDS file representation with type detection, entity access, and companion lookup.
2//!
3//! [`BidsFile`] is the central type representing a single file in a BIDS dataset.
4//! It carries the file's path, extracted entities, metadata from JSON sidecars,
5//! and provides methods for reading JSON/TSV content, finding companion files,
6//! and copying/symlinking.
7
8use serde::{Deserialize, Serialize};
9use std::path::{Path, PathBuf};
10
11use crate::entities::{Entities, EntityValue};
12use crate::metadata::BidsMetadata;
13
/// Strategy used when placing a file at a new location.
///
/// # Example
///
/// ```
/// use bids_core::file::CopyMode;
///
/// let mode = CopyMode::Symlink;
/// let mode_from_bool: CopyMode = true.into(); // Symlink
/// assert_eq!(mode, mode_from_bool);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CopyMode {
    /// Duplicate the file contents at the destination. This is the
    /// [`Default`] variant.
    #[default]
    Copy,
    /// Create a symbolic link pointing at the original file. On non-Unix
    /// targets a regular copy is made instead.
    Symlink,
}
33
34impl From<bool> for CopyMode {
35    /// `true` → `Symlink`, `false` → `Copy` (backwards compatibility).
36    fn from(symbolic: bool) -> Self {
37        if symbolic {
38            CopyMode::Symlink
39        } else {
40            CopyMode::Copy
41        }
42    }
43}
44
45impl std::fmt::Display for CopyMode {
46    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
47        match self {
48            Self::Copy => write!(f, "copy"),
49            Self::Symlink => write!(f, "symlink"),
50        }
51    }
52}
53
54/// The type of a BIDS file, determining what extra operations are available.
55///
56/// Inferred automatically from file extensions by [`FileType::from_path()`].
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
58#[non_exhaustive]
59pub enum FileType {
60    /// Generic file (no special handling)
61    Generic,
62    /// Tabular data file (.tsv, .tsv.gz)
63    Data,
64    /// Neuroimaging file (.nii, .nii.gz, .gii, .dtseries.nii, .func.gii)
65    Image,
66    /// JSON sidecar
67    Json,
68    /// EEG data file (.edf, .bdf, .set, .vhdr, .eeg, .fdt)
69    Eeg,
70    /// MEG data file (.fif, .ds, .sqd, .con, .raw, .pdf)
71    Meg,
72    /// PET image (.nii, .nii.gz — identified by suffix, but this catches
73    /// the common `.blood.tsv` companion pattern)
74    Pet,
75    /// Microscopy image (.tif, .tiff, .ome.tif, .ome.tiff, .png, .svs)
76    Microscopy,
77    /// NIRS data file (.snirf)
78    Nirs,
79}
80
81impl std::fmt::Display for FileType {
82    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
83        match self {
84            Self::Generic => write!(f, "generic"),
85            Self::Data => write!(f, "data"),
86            Self::Image => write!(f, "image"),
87            Self::Json => write!(f, "json"),
88            Self::Eeg => write!(f, "eeg"),
89            Self::Meg => write!(f, "meg"),
90            Self::Pet => write!(f, "pet"),
91            Self::Microscopy => write!(f, "microscopy"),
92            Self::Nirs => write!(f, "nirs"),
93        }
94    }
95}
96
97/// Extension-to-FileType mapping table.
98///
99/// Compound extensions (`.nii.gz`) must come before simple ones (`.nii`)
100/// so that `ends_with` matching works correctly.
101const EXTENSION_MAP: &[(&[&str], FileType)] = &[
102    (
103        &[".dtseries.nii", ".func.gii", ".nii.gz", ".nii", ".gii"],
104        FileType::Image,
105    ),
106    (&[".tsv.gz", ".tsv"], FileType::Data),
107    (&[".json"], FileType::Json),
108    (
109        &[".edf", ".bdf", ".set", ".vhdr", ".eeg", ".fdt"],
110        FileType::Eeg,
111    ),
112    (
113        &[".fif", ".ds", ".sqd", ".con", ".raw", ".pdf"],
114        FileType::Meg,
115    ),
116    (&[".snirf"], FileType::Nirs),
117    (
118        &[".ome.tif", ".ome.tiff", ".tif", ".tiff", ".svs"],
119        FileType::Microscopy,
120    ),
121];
122
123impl FileType {
124    /// Infer file type from the file extension(s).
125    #[must_use]
126    pub fn from_path(path: &Path) -> Self {
127        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
128
129        for &(extensions, file_type) in EXTENSION_MAP {
130            if extensions.iter().any(|ext| name.ends_with(ext)) {
131                return file_type;
132            }
133        }
134        FileType::Generic
135    }
136}
137
138/// Represents a single file in a BIDS dataset.
139///
140/// This is the Rust equivalent of PyBIDS' `BIDSFile` hierarchy
141/// (BIDSFile, BIDSDataFile, BIDSImageFile, BIDSJSONFile).
142///
143/// # Example
144///
145/// ```
146/// use bids_core::file::BidsFile;
147///
148/// let f = BidsFile::new("/data/sub-01/eeg/sub-01_task-rest_eeg.edf");
149/// assert_eq!(f.suffix(), Some("eeg"));
150/// assert_eq!(f.extension(), ".edf");
151/// assert_eq!(f.file_type, bids_core::FileType::Eeg);
152/// ```
153#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct BidsFile {
155    /// Absolute path to the file.
156    pub path: PathBuf,
157    /// Just the filename component.
158    pub filename: String,
159    /// The parent directory.
160    pub dirname: PathBuf,
161    /// Whether this entry represents a directory.
162    pub is_dir: bool,
163    /// File type determined from extension.
164    pub file_type: FileType,
165    /// Entities extracted from the filename.
166    pub entities: Entities,
167    /// Metadata loaded from JSON sidecars (populated during metadata indexing).
168    ///
169    /// **Note:** This field is skipped during serialization/deserialization.
170    /// If you serialize a `BidsFile` and deserialize it back, metadata will be
171    /// empty. Use `BidsLayout::get_metadata()` to re-populate after
172    /// deserialization, or serialize metadata separately.
173    #[serde(skip)]
174    pub metadata: BidsMetadata,
175}
176
177impl BidsFile {
178    /// Create a new BidsFile from a path.
179    pub fn new(path: impl AsRef<Path>) -> Self {
180        let path = path.as_ref().to_path_buf();
181        let filename = path
182            .file_name()
183            .map(|n| n.to_string_lossy().to_string())
184            .unwrap_or_default();
185        let dirname = path
186            .parent()
187            .map(std::path::Path::to_path_buf)
188            .unwrap_or_default();
189        let is_dir = filename.is_empty();
190        let file_type = FileType::from_path(&path);
191
192        Self {
193            path,
194            filename,
195            dirname,
196            is_dir,
197            file_type,
198            entities: Entities::new(),
199            metadata: BidsMetadata::new(),
200        }
201    }
202
203    /// Set entities on this file, returning `self` for chaining.
204    #[must_use]
205    pub fn with_entities(mut self, entities: Entities) -> Self {
206        self.entities = entities;
207        self
208    }
209
210    /// Set metadata on this file, returning `self` for chaining.
211    #[must_use]
212    pub fn with_metadata(mut self, metadata: BidsMetadata) -> Self {
213        self.metadata = metadata;
214        self
215    }
216
217    /// Get the path relative to a root directory.
218    #[must_use]
219    pub fn relpath(&self, root: &Path) -> Option<PathBuf> {
220        self.path
221            .strip_prefix(root)
222            .ok()
223            .map(std::path::Path::to_path_buf)
224    }
225
226    /// Get a combined view of filename entities and metadata.
227    #[must_use]
228    pub fn get_entities(&self, metadata: Option<bool>) -> Entities {
229        match metadata {
230            Some(true) => {
231                // Only metadata entities
232                self.metadata
233                    .iter()
234                    .map(|(k, v)| (k.clone(), EntityValue::Json(v.clone())))
235                    .collect()
236            }
237            Some(false) => self.entities.clone(),
238            None => {
239                let mut merged = self.entities.clone();
240                for (k, v) in self.metadata.iter() {
241                    if !merged.contains_key(k) {
242                        merged.insert(k.clone(), EntityValue::Json(v.clone()));
243                    }
244                }
245                merged
246            }
247        }
248    }
249
250    /// Get metadata for this file (from JSON sidecars).
251    #[must_use]
252    pub fn get_metadata(&self) -> &BidsMetadata {
253        &self.metadata
254    }
255
256    /// Get the full extension, including compound extensions like `.tsv.gz`.
257    ///
258    /// Compound extensions are checked first (longest match wins).
259    #[must_use]
260    pub fn extension(&self) -> &str {
261        const COMPOUND_EXTENSIONS: &[&str] = &[
262            ".dtseries.nii",
263            ".func.gii",
264            ".ome.tif",
265            ".ome.tiff",
266            ".nii.gz",
267            ".tsv.gz",
268        ];
269        let name = &self.filename;
270        for ext in COMPOUND_EXTENSIONS {
271            if name.ends_with(ext) {
272                return ext;
273            }
274        }
275        name.rfind('.').map(|start| &name[start..]).unwrap_or("")
276    }
277
278    /// Get the suffix (the part before the extension, after the last underscore).
279    #[must_use]
280    pub fn suffix(&self) -> Option<&str> {
281        let stem = self.filename.split('.').next()?;
282        stem.rsplit('_').next()
283    }
284
285    /// Find a companion file by replacing the suffix and extension.
286    ///
287    /// E.g., for `sub-01_task-rest_eeg.edf`, `companion("channels", "tsv")`
288    /// returns `sub-01_task-rest_channels.tsv` in the same directory.
289    #[must_use]
290    pub fn companion(&self, suffix: &str, ext: &str) -> PathBuf {
291        let stem = self.filename.split('.').next().unwrap_or("");
292        let base = stem.rsplit_once('_').map(|(b, _)| b).unwrap_or(stem);
293        self.dirname.join(format!("{base}_{suffix}.{ext}"))
294    }
295
296    /// Read a JSON file and return as `serde_json::Value`.
297    ///
298    /// # Errors
299    ///
300    /// Returns a `BidsError::FileType` if this file is not a JSON file, or
301    /// an I/O / JSON parse error.
302    pub fn get_json(&self) -> Result<serde_json::Value, crate::error::BidsError> {
303        if self.file_type != FileType::Json {
304            return Err(crate::error::BidsError::FileType(format!(
305                "{} is not a JSON file",
306                self.path.display()
307            )));
308        }
309        let contents = std::fs::read_to_string(&self.path)?;
310        let val: serde_json::Value = serde_json::from_str(&contents)?;
311        Ok(val)
312    }
313
314    /// Read a JSON file and return as a `HashMap`.
315    ///
316    /// # Errors
317    ///
318    /// Returns an error if the file is not JSON, can't be read, or the
319    /// top-level JSON value is not an object.
320    pub fn get_dict(
321        &self,
322    ) -> Result<std::collections::HashMap<String, serde_json::Value>, crate::error::BidsError> {
323        let val = self.get_json()?;
324        match val {
325            serde_json::Value::Object(map) => Ok(map.into_iter().collect()),
326            _ => Err(crate::error::BidsError::FileType(format!(
327                "{} is a JSON containing {}, not an object",
328                self.path.display(),
329                val
330            ))),
331        }
332    }
333
334    /// Read a TSV/TSV.GZ file and return rows as `Vec<HashMap<String, String>>`.
335    ///
336    /// Handles both plain `.tsv` and gzip-compressed `.tsv.gz` files. The
337    /// BIDS sentinel value `n/a` is automatically converted to empty strings.
338    ///
339    /// For bulk TSV processing, prefer `bids_io::read_tsv` / `bids_io::read_tsv_gz`
340    /// which share the same parsing logic but don't require a `BidsFile`.
341    ///
342    /// # Errors
343    ///
344    /// Returns an error if the file cannot be read, is empty, or is not a
345    /// tabular data file.
346    ///
347    /// # Example
348    ///
349    /// ```no_run
350    /// # use bids_core::file::BidsFile;
351    /// let f = BidsFile::new("/data/sub-01/eeg/sub-01_events.tsv");
352    /// let rows = f.get_df().unwrap();
353    /// for row in &rows {
354    ///     println!("onset={}, type={}", row["onset"], row["trial_type"]);
355    /// }
356    /// ```
357    pub fn get_df(
358        &self,
359    ) -> Result<Vec<std::collections::HashMap<String, String>>, crate::error::BidsError> {
360        let file = std::fs::File::open(&self.path)?;
361        let reader: Box<dyn std::io::Read> = if self.filename.ends_with(".tsv.gz") {
362            Box::new(flate2::read::GzDecoder::new(file))
363        } else {
364            Box::new(file)
365        };
366        parse_tsv_reader(reader)
367    }
368
369    /// Copy this file to a new location.
370    ///
371    /// Accepts a [`CopyMode`] to control whether the file is copied or
372    /// symlinked. Prefer the enum over a bare boolean for clarity.
373    ///
374    /// # Errors
375    ///
376    /// Returns an I/O error if the source can't be read or the destination
377    /// can't be created.
378    pub fn copy_to(
379        &self,
380        new_path: &std::path::Path,
381        mode: CopyMode,
382    ) -> Result<(), crate::error::BidsError> {
383        if let Some(parent) = new_path.parent() {
384            std::fs::create_dir_all(parent)?;
385        }
386        match mode {
387            CopyMode::Symlink => {
388                #[cfg(unix)]
389                std::os::unix::fs::symlink(&self.path, new_path)?;
390                #[cfg(not(unix))]
391                std::fs::copy(&self.path, new_path)?;
392            }
393            CopyMode::Copy => {
394                std::fs::copy(&self.path, new_path)?;
395            }
396        }
397        Ok(())
398    }
399}
400
401impl std::fmt::Display for BidsFile {
402    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
403        write!(f, "<BidsFile '{}'>", self.path.display())
404    }
405}
406
407impl PartialEq for BidsFile {
408    fn eq(&self, other: &Self) -> bool {
409        self.path == other.path
410    }
411}
412
413impl Eq for BidsFile {}
414
415impl PartialOrd for BidsFile {
416    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
417        Some(self.cmp(other))
418    }
419}
420
421impl Ord for BidsFile {
422    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
423        natural_cmp(&self.path.to_string_lossy(), &other.path.to_string_lossy())
424    }
425}
426
/// Natural sort comparison: numeric substrings are compared as numbers.
///
/// The strings are walked in parallel. When both sides are at an ASCII digit,
/// the two whole digit runs are compared by numeric value (leading zeros are
/// ignored, so `01` and `1` tie and the walk continues). Otherwise the current
/// characters are compared case-insensitively, using the first character of
/// each one's lowercase mapping. A string that ends first sorts first.
///
/// Digit runs are compared textually rather than parsed, so runs of any length
/// are ordered correctly, including values too large for a `u64`.
fn natural_cmp(a: &str, b: &str) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    // Split `s` into its leading run of ASCII digits and the remainder.
    fn split_digits(s: &str) -> (&str, &str) {
        let end = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
        s.split_at(end)
    }

    let (mut rest_a, mut rest_b) = (a, b);
    loop {
        let (ca, cb) = match (rest_a.chars().next(), rest_b.chars().next()) {
            (None, None) => return Ordering::Equal,
            (None, Some(_)) => return Ordering::Less,
            (Some(_), None) => return Ordering::Greater,
            (Some(ca), Some(cb)) => (ca, cb),
        };

        if ca.is_ascii_digit() && cb.is_ascii_digit() {
            let (run_a, tail_a) = split_digits(rest_a);
            let (run_b, tail_b) = split_digits(rest_b);
            // Without leading zeros, a longer run is a larger number, and runs
            // of equal length order lexicographically like their values.
            let sig_a = run_a.trim_start_matches('0');
            let sig_b = run_b.trim_start_matches('0');
            let ord = sig_a
                .len()
                .cmp(&sig_b.len())
                .then_with(|| sig_a.cmp(sig_b));
            if ord != Ordering::Equal {
                return ord;
            }
            rest_a = tail_a;
            rest_b = tail_b;
        } else {
            let la = ca.to_lowercase().next().unwrap_or(ca);
            let lb = cb.to_lowercase().next().unwrap_or(cb);
            let ord = la.cmp(&lb);
            if ord != Ordering::Equal {
                return ord;
            }
            // Advance past the characters just compared.
            rest_a = &rest_a[ca.len_utf8()..];
            rest_b = &rest_b[cb.len_utf8()..];
        }
    }
}
480
481/// Parse TSV rows from any reader. Shared logic for `BidsFile::get_df()`.
482///
483/// The `n/a` sentinel is converted to empty strings per BIDS convention.
484pub(crate) fn parse_tsv_reader(
485    reader: impl std::io::Read,
486) -> Result<Vec<std::collections::HashMap<String, String>>, crate::error::BidsError> {
487    use std::io::{BufRead, BufReader};
488
489    let mut lines = BufReader::new(reader).lines();
490    let header_line = lines
491        .next()
492        .ok_or_else(|| crate::error::BidsError::Csv("Empty TSV file".into()))??;
493    let headers: Vec<String> = header_line
494        .split('\t')
495        .map(|s| s.trim().to_string())
496        .collect();
497
498    let mut rows = Vec::new();
499    for line_result in lines {
500        let line = line_result?;
501        if line.trim().is_empty() {
502            continue;
503        }
504        let values: Vec<&str> = line.split('\t').collect();
505        let mut row = std::collections::HashMap::new();
506        for (i, header) in headers.iter().enumerate() {
507            let val = values.get(i).copied().unwrap_or("").trim();
508            row.insert(
509                header.clone(),
510                if val == "n/a" {
511                    String::new()
512                } else {
513                    val.to_string()
514                },
515            );
516        }
517        rows.push(row);
518    }
519    Ok(rows)
520}
521
#[cfg(test)]
mod tests {
    use super::*;

    /// Each representative file name maps to the expected [`FileType`].
    #[test]
    fn test_file_type_detection() {
        let cases = [
            ("sub-01_T1w.nii.gz", FileType::Image),
            ("sub-01_events.tsv", FileType::Data),
            ("sub-01_eeg.json", FileType::Json),
            ("sub-01_eeg.edf", FileType::Eeg),
            ("sub-01_eeg.bdf", FileType::Eeg),
            ("sub-01_meg.fif", FileType::Meg),
            ("sub-01_nirs.snirf", FileType::Nirs),
            ("sub-01_sample-A_FLUO.tif", FileType::Microscopy),
            ("README", FileType::Generic),
        ];
        for (name, expected) in cases {
            assert_eq!(FileType::from_path(Path::new(name)), expected);
        }
    }

    /// Constructing from a path fills in name, type, suffix and extension.
    #[test]
    fn test_bids_file() {
        let file = BidsFile::new("/data/sub-01/eeg/sub-01_task-rest_eeg.edf");
        assert_eq!(file.filename, "sub-01_task-rest_eeg.edf");
        assert_eq!(file.file_type, FileType::Eeg);
        assert_eq!(file.suffix(), Some("eeg"));
        assert_eq!(file.extension(), ".edf");
    }

    /// Numeric parts sort by value, not character by character.
    #[test]
    fn test_natural_sort() {
        let mut files: Vec<String> = ["sub-10", "sub-2", "sub-1", "sub-20"]
            .iter()
            .map(|label| label.to_string())
            .collect();
        files.sort_by(|lhs, rhs| natural_cmp(lhs, rhs));
        assert_eq!(files, vec!["sub-1", "sub-2", "sub-10", "sub-20"]);
    }
}
583}