Skip to main content

bids_core/
entities.rs

1//! BIDS entity definitions, parsing, and canonical ordering.
2//!
3//! Entities are the key-value pairs encoded in BIDS filenames (e.g.,
4//! `sub-01`, `task-rest`, `run-02`). This module provides:
5//!
6//! - [`Entity`] — A named entity definition with a regex pattern for extraction
7//! - [`EntityValue`] — A typed value (string, padded int, float, bool, JSON)
8//! - [`parse_file_entities()`] — Extract all entities from a file path
9//! - [`ENTITY_ORDER`] — Canonical BIDS entity ordering
10//!
11//! All other crates use these types to represent and match BIDS entities.
12
13use indexmap::IndexMap;
14use regex::Regex;
15use serde::{Deserialize, Serialize};
16use std::collections::HashMap;
17use std::sync::OnceLock;
18
19use crate::padded_int::PaddedInt;
20
/// Represents a single entity defined in configuration.
///
/// Corresponds to PyBIDS `Entity` — a named key (e.g., "subject", "task")
/// with a regex pattern to extract values from file paths.
///
/// The regex is kept out of the serialized form: only `name`, `pattern`,
/// `mandatory`, `directory` and `dtype` are (de)serialized.
///
/// # Example
///
/// ```
/// use bids_core::Entity;
///
/// let ent = Entity::new("subject", r"[/\\]+sub-([a-zA-Z0-9]+)");
/// let val = ent.match_path("/sub-01/anat/sub-01_T1w.nii.gz");
/// assert_eq!(val.unwrap().as_str_lossy(), "01");
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Entity name (e.g., `"subject"`); used as the key under which
    /// `parse_file_entities()` stores the matched value.
    pub name: String,
    /// Regex source text. The value is taken from the first capture group.
    pub pattern: String,
    /// Whether the entity is flagged as mandatory; `false` when the field is
    /// absent from the serialized form.
    ///
    /// NOTE(review): within this module the flag only affects the `Display`
    /// output; enforcement presumably lives in callers — confirm.
    #[serde(default)]
    pub mandatory: bool,
    /// Optional directory pattern; `None` when absent from the serialized form.
    ///
    /// NOTE(review): this module only stores the value (see
    /// `with_directory`); its interpretation is not visible here.
    #[serde(default)]
    pub directory: Option<String>,
    /// Name of the value type used by `coerce_value()`: `"int"`, `"float"`
    /// and `"bool"` are recognized, anything else is treated as a string.
    /// Defaults to `"str"` when absent from the serialized form.
    #[serde(default = "default_dtype")]
    pub dtype: String,

    /// Lazily compiled regex — uses `OnceLock` so matching only needs `&self`.
    ///
    /// Skipped by serde, so a deserialized entity starts with an empty cell
    /// that is filled on the first call to `regex()`. The inner `Option` is
    /// `None` when `pattern` failed to compile.
    #[serde(skip)]
    compiled_regex: OnceLock<Option<Regex>>,
}
50
/// Serde fallback for [`Entity::dtype`]: entities without an explicit dtype
/// are treated as plain strings.
fn default_dtype() -> String {
    String::from("str")
}
54
55impl std::fmt::Display for Entity {
56    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57        write!(f, "Entity('{}', dtype={})", self.name, self.dtype)?;
58        if self.mandatory {
59            write!(f, " [mandatory]")?;
60        }
61        Ok(())
62    }
63}
64
65impl Entity {
66    /// Create a new entity.
67    ///
68    /// Eagerly compiles the regex. If the pattern is invalid, the entity will
69    /// never match any path. Prefer [`Entity::try_new()`] when working with
70    /// user-supplied patterns so you can surface the error.
71    pub fn new(name: &str, pattern: &str) -> Self {
72        let lock = OnceLock::new();
73        let compiled = Regex::new(pattern).ok();
74        #[cfg(debug_assertions)]
75        if compiled.is_none() {
76            log::warn!("invalid regex pattern for entity '{name}': {pattern}");
77        }
78        let _ = lock.set(compiled);
79        Self {
80            name: name.to_string(),
81            pattern: pattern.to_string(),
82            mandatory: false,
83            directory: None,
84            dtype: "str".to_string(),
85            compiled_regex: lock,
86        }
87    }
88
89    /// Create a new entity, returning an error if the regex pattern is invalid.
90    ///
91    /// # Errors
92    ///
93    /// Returns `regex::Error` if `pattern` is not a valid regular expression.
94    pub fn try_new(name: &str, pattern: &str) -> Result<Self, regex::Error> {
95        let compiled = Regex::new(pattern)?;
96        let lock = OnceLock::new();
97        let _ = lock.set(Some(compiled));
98        Ok(Self {
99            name: name.to_string(),
100            pattern: pattern.to_string(),
101            mandatory: false,
102            directory: None,
103            dtype: "str".to_string(),
104            compiled_regex: lock,
105        })
106    }
107
108    /// Set the data type (`"str"`, `"int"`, `"float"`, `"bool"`).
109    #[must_use]
110    pub fn with_dtype(mut self, dtype: &str) -> Self {
111        self.dtype = dtype.to_string();
112        self
113    }
114
115    /// Set the directory pattern for this entity.
116    #[must_use]
117    pub fn with_directory(mut self, directory: &str) -> Self {
118        self.directory = Some(directory.to_string());
119        self
120    }
121
122    /// Mark this entity as mandatory.
123    #[must_use]
124    pub fn with_mandatory(mut self, mandatory: bool) -> Self {
125        self.mandatory = mandatory;
126        self
127    }
128
129    /// Return the compiled regex, lazily compiling on first access.
130    ///
131    /// Returns `None` if the pattern is invalid. Only requires `&self`.
132    pub fn regex(&self) -> Option<&Regex> {
133        self.compiled_regex
134            .get_or_init(|| Regex::new(&self.pattern).ok())
135            .as_ref()
136    }
137
138    /// Match the entity pattern against a file path.
139    /// Returns the captured value if found.
140    ///
141    /// Only requires `&self` (no mutable borrow needed).
142    pub fn match_path(&self, path: &str) -> Option<EntityValue> {
143        let regex = self.regex()?;
144        let caps = regex.captures(path)?;
145        let val_str = caps.get(1)?.as_str();
146        Some(self.coerce_value(val_str))
147    }
148
149    /// Coerce a string value to the appropriate type.
150    pub fn coerce_value(&self, val: &str) -> EntityValue {
151        match self.dtype.as_str() {
152            "int" => EntityValue::Int(PaddedInt::new(val)),
153            "float" => EntityValue::Float(val.parse().unwrap_or(0.0)),
154            "bool" => EntityValue::Bool(val.parse().unwrap_or(false)),
155            _ => EntityValue::Str(val.to_string()),
156        }
157    }
158}
159
/// A typed entity value, preserving the original representation where needed.
///
/// Most entity values are strings (e.g., `sub-01` → `Str("01")`), but some
/// are typed as integers (preserving zero-padding via [`PaddedInt`]),
/// floats, or booleans. The [`Json`](EntityValue::Json) variant is used for
/// metadata values merged from JSON sidecars.
///
/// The enum is `#[serde(untagged)]`: no variant name is written when
/// serializing, and deserialization picks the first variant (in declaration
/// order) that accepts the input.
///
/// # Conversions
///
/// - `From<&str>` and `From<String>` → `EntityValue::Str`
/// - `From<i32>` and `From<i64>` → `EntityValue::Int`
/// - `From<f64>` → `EntityValue::Float`
/// - `From<bool>` → `EntityValue::Bool`
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum EntityValue {
    /// Plain text value; also what `Entity::coerce_value()` produces for any
    /// dtype other than `"int"`, `"float"` or `"bool"`.
    Str(String),
    /// Integer value wrapped in [`PaddedInt`] (e.g., `run-02` displays as `02`).
    Int(PaddedInt),
    /// Floating-point value.
    Float(f64),
    /// Boolean value.
    Bool(bool),
    /// Arbitrary JSON value.
    Json(serde_json::Value),
}
182
183impl From<&str> for EntityValue {
184    fn from(s: &str) -> Self {
185        EntityValue::Str(s.to_string())
186    }
187}
188
189impl From<String> for EntityValue {
190    fn from(s: String) -> Self {
191        EntityValue::Str(s)
192    }
193}
194
195impl From<i32> for EntityValue {
196    fn from(v: i32) -> Self {
197        EntityValue::Int(PaddedInt::from(v))
198    }
199}
200
201impl From<i64> for EntityValue {
202    fn from(v: i64) -> Self {
203        EntityValue::Int(PaddedInt::from(v))
204    }
205}
206
207impl From<f64> for EntityValue {
208    fn from(v: f64) -> Self {
209        EntityValue::Float(v)
210    }
211}
212
213impl From<bool> for EntityValue {
214    fn from(v: bool) -> Self {
215        EntityValue::Bool(v)
216    }
217}
218
219impl EntityValue {
220    /// Get the value as a string representation.
221    ///
222    /// Returns a `Cow::Borrowed` for `Str` variants (zero-copy) and
223    /// `Cow::Owned` for all others.
224    #[must_use]
225    pub fn as_str_lossy(&self) -> std::borrow::Cow<'_, str> {
226        match self {
227            EntityValue::Str(s) => std::borrow::Cow::Borrowed(s),
228            EntityValue::Int(i) => std::borrow::Cow::Owned(i.to_string()),
229            EntityValue::Float(f) => std::borrow::Cow::Owned(f.to_string()),
230            EntityValue::Bool(b) => std::borrow::Cow::Owned(b.to_string()),
231            EntityValue::Json(v) => std::borrow::Cow::Owned(v.to_string()),
232        }
233    }
234
235    /// Try to extract the value as an `i64`.
236    ///
237    /// Returns `Some` for `Int`, `Float` (truncated), and `Str` (parsed).
238    #[must_use]
239    pub fn as_i64(&self) -> Option<i64> {
240        match self {
241            EntityValue::Int(p) => Some(p.value()),
242            EntityValue::Float(f) => Some(*f as i64),
243            EntityValue::Str(s) => s.parse().ok(),
244            _ => None,
245        }
246    }
247
248    /// Try to extract the value as an `f64`.
249    #[must_use]
250    pub fn as_f64(&self) -> Option<f64> {
251        match self {
252            EntityValue::Float(f) => Some(*f),
253            EntityValue::Int(p) => Some(p.value() as f64),
254            EntityValue::Str(s) => s.parse().ok(),
255            _ => None,
256        }
257    }
258
259    /// Try to extract the value as a `bool`.
260    #[must_use]
261    pub fn as_bool(&self) -> Option<bool> {
262        match self {
263            EntityValue::Bool(b) => Some(*b),
264            EntityValue::Str(s) => s.parse().ok(),
265            _ => None,
266        }
267    }
268
269    /// Returns `true` if this is a `Str` variant.
270    #[must_use]
271    pub fn is_str(&self) -> bool {
272        matches!(self, EntityValue::Str(_))
273    }
274
275    /// Returns `true` if this is an `Int` variant.
276    #[must_use]
277    pub fn is_int(&self) -> bool {
278        matches!(self, EntityValue::Int(_))
279    }
280}
281
282impl std::fmt::Display for EntityValue {
283    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
284        write!(f, "{}", self.as_str_lossy())
285    }
286}
287
288impl PartialEq for EntityValue {
289    fn eq(&self, other: &Self) -> bool {
290        // Use canonical string form for all comparisons so that Eq and Hash
291        // are consistent — two values are equal iff they produce the same
292        // canonical string.  This avoids the previous bug where float epsilon
293        // comparison could disagree with the string-based Hash.
294        *self.as_str_lossy() == *other.as_str_lossy()
295    }
296}
297
298impl Eq for EntityValue {}
299
300impl std::hash::Hash for EntityValue {
301    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
302        self.as_str_lossy().hash(state);
303    }
304}
305
/// A map from entity names to their typed values for a single file.
///
/// Uses `IndexMap` to preserve insertion order, which gives deterministic
/// iteration in canonical BIDS entity order when entities are inserted
/// following [`ENTITY_ORDER`].
///
/// The map itself never reorders keys: if entries were inserted in some other
/// order, use [`sort_entities()`] to obtain them in canonical order.
pub type Entities = IndexMap<String, EntityValue>;
312
/// A map from entity names to string values (used in variables/collections).
///
/// Backed by a std `HashMap`, so iteration order is unspecified — unlike
/// [`Entities`], which keeps insertion order.
pub type StringEntities = HashMap<String, String>;
315
/// Standard BIDS entities in their canonical order.
///
/// [`sort_entities()`] ranks each key by its index in this slice; a key that
/// is not listed here ranks after every listed key.
///
/// The list ends with `suffix`, `extension` and `datatype`, which therefore
/// sort after all the other listed names.
pub const ENTITY_ORDER: &[&str] = &[
    "subject",
    "session",
    "sample",
    "task",
    "tracksys",
    "acquisition",
    "ceagent",
    "staining",
    "tracer",
    "reconstruction",
    "direction",
    "run",
    "modality",
    "echo",
    "flip",
    "inversion",
    "mtransfer",
    "part",
    "processing",
    "hemisphere",
    "space",
    "split",
    "recording",
    "chunk",
    "atlas",
    "resolution",
    "density",
    "label",
    "description",
    "suffix",
    "extension",
    "datatype",
];
351
352/// Parse entities from a filename using the provided entity definitions.
353///
354/// Only requires `&[Entity]` — no mutable borrow needed thanks to lazy regex
355/// compilation via `OnceLock`.
356#[must_use]
357pub fn parse_file_entities(path: &str, entities: &[Entity]) -> Entities {
358    let mut result = Entities::new();
359    for entity in entities.iter() {
360        if let Some(val) = entity.match_path(path) {
361            result.insert(entity.name.clone(), val);
362        }
363    }
364    result
365}
366
367/// Sort entity keys according to the canonical BIDS ordering.
368#[must_use]
369pub fn sort_entities(entities: &Entities) -> Vec<(String, EntityValue)> {
370    let mut pairs: Vec<_> = entities
371        .iter()
372        .map(|(k, v)| (k.clone(), v.clone()))
373        .collect();
374
375    pairs.sort_by_key(|(k, _)| {
376        ENTITY_ORDER
377            .iter()
378            .position(|&e| e == k.as_str())
379            .unwrap_or(ENTITY_ORDER.len())
380    });
381
382    pairs
383}
384
#[cfg(test)]
mod tests {
    use super::*;

    /// A string entity yields the text of the first capture group.
    #[test]
    fn test_entity_matching() {
        let ent = Entity::new("subject", r"[/\\]+sub-([a-zA-Z0-9]+)");
        let val = ent
            .match_path("/sub-01/anat/sub-01_T1w.nii.gz")
            .expect("subject pattern should match the path");
        assert_eq!(val.as_str_lossy(), "01");
    }

    /// An `int` entity keeps both the numeric value and the zero-padding.
    #[test]
    fn test_int_entity() {
        let ent = Entity::new("run", r"[_/\\]+run-(\d+)").with_dtype("int");
        let val = ent
            .match_path("sub-01_task-rest_run-02_bold.nii.gz")
            .expect("run pattern should match the filename");
        match val {
            EntityValue::Int(p) => {
                assert_eq!(p.value(), 2);
                assert_eq!(p.to_string(), "02");
            }
            _ => panic!("Expected Int"),
        }
    }

    /// All configured entities are extracted, in definition order.
    #[test]
    fn test_parse_file_entities() {
        let entities = vec![
            Entity::new("subject", r"[/\\]+sub-([a-zA-Z0-9]+)"),
            Entity::new("session", r"[_/\\]+ses-([a-zA-Z0-9]+)"),
            Entity::new("task", r"[_/\\]+task-([a-zA-Z0-9]+)"),
            Entity::new("suffix", r"[_/\\]([a-zA-Z0-9]+)\.[^/\\]+$"),
        ];
        let result = parse_file_entities(
            "/sub-01/ses-02/eeg/sub-01_ses-02_task-rest_eeg.edf",
            &entities,
        );
        assert_eq!(result.get("subject").unwrap().as_str_lossy(), "01");
        assert_eq!(result.get("session").unwrap().as_str_lossy(), "02");
        assert_eq!(result.get("task").unwrap().as_str_lossy(), "rest");
        // The suffix entity is configured above, so check it too.
        assert_eq!(result.get("suffix").unwrap().as_str_lossy(), "eeg");

        // The result keeps the order in which the definitions were supplied.
        let keys: Vec<&str> = result.keys().map(String::as_str).collect();
        assert_eq!(keys, vec!["subject", "session", "task", "suffix"]);
    }
}
427}