Skip to main content

bids_layout/
indexer.rs

1//! Dataset indexer: walks the filesystem and populates the SQLite database.
2//!
3//! Recursively walks a BIDS dataset directory, extracts entities from each
4//! file using regex patterns, indexes JSON sidecar metadata following the
5//! BIDS inheritance principle, and records file associations.
6
7use bids_core::config::Config;
8use bids_core::entities::Entity;
9use bids_core::error::Result;
10use bids_core::file::BidsFile;
11use bids_io::json::read_json_sidecar;
12use bids_validate::{should_force_index, should_ignore};
13use regex::Regex;
14use std::collections::{HashMap, HashSet};
15use std::path::{Path, PathBuf};
16use walkdir::WalkDir;
17
18use crate::db::Database;
19
20/// Collect unique entities from configs, preserving first-seen order.
21fn collect_unique_entities(configs: &[Config]) -> Vec<Entity> {
22    let mut all = Vec::new();
23    let mut seen = HashSet::new();
24    for config in configs {
25        for entity in &config.entities {
26            if seen.insert(entity.name.clone()) {
27                all.push(entity.clone());
28            }
29        }
30    }
31    all
32}
33
34/// Extract entities from a path and insert the file + tags into the database.
35fn index_single_file(path: &Path, db: &Database, entities: &[Entity]) -> Result<()> {
36    let mut bf = BidsFile::new(path);
37    let path_str = path.to_string_lossy();
38    for entity in entities {
39        if let Some(val) = entity.match_path(&path_str) {
40            bf.entities.insert(entity.name.clone(), val);
41        }
42    }
43    db.insert_file(&bf)?;
44    let file_path_str = path_str.into_owned();
45    for (name, val) in &bf.entities {
46        db.insert_tag(&file_path_str, name, &val.as_str_lossy(), "str", false)?;
47    }
48    Ok(())
49}
50
51/// Options controlling how a BIDS dataset directory is indexed.
52///
53/// These options determine which files are included in the index, whether
54/// BIDS validation is enforced, and whether JSON sidecar metadata is loaded.
55pub struct IndexerOptions {
56    pub validate: bool,
57    pub ignore: Vec<Regex>,
58    pub force_index: Vec<Regex>,
59    pub index_metadata: bool,
60    pub config_filename: String,
61}
62
63impl Default for IndexerOptions {
64    fn default() -> Self {
65        Self {
66            validate: true,
67            ignore: bids_validate::DEFAULT_IGNORE.clone(),
68            force_index: Vec::new(),
69            index_metadata: true,
70            config_filename: "layout_config.json".to_string(),
71        }
72    }
73}
74
75/// Index a BIDS dataset directory into the database.
76///
77/// Walks the dataset directory tree, extracts BIDS entities from each file
78/// using the provided configuration, stores files and tags in the database,
79/// and optionally indexes JSON sidecar metadata with inheritance resolution
80/// and file association tracking.
81///
82/// Files in the `derivatives/` directory at the root level are excluded
83/// (derivatives should be added separately via `BidsLayout::add_derivatives`).
84///
85/// Bulk inserts use a single SQLite transaction for dramatically better
86/// performance on large datasets (100× faster than autocommit per-file).
87pub fn index_dataset(
88    root: &Path,
89    db: &Database,
90    configs: &[Config],
91    options: &IndexerOptions,
92) -> Result<()> {
93    // Collect all entities from configs, deduplicating by name.
94    let mut all_entities = collect_unique_entities(configs);
95
96    // Begin a transaction for bulk inserts — avoids per-file fsync.
97    db.begin_transaction()?;
98
99    let result = index_files(root, db, &mut all_entities, options);
100    if result.is_err() {
101        let _ = db.rollback_transaction();
102        return result;
103    }
104
105    // Index .zarr directories as single files
106    index_zarr_dirs(
107        root,
108        db,
109        &all_entities,
110        &options.ignore,
111        &options.force_index,
112    )?;
113
114    db.commit_transaction()?;
115
116    // Index metadata from JSON sidecars (separate transaction)
117    if options.index_metadata {
118        db.begin_transaction()?;
119        let md_result = index_metadata(root, db);
120        if md_result.is_err() {
121            let _ = db.rollback_transaction();
122            return md_result;
123        }
124        db.commit_transaction()?;
125    }
126
127    Ok(())
128}
129
130/// Walk and index files (called within a transaction).
131fn index_files(
132    root: &Path,
133    db: &Database,
134    all_entities: &mut Vec<Entity>,
135    options: &IndexerOptions,
136) -> Result<()> {
137    // Walk the directory tree
138    for entry in WalkDir::new(root)
139        .follow_links(true)
140        .into_iter()
141        .filter_entry(|e| {
142            // Skip derivatives directory at root level
143            if let Ok(rel) = e.path().strip_prefix(root) {
144                let rel_str = rel.to_string_lossy();
145                if rel_str == "derivatives" || rel_str.starts_with("derivatives/") {
146                    return false;
147                }
148            }
149            // Skip ignored directories early
150            if e.file_type().is_dir()
151                && should_ignore(e.path(), root, &options.ignore)
152                && !should_force_index(e.path(), root, &options.force_index)
153            {
154                return false;
155            }
156            true
157        })
158        .filter_map(std::result::Result::ok)
159    {
160        let path = entry.path();
161
162        // Skip directories themselves
163        if entry.file_type().is_dir() {
164            // Check for per-directory config files
165            let config_file = path.join(&options.config_filename);
166            if config_file.exists()
167                && let Ok(cfg) = Config::from_file(&config_file)
168            {
169                for entity in &cfg.entities {
170                    if !all_entities.iter().any(|e| e.name == entity.name) {
171                        all_entities.push(entity.clone());
172                    }
173                }
174            }
175            continue;
176        }
177
178        // Skip the config filename itself
179        if path
180            .file_name()
181            .is_some_and(|n| n.to_str() == Some(&options.config_filename))
182        {
183            continue;
184        }
185
186        // Check ignore/force patterns
187        let is_ignored = should_ignore(path, root, &options.ignore);
188        let is_forced = should_force_index(path, root, &options.force_index);
189
190        if is_ignored && !is_forced {
191            continue;
192        }
193
194        // Optional BIDS validation
195        if !is_forced && options.validate && !is_bids_valid(path, root) {
196            continue;
197        }
198
199        // Handle symlinks that point to directories (treat as dirs, skip)
200        if path.is_dir() {
201            continue;
202        }
203
204        // Handle .zarr directories as files
205        let path_str_raw = path.to_string_lossy();
206        if path_str_raw.contains(".zarr/") {
207            continue; // Skip files inside .zarr directories
208        }
209
210        index_single_file(path, db, all_entities)?;
211    }
212
213    Ok(())
214}
215
216/// Index .zarr directories as single file entries.
217fn index_zarr_dirs(
218    root: &Path,
219    db: &Database,
220    entities: &[Entity],
221    _ignore: &[Regex],
222    _force: &[Regex],
223) -> Result<()> {
224    for entry in WalkDir::new(root)
225        .follow_links(true)
226        .into_iter()
227        .filter_map(std::result::Result::ok)
228    {
229        let path = entry.path();
230        if entry.file_type().is_dir()
231            && let Some(ext) = path.extension()
232            && ext == "zarr"
233        {
234            index_single_file(path, db, entities)?;
235        }
236    }
237    Ok(())
238}
239
/// Decide whether `path` sits at a location this indexer accepts as BIDS.
///
/// Returns `false` when `path` is not located under `root`. A path directly
/// in `root` (its relative form contains neither `/` nor `\`) is always
/// accepted; any deeper path is accepted only when its first component below
/// `root` starts with `sub-`.
fn is_bids_valid(path: &Path, root: &Path) -> bool {
    let Ok(relative) = path.strip_prefix(root) else {
        return false;
    };

    // No separator of either flavour means the entry lives directly in root.
    let is_nested = relative
        .to_string_lossy()
        .chars()
        .any(|ch| ch == '/' || ch == '\\');
    if !is_nested {
        return true;
    }

    // A first component that is missing or not valid UTF-8 is rejected.
    match relative.components().next() {
        Some(top) => top
            .as_os_str()
            .to_str()
            .is_some_and(|name| name.starts_with("sub-")),
        None => false,
    }
}
261
262/// Index metadata from JSON sidecar files.
263fn index_metadata(root: &Path, db: &Database) -> Result<()> {
264    let all_paths = db.all_file_paths()?;
265
266    // Separate JSON files and data files
267    let mut json_files: HashSet<PathBuf> = HashSet::new();
268    let mut data_files: Vec<String> = Vec::new();
269
270    for path_str in &all_paths {
271        let path = PathBuf::from(path_str);
272        if path.extension().is_some_and(|e| e == "json") {
273            json_files.insert(path);
274        } else {
275            data_files.push(path_str.clone());
276        }
277    }
278
279    // Build existing tags to avoid duplicates and detect conflicts
280    let mut existing_tags: HashMap<String, String> = HashMap::new();
281    for path_str in &all_paths {
282        let tags = db.get_tags(path_str)?;
283        for (entity_name, value, _, _) in &tags {
284            existing_tags.insert(format!("{path_str}_{entity_name}"), value.clone());
285        }
286    }
287
288    let mut seen_assocs: HashSet<String> = HashSet::new();
289
290    for data_path_str in &data_files {
291        let data_path = PathBuf::from(data_path_str);
292        let data_tags = db.get_tags(data_path_str)?;
293
294        let suffix = data_tags
295            .iter()
296            .find(|(n, _, _, _)| n == "suffix")
297            .map(|(_, v, _, _)| v.clone());
298        let extension = data_tags
299            .iter()
300            .find(|(n, _, _, _)| n == "extension")
301            .map(|(_, v, _, _)| v.clone());
302
303        let suffix = match suffix {
304            Some(s) => s,
305            None => continue,
306        };
307
308        let data_entities: HashMap<String, String> = data_tags
309            .iter()
310            .filter(|(n, _, _, _)| n != "suffix" && n != "extension")
311            .map(|(n, v, _, _)| (n.clone(), v.clone()))
312            .collect();
313
314        // Walk up directory tree finding matching JSON sidecars
315        let mut dir = data_path.parent();
316        let mut sidecar_stack: Vec<PathBuf> = Vec::new();
317
318        while let Some(current_dir) = dir {
319            for json_path in &json_files {
320                if json_path.parent() != Some(current_dir) {
321                    continue;
322                }
323
324                let json_stem = json_path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
325                let json_suffix = json_stem.rsplit('_').next().unwrap_or("");
326                if json_suffix != suffix {
327                    continue;
328                }
329
330                let json_entities = extract_kv_pairs(json_stem);
331                let all_match = json_entities
332                    .iter()
333                    .all(|(k, v)| data_entities.get(k).is_none_or(|dv| dv == v));
334
335                if all_match {
336                    sidecar_stack.push(json_path.clone());
337
338                    let assoc_key =
339                        format!("{}#{}#Metadata", json_path.to_string_lossy(), data_path_str);
340                    if seen_assocs.insert(assoc_key) {
341                        db.insert_association(
342                            &json_path.to_string_lossy(),
343                            data_path_str,
344                            "Metadata",
345                        )?;
346                    }
347                }
348            }
349
350            if current_dir == root {
351                break;
352            }
353            dir = current_dir.parent();
354        }
355
356        // Create parent/child chain for JSON inheritance
357        for i in 0..sidecar_stack.len() {
358            if i + 1 < sidecar_stack.len() {
359                let src = sidecar_stack[i].to_string_lossy().to_string();
360                let dst = sidecar_stack[i + 1].to_string_lossy().to_string();
361                let key1 = format!("{src}#{dst}#Child");
362                if seen_assocs.insert(key1) {
363                    db.insert_association(&src, &dst, "Child")?;
364                    db.insert_association(&dst, &src, "Parent")?;
365                }
366            }
367        }
368
369        // Merge sidecars: least specific first
370        sidecar_stack.reverse();
371        let mut merged_metadata: indexmap::IndexMap<String, serde_json::Value> =
372            indexmap::IndexMap::new();
373        for sidecar_path in &sidecar_stack {
374            if let Ok(md) = read_json_sidecar(sidecar_path) {
375                for (k, v) in md {
376                    merged_metadata.insert(k, v);
377                }
378            }
379        }
380
381        // Write metadata tags, checking for conflicts
382        for (key, value) in &merged_metadata {
383            if value.is_null() {
384                continue;
385            }
386
387            let tag_key = format!("{data_path_str}_{key}");
388            let val_str = match value {
389                serde_json::Value::String(s) => s.clone(),
390                other => other.to_string(),
391            };
392
393            if let Some(existing_val) = existing_tags.get(&tag_key) {
394                if *existing_val != val_str {
395                    log::warn!(
396                        "conflicting metadata for '{key}' on {data_path_str}: '{existing_val}' vs '{val_str}'"
397                    );
398                }
399                continue;
400            }
401            db.insert_tag(data_path_str, key, &val_str, "json", true)?;
402        }
403
404        // Handle IntendedFor
405        if let Some(intended) = merged_metadata.get("IntendedFor") {
406            let subject = data_entities.get("subject").cloned().unwrap_or_default();
407            index_intended_for(db, data_path_str, intended, root, &subject)?;
408        }
409
410        // Link companion files (events↔bold, bvec/bval↔DWI)
411        index_companion_associations(
412            db,
413            data_path_str,
414            &suffix,
415            extension.as_deref(),
416            &data_entities,
417        )?;
418    }
419
420    Ok(())
421}
422
423/// Resolve and record IntendedFor associations from metadata.
424fn index_intended_for(
425    db: &Database,
426    data_path: &str,
427    intended: &serde_json::Value,
428    root: &Path,
429    subject: &str,
430) -> Result<()> {
431    let intents: Vec<&str> = match intended {
432        serde_json::Value::String(s) => vec![s.as_str()],
433        serde_json::Value::Array(arr) => arr.iter().filter_map(|v| v.as_str()).collect(),
434        _ => vec![],
435    };
436
437    for intent in intents {
438        if let Some(target) = bids_validate::resolve_intended_for(intent, root, subject) {
439            let target_str = target.to_string_lossy();
440            db.insert_association(data_path, &target_str, "IntendedFor")?;
441            db.insert_association(&target_str, data_path, "InformedBy")?;
442        }
443    }
444    Ok(())
445}
446
447/// Link companion files (events/physio/stim/sbref ↔ bold/eeg, bvec/bval ↔ DWI).
448fn index_companion_associations(
449    db: &Database,
450    data_path: &str,
451    suffix: &str,
452    extension: Option<&str>,
453    data_entities: &HashMap<String, String>,
454) -> Result<()> {
455    if extension.is_none() {
456        return Ok(());
457    }
458
459    if matches!(suffix, "events" | "physio" | "stim" | "sbref") {
460        let mut filters: Vec<(String, Vec<String>, bool)> = data_entities
461            .iter()
462            .filter(|(k, _)| matches!(k.as_str(), "subject" | "session" | "task" | "run"))
463            .map(|(k, v)| (k.clone(), vec![v.clone()], false))
464            .collect();
465        filters.push(("suffix".into(), vec!["bold".into(), "eeg".into()], false));
466
467        if let Ok(images) = db.query_files(&filters) {
468            for img in &images {
469                db.insert_association(data_path, img, "IntendedFor")?;
470                db.insert_association(img, data_path, "InformedBy")?;
471            }
472        }
473    }
474
475    if suffix == "dwi" && matches!(extension, Some(".bvec" | ".bval")) {
476        let mut filters: Vec<(String, Vec<String>, bool)> = data_entities
477            .iter()
478            .filter(|(k, _)| matches!(k.as_str(), "subject" | "session" | "run" | "acquisition"))
479            .map(|(k, v)| (k.clone(), vec![v.clone()], false))
480            .collect();
481        filters.push(("suffix".into(), vec!["dwi".into()], false));
482        filters.push((
483            "extension".into(),
484            vec![".nii".into(), ".nii.gz".into()],
485            false,
486        ));
487
488        if let Ok(images) = db.query_files(&filters) {
489            for img in &images {
490                db.insert_association(data_path, img, "IntendedFor")?;
491                db.insert_association(img, data_path, "InformedBy")?;
492            }
493        }
494    }
495
496    Ok(())
497}
498
/// Parse the `key-value` segments of a BIDS filename stem.
///
/// The stem is split on `_`; segments without a `-` (such as the trailing
/// suffix) are dropped and the rest are split at their first `-`. A handful
/// of short keys are expanded to long entity names (`sub` → `subject`,
/// `ses` → `session`, …); any other key is returned as written. Pairs come
/// back in the order they appear in the stem.
fn extract_kv_pairs(stem: &str) -> Vec<(String, String)> {
    stem.split('_')
        .filter_map(|segment| segment.split_once('-'))
        .map(|(short_key, value)| {
            let long_name = match short_key {
                "sub" => "subject",
                "ses" => "session",
                "acq" => "acquisition",
                "ce" => "ceagent",
                "rec" => "reconstruction",
                "dir" => "direction",
                "mod" => "modality",
                "trc" => "tracer",
                unmapped => unmapped,
            };
            (long_name.to_owned(), value.to_owned())
        })
        .collect()
}