Skip to main content

bids_layout/
db.rs

1//! SQLite database backend for the BIDS file index.
2//!
3//! Stores files, entity tags, metadata, and file associations in a SQLite
4//! database (in-memory or on-disk). Supports REGEXP queries via a custom
5//! function and transactional bulk inserts for fast indexing.
6
7use bids_core::error::{BidsError, Result};
8use bids_core::file::BidsFile;
9use rusqlite::{Connection, functions::FunctionFlags, params};
10use std::path::Path;
11
12/// Convert a `rusqlite::Error` into a `BidsError::Database`.
13fn db_err(e: rusqlite::Error) -> BidsError {
14    BidsError::Database(e.to_string())
15}
16
17/// Manages the SQLite database backing a [`BidsLayout`](crate::BidsLayout) index.
18///
19/// Stores the complete file index for a BIDS dataset, including:
20/// - **Files** — Path, filename, directory, file type for every indexed file
21/// - **Tags** — Entity-value pairs (subject=01, task=rest, etc.) and metadata
22///   key-value pairs from JSON sidecars
23/// - **Associations** — Relationships between files (IntendedFor, Metadata
24///   inheritance parent/child, InformedBy)
25/// - **Layout info** — Root path and config names for database reloading
26///
27/// The database supports both in-memory operation (fast, ephemeral) and
28/// on-disk persistence (for caching large dataset indices). It registers
29/// a custom `REGEXP` function for SQLite to support regex-based queries.
30///
31/// # Schema
32///
33/// ```sql
34/// CREATE TABLE files (path TEXT PRIMARY KEY, filename TEXT, dirname TEXT, ...);
35/// CREATE TABLE tags (file_path TEXT, entity_name TEXT, value TEXT, ...);
36/// CREATE TABLE associations (src TEXT, dst TEXT, kind TEXT, ...);
37/// CREATE TABLE layout_info (root TEXT PRIMARY KEY, config TEXT, ...);
38/// ```
39pub struct Database {
40    conn: Connection,
41}
42
43impl Database {
44    /// Create a new in-memory database.
45    pub fn in_memory() -> Result<Self> {
46        let conn = Connection::open_in_memory().map_err(db_err)?;
47        let db = Self { conn };
48        db.register_regexp()?;
49        db.create_tables()?;
50        Ok(db)
51    }
52
53    /// Open or create a database at the given path.
54    pub fn open(path: &Path) -> Result<Self> {
55        if let Some(parent) = path.parent() {
56            std::fs::create_dir_all(parent)?;
57        }
58        let conn = Connection::open(path).map_err(db_err)?;
59        let db = Self { conn };
60        db.register_regexp()?;
61        db.create_tables()?;
62        Ok(db)
63    }
64
65    /// Check if a database file exists.
66    pub fn exists(path: &Path) -> bool {
67        path.exists()
68    }
69
70    /// Register REGEXP function for SQLite.
71    ///
72    /// Caches compiled regexes so the same pattern is only compiled once per
73    /// query, not once per row. Uses SQLite's auxiliary data API for O(1) reuse.
74    fn register_regexp(&self) -> Result<()> {
75        use std::cell::RefCell;
76        use std::collections::HashMap;
77
78        // Thread-local regex cache shared across all REGEXP invocations on this
79        // connection. Avoids recompiling the same pattern for every row.
80        thread_local! {
81            static CACHE: RefCell<HashMap<String, regex::Regex>> = RefCell::new(HashMap::new());
82        }
83
84        self.conn
85            .create_scalar_function(
86                "regexp",
87                2,
88                FunctionFlags::SQLITE_UTF8 | FunctionFlags::SQLITE_DETERMINISTIC,
89                |ctx| {
90                    let pattern = ctx
91                        .get_raw(0)
92                        .as_str()
93                        .map_err(|e| rusqlite::Error::UserFunctionError(Box::new(e)))?;
94                    let value = ctx
95                        .get_raw(1)
96                        .as_str()
97                        .map_err(|e| rusqlite::Error::UserFunctionError(Box::new(e)))?;
98                    CACHE.with(|cache| {
99                        let mut cache = cache.borrow_mut();
100                        if let Some(re) = cache.get(pattern) {
101                            return Ok(re.is_match(value));
102                        }
103                        let re = regex::Regex::new(pattern)
104                            .map_err(|e| rusqlite::Error::UserFunctionError(Box::new(e)))?;
105                        let result = re.is_match(value);
106                        cache.insert(pattern.to_string(), re);
107                        Ok(result)
108                    })
109                },
110            )
111            .map_err(db_err)?;
112        Ok(())
113    }
114
115    fn create_tables(&self) -> Result<()> {
116        self.conn
117            .execute_batch(
118                "CREATE TABLE IF NOT EXISTS files (
119                path TEXT PRIMARY KEY,
120                filename TEXT NOT NULL,
121                dirname TEXT NOT NULL,
122                is_dir INTEGER NOT NULL DEFAULT 0,
123                file_type TEXT NOT NULL DEFAULT 'Generic'
124            );
125            CREATE TABLE IF NOT EXISTS tags (
126                file_path TEXT NOT NULL,
127                entity_name TEXT NOT NULL,
128                value TEXT NOT NULL,
129                dtype TEXT NOT NULL DEFAULT 'str',
130                is_metadata INTEGER NOT NULL DEFAULT 0,
131                PRIMARY KEY (file_path, entity_name)
132            );
133            CREATE TABLE IF NOT EXISTS associations (
134                src TEXT NOT NULL,
135                dst TEXT NOT NULL,
136                kind TEXT NOT NULL,
137                PRIMARY KEY (src, dst, kind)
138            );
139            CREATE TABLE IF NOT EXISTS layout_info (
140                root TEXT PRIMARY KEY,
141                config TEXT,
142                derivatives TEXT
143            );
144            CREATE INDEX IF NOT EXISTS idx_tags_file ON tags(file_path);
145            CREATE INDEX IF NOT EXISTS idx_tags_entity ON tags(entity_name);
146            CREATE INDEX IF NOT EXISTS idx_tags_value ON tags(value);
147            CREATE INDEX IF NOT EXISTS idx_assoc_src ON associations(src);
148            CREATE INDEX IF NOT EXISTS idx_assoc_dst ON associations(dst);
149            ",
150            )
151            .map_err(db_err)?;
152        Ok(())
153    }
154
155    /// Begin an explicit transaction for bulk operations.
156    ///
157    /// Wrapping many inserts in a single transaction avoids per-statement
158    /// fsyncs, giving ~100× better insert throughput on large datasets.
159    pub fn begin_transaction(&self) -> Result<()> {
160        self.conn
161            .execute_batch("BEGIN TRANSACTION")
162            .map_err(db_err)?;
163        Ok(())
164    }
165
166    /// Commit the current transaction.
167    pub fn commit_transaction(&self) -> Result<()> {
168        self.conn.execute_batch("COMMIT").map_err(db_err)?;
169        Ok(())
170    }
171
172    /// Roll back the current transaction.
173    pub fn rollback_transaction(&self) -> Result<()> {
174        self.conn.execute_batch("ROLLBACK").map_err(db_err)?;
175        Ok(())
176    }
177
178    /// Insert a file into the database.
179    pub fn insert_file(&self, file: &BidsFile) -> Result<()> {
180        self.conn
181            .execute(
182                "INSERT OR REPLACE INTO files (path, filename, dirname, is_dir, file_type)
183             VALUES (?1, ?2, ?3, ?4, ?5)",
184                params![
185                    file.path.to_string_lossy().as_ref(),
186                    file.filename,
187                    file.dirname.to_string_lossy().as_ref(),
188                    file.is_dir as i32,
189                    format!("{:?}", file.file_type),
190                ],
191            )
192            .map_err(db_err)?;
193        Ok(())
194    }
195
196    /// Insert a tag (entity-value pair for a file).
197    pub fn insert_tag(
198        &self,
199        file_path: &str,
200        entity_name: &str,
201        value: &str,
202        dtype: &str,
203        is_metadata: bool,
204    ) -> Result<()> {
205        self.conn
206            .execute(
207                "INSERT OR REPLACE INTO tags (file_path, entity_name, value, dtype, is_metadata)
208             VALUES (?1, ?2, ?3, ?4, ?5)",
209                params![file_path, entity_name, value, dtype, is_metadata as i32],
210            )
211            .map_err(db_err)?;
212        Ok(())
213    }
214
215    /// Insert a file association.
216    pub fn insert_association(&self, src: &str, dst: &str, kind: &str) -> Result<()> {
217        self.conn
218            .execute(
219                "INSERT OR IGNORE INTO associations (src, dst, kind) VALUES (?1, ?2, ?3)",
220                params![src, dst, kind],
221            )
222            .map_err(db_err)?;
223        Ok(())
224    }
225
226    /// Store layout info.
227    pub fn set_layout_info(&self, root: &str, config: &str) -> Result<()> {
228        self.conn
229            .execute(
230                "INSERT OR REPLACE INTO layout_info (root, config) VALUES (?1, ?2)",
231                params![root, config],
232            )
233            .map_err(db_err)?;
234        Ok(())
235    }
236
237    /// Get layout info (root, config).
238    pub fn get_layout_info(&self) -> Result<Option<(String, String)>> {
239        let mut stmt = self
240            .conn
241            .prepare("SELECT root, config FROM layout_info LIMIT 1")
242            .map_err(db_err)?;
243
244        let result = stmt
245            .query_row([], |row| {
246                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
247            })
248            .ok();
249        Ok(result)
250    }
251
252    /// Query all file paths.
253    pub fn all_file_paths(&self) -> Result<Vec<String>> {
254        let mut stmt = self
255            .conn
256            .prepare("SELECT path FROM files WHERE is_dir = 0")
257            .map_err(db_err)?;
258        let paths: std::result::Result<Vec<String>, _> = stmt
259            .query_map([], |row| row.get(0))
260            .map_err(db_err)?
261            .collect();
262        paths.map_err(db_err)
263    }
264
265    /// Get tags for a specific file.
266    pub fn get_tags(&self, file_path: &str) -> Result<Vec<(String, String, String, bool)>> {
267        let mut stmt = self
268            .conn
269            .prepare("SELECT entity_name, value, dtype, is_metadata FROM tags WHERE file_path = ?1")
270            .map_err(db_err)?;
271        let tags: std::result::Result<Vec<_>, _> = stmt
272            .query_map(params![file_path], |row| {
273                Ok((
274                    row.get::<_, String>(0)?,
275                    row.get::<_, String>(1)?,
276                    row.get::<_, String>(2)?,
277                    row.get::<_, bool>(3)?,
278                ))
279            })
280            .map_err(db_err)?
281            .collect();
282        tags.map_err(db_err)
283    }
284
285    /// Get all unique values for a given entity.
286    pub fn get_unique_entity_values(&self, entity_name: &str) -> Result<Vec<String>> {
287        let mut stmt = self
288            .conn
289            .prepare("SELECT DISTINCT value FROM tags WHERE entity_name = ?1 ORDER BY value")
290            .map_err(db_err)?;
291        let values: std::result::Result<Vec<String>, _> = stmt
292            .query_map(params![entity_name], |row| row.get(0))
293            .map_err(db_err)?
294            .collect();
295        values.map_err(db_err)
296    }
297
298    /// Get all unique entity names.
299    pub fn get_entity_names(&self) -> Result<Vec<String>> {
300        let mut stmt = self
301            .conn
302            .prepare("SELECT DISTINCT entity_name FROM tags ORDER BY entity_name")
303            .map_err(db_err)?;
304        let names: std::result::Result<Vec<String>, _> = stmt
305            .query_map([], |row| row.get(0))
306            .map_err(db_err)?
307            .collect();
308        names.map_err(db_err)
309    }
310
311    /// Query files with advanced filtering including Query::None/Any and regex.
312    ///
313    /// Filter types:
314    /// - Normal: (entity, values, false) — entity must match one of values
315    /// - Regex: `(entity, [pattern], true)` — entity must match regex
316    /// - Query::None: (entity, ["__NONE__"], false) — entity must NOT exist
317    /// - Query::Any: (entity, ["__ANY__"], false) — entity must exist (any value)
318    pub fn query_files(&self, filters: &[(String, Vec<String>, bool)]) -> Result<Vec<String>> {
319        if filters.is_empty() {
320            return self.all_file_paths();
321        }
322
323        let mut sql = String::from("SELECT f.path FROM files f WHERE f.is_dir = 0");
324        let mut bind_values: Vec<String> = Vec::new();
325
326        for (i, (entity_name, values, is_regex)) in filters.iter().enumerate() {
327            use std::fmt::Write;
328
329            // Check for special Query types
330            if values.len() == 1 {
331                match values[0].as_str() {
332                    "__NONE__" => {
333                        write!(sql, " AND NOT EXISTS (SELECT 1 FROM tags t{i} WHERE t{i}.file_path = f.path AND t{i}.entity_name = ?)").unwrap();
334                        bind_values.push(entity_name.clone());
335                        continue;
336                    }
337                    "__ANY__" => {
338                        write!(sql, " AND EXISTS (SELECT 1 FROM tags t{i} WHERE t{i}.file_path = f.path AND t{i}.entity_name = ?)").unwrap();
339                        bind_values.push(entity_name.clone());
340                        continue;
341                    }
342                    "__OPTIONAL__" => continue,
343                    _ => {}
344                }
345            }
346
347            if *is_regex && !values.is_empty() {
348                write!(sql, " AND EXISTS (SELECT 1 FROM tags t{i} WHERE t{i}.file_path = f.path AND t{i}.entity_name = ? AND t{i}.value REGEXP ?)").unwrap();
349                bind_values.push(entity_name.clone());
350                bind_values.push(values[0].clone());
351            } else if values.len() == 1 {
352                write!(sql, " AND EXISTS (SELECT 1 FROM tags t{i} WHERE t{i}.file_path = f.path AND t{i}.entity_name = ? AND t{i}.value = ?)").unwrap();
353                bind_values.push(entity_name.clone());
354                bind_values.push(values[0].clone());
355            } else if !values.is_empty() {
356                let placeholders = "?,".repeat(values.len());
357                let placeholders = &placeholders[..placeholders.len() - 1];
358                write!(sql, " AND EXISTS (SELECT 1 FROM tags t{i} WHERE t{i}.file_path = f.path AND t{i}.entity_name = ? AND t{i}.value IN ({placeholders}))").unwrap();
359                bind_values.push(entity_name.clone());
360                bind_values.extend(values.iter().cloned());
361            }
362        }
363
364        sql.push_str(" ORDER BY f.path");
365
366        let mut stmt = self.conn.prepare(&sql).map_err(db_err)?;
367        let params: Vec<&dyn rusqlite::types::ToSql> = bind_values
368            .iter()
369            .map(|v| v as &dyn rusqlite::types::ToSql)
370            .collect();
371        let paths: std::result::Result<Vec<String>, _> = stmt
372            .query_map(params.as_slice(), |row| row.get::<_, String>(0))
373            .map_err(db_err)?
374            .collect();
375        paths.map_err(db_err)
376    }
377
378    /// Get distinct directories for files matching filters and having a target entity.
379    pub fn query_directories(
380        &self,
381        target_entity: &str,
382        filters: &[(String, Vec<String>, bool)],
383    ) -> Result<Vec<String>> {
384        let paths = self.query_files(filters)?;
385        let mut dirs = std::collections::BTreeSet::new();
386        for path_str in &paths {
387            // Check if the file has the target entity
388            let tags = self.get_tags(path_str)?;
389            if tags.iter().any(|(n, _, _, _)| n == target_entity)
390                && let Some(parent) = std::path::Path::new(path_str).parent()
391            {
392                dirs.insert(parent.to_string_lossy().to_string());
393            }
394        }
395        Ok(dirs.into_iter().collect())
396    }
397
398    /// Get associated files for a given source file.
399    pub fn get_associations(&self, src: &str, kind: Option<&str>) -> Result<Vec<(String, String)>> {
400        let (sql, params_vec): (String, Vec<String>) = if let Some(k) = kind {
401            (
402                "SELECT dst, kind FROM associations WHERE src = ?1 AND kind = ?2".to_string(),
403                vec![src.to_string(), k.to_string()],
404            )
405        } else {
406            (
407                "SELECT dst, kind FROM associations WHERE src = ?1".to_string(),
408                vec![src.to_string()],
409            )
410        };
411
412        let mut stmt = self.conn.prepare(&sql).map_err(db_err)?;
413        let params: Vec<&dyn rusqlite::types::ToSql> = params_vec
414            .iter()
415            .map(|v| v as &dyn rusqlite::types::ToSql)
416            .collect();
417        let assocs: std::result::Result<Vec<_>, _> = stmt
418            .query_map(params.as_slice(), |row| {
419                Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
420            })
421            .map_err(db_err)?
422            .collect();
423        assocs.map_err(db_err)
424    }
425
426    /// Get the total number of indexed files.
427    pub fn file_count(&self) -> Result<usize> {
428        let mut stmt = self
429            .conn
430            .prepare("SELECT COUNT(*) FROM files WHERE is_dir = 0")
431            .map_err(db_err)?;
432        let count: i64 = stmt.query_row([], |row| row.get(0)).map_err(db_err)?;
433        Ok(count as usize)
434    }
435
436    /// Save the current in-memory database to a file.
437    pub fn save_to(&self, path: &Path) -> Result<()> {
438        if let Some(parent) = path.parent() {
439            std::fs::create_dir_all(parent)?;
440        }
441        let mut backup = Connection::open(path).map_err(db_err)?;
442        let b = rusqlite::backup::Backup::new(&self.conn, &mut backup).map_err(db_err)?;
443        b.step(-1).map_err(db_err)?;
444        Ok(())
445    }
446}
447
448#[cfg(test)]
449mod tests {
450    use super::*;
451    use bids_core::file::BidsFile;
452
453    #[test]
454    fn test_database_operations() {
455        let db = Database::in_memory().unwrap();
456        let file = BidsFile::new("/data/sub-01/eeg/sub-01_task-rest_eeg.edf");
457        db.insert_file(&file).unwrap();
458        db.insert_tag(
459            "/data/sub-01/eeg/sub-01_task-rest_eeg.edf",
460            "subject",
461            "01",
462            "str",
463            false,
464        )
465        .unwrap();
466        db.insert_tag(
467            "/data/sub-01/eeg/sub-01_task-rest_eeg.edf",
468            "task",
469            "rest",
470            "str",
471            false,
472        )
473        .unwrap();
474
475        let paths = db.all_file_paths().unwrap();
476        assert_eq!(paths.len(), 1);
477
478        let subjects = db.get_unique_entity_values("subject").unwrap();
479        assert_eq!(subjects, vec!["01"]);
480
481        let filters = vec![("subject".to_string(), vec!["01".to_string()], false)];
482        let results = db.query_files(&filters).unwrap();
483        assert_eq!(results.len(), 1);
484    }
485
486    #[test]
487    fn test_query_none_any() {
488        let db = Database::in_memory().unwrap();
489        let f1 = BidsFile::new("/data/sub-01_task-rest_eeg.edf");
490        let f2 = BidsFile::new("/data/sub-02_eeg.edf");
491        db.insert_file(&f1).unwrap();
492        db.insert_file(&f2).unwrap();
493        db.insert_tag(
494            "/data/sub-01_task-rest_eeg.edf",
495            "subject",
496            "01",
497            "str",
498            false,
499        )
500        .unwrap();
501        db.insert_tag(
502            "/data/sub-01_task-rest_eeg.edf",
503            "task",
504            "rest",
505            "str",
506            false,
507        )
508        .unwrap();
509        db.insert_tag("/data/sub-02_eeg.edf", "subject", "02", "str", false)
510            .unwrap();
511
512        // Query::Any — task must exist
513        let filters = vec![("task".to_string(), vec!["__ANY__".to_string()], false)];
514        let results = db.query_files(&filters).unwrap();
515        assert_eq!(results.len(), 1);
516        assert!(results[0].contains("sub-01"));
517
518        // Query::None — task must NOT exist
519        let filters = vec![("task".to_string(), vec!["__NONE__".to_string()], false)];
520        let results = db.query_files(&filters).unwrap();
521        assert_eq!(results.len(), 1);
522        assert!(results[0].contains("sub-02"));
523    }
524
525    #[test]
526    fn test_transactions() {
527        let db = Database::in_memory().unwrap();
528        db.begin_transaction().unwrap();
529
530        for i in 0..100 {
531            let path = format!("/data/sub-{:03}_eeg.edf", i);
532            let f = BidsFile::new(&path);
533            db.insert_file(&f).unwrap();
534            db.insert_tag(&path, "subject", &format!("{:03}", i), "str", false)
535                .unwrap();
536        }
537
538        db.commit_transaction().unwrap();
539
540        assert_eq!(db.file_count().unwrap(), 100);
541        let subjects = db.get_unique_entity_values("subject").unwrap();
542        assert_eq!(subjects.len(), 100);
543    }
544
545    #[test]
546    fn test_transaction_rollback() {
547        let db = Database::in_memory().unwrap();
548
549        // Insert one file outside transaction
550        let f = BidsFile::new("/data/sub-01_eeg.edf");
551        db.insert_file(&f).unwrap();
552        assert_eq!(db.file_count().unwrap(), 1);
553
554        // Start transaction, insert more, rollback
555        db.begin_transaction().unwrap();
556        let f2 = BidsFile::new("/data/sub-02_eeg.edf");
557        db.insert_file(&f2).unwrap();
558        db.rollback_transaction().unwrap();
559
560        // Should only have the first file
561        assert_eq!(db.file_count().unwrap(), 1);
562    }
563
564    #[test]
565    fn test_regexp_query() {
566        let db = Database::in_memory().unwrap();
567        let f1 = BidsFile::new("/data/sub-01_eeg.edf");
568        let f2 = BidsFile::new("/data/sub-02_eeg.edf");
569        db.insert_file(&f1).unwrap();
570        db.insert_file(&f2).unwrap();
571        db.insert_tag("/data/sub-01_eeg.edf", "subject", "01", "str", false)
572            .unwrap();
573        db.insert_tag("/data/sub-02_eeg.edf", "subject", "02", "str", false)
574            .unwrap();
575
576        let filters = vec![("subject".to_string(), vec!["0[12]".to_string()], true)];
577        let results = db.query_files(&filters).unwrap();
578        assert_eq!(results.len(), 2);
579    }
580}