Skip to main content

mago_database/
lib.rs

1//! High-performance file database for PHP projects.
2//!
3//! This crate provides an efficient in-memory database for managing collections of PHP source files.
4//! It offers two complementary database types optimized for different access patterns:
5//!
6//! - [`Database`]: Mutable builder optimized for modifications (add, update, delete)
7//! - [`ReadDatabase`]: Immutable snapshot optimized for high-performance reads
8//!
9//! # Architecture
10//!
11//! The database uses a two-phase approach:
12//!
13//! 1. **Build Phase**: Use [`Database`] to load files, make modifications, and track changes
14//! 2. **Query Phase**: Convert to [`ReadDatabase`] via [`Database::read_only`] for fast lookups
15//!
16//! # Key Features
17//!
18//! - **Fast Lookups**: O(1) average-time access by ID, name, or filesystem path
19//! - **Change Tracking**: Record and batch apply file modifications via [`ChangeLog`]
20//! - **Deterministic Iteration**: [`ReadDatabase`] guarantees consistent iteration order
21//! - **Parallel Operations**: Concurrent file I/O and processing support
22//! - **Type Safety**: Strong typing with stable [`FileId`] handles
23//!
24//! # Common Workflow
25//!
26//! ## Loading Files
27//!
28//! Use [`loader::DatabaseLoader`] to scan a project directory:
29//!
30//! The loader handles file discovery, exclusion patterns, and parallel loading.
31//!
32//! ## Querying Files
33//!
34//! Both database types implement [`DatabaseReader`] for uniform access:
35//!
36//! ## Modifying Files
37//!
38//! Use [`ChangeLog`] to batch modifications:
39//!
40//! Changes can be applied to the database and optionally written to disk in parallel.
41//!
42//! # Performance Characteristics
43//!
44//! ## Database (Mutable)
45//!
46//! - Add/Update/Delete: O(1) average
//! - Lookup by ID/name: O(1) average; lookup by path: O(n) linear scan
48//! - Iteration: Unordered
49//! - Memory: ~2x file count (maps for bidirectional lookup)
50//!
51//! ## `ReadDatabase` (Immutable)
52//!
53//! - Creation: O(n log n) for sorting
54//! - Lookup by ID/name/path: O(1) average
55//! - Iteration: Deterministic, sorted by `FileId`
56//! - Memory: ~3x file count (vector + 3 index maps)
57//!
58//! # Thread Safety
59//!
60//! [`Database`] is not thread-safe and should be used from a single thread during construction.
61//! [`ReadDatabase`] can be freely shared across threads for concurrent read access.
62
63use std::borrow::Cow;
64use std::path::Path;
65use std::path::PathBuf;
66use std::sync::Arc;
67
68use foldhash::HashMap;
69use foldhash::HashMapExt;
70use rayon::iter::IntoParallelIterator;
71use rayon::iter::ParallelIterator;
72use serde::Deserialize;
73use serde::Serialize;
74
75use crate::change::Change;
76use crate::change::ChangeLog;
77use crate::error::DatabaseError;
78use crate::exclusion::Exclusion;
79use crate::file::File;
80use crate::file::FileId;
81use crate::file::FileType;
82use crate::file::line_starts;
83use crate::operation::FilesystemOperation;
84
85mod utils;
86
87pub mod change;
88pub mod error;
89pub mod exclusion;
90pub mod file;
91pub mod loader;
92pub mod matcher;
93pub mod watcher;
94
95mod operation;
96
/// Configuration for database loading and watching.
///
/// String and path fields are stored as [`Cow`], so a configuration can either
/// borrow its data or own it (see [`DatabaseConfiguration::into_static`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatabaseConfiguration<'a> {
    /// Workspace directory. Relative path exclusions passed to
    /// [`DatabaseConfiguration::new`] are resolved against it.
    pub workspace: Cow<'a, Path>,
    /// Paths or glob patterns for source files.
    /// Can be directory paths (e.g., "src") or glob patterns (e.g., "src/**/*.php")
    pub paths: Vec<Cow<'a, str>>,
    /// Paths or glob patterns for included files.
    /// Can be directory paths (e.g., "vendor") or glob patterns (e.g., "vendor/**/*.php")
    pub includes: Vec<Cow<'a, str>>,
    /// Exclusions, each one either a filesystem path or a pattern.
    pub excludes: Vec<Exclusion<'a>>,
    /// File extensions; presumably the extensions of files to load (confirm against the loader).
    pub extensions: Vec<Cow<'a, str>>,
    /// Settings for glob pattern matching behavior.
    pub glob: GlobSettings,
}
112
/// Settings for glob pattern matching behavior.
///
/// All defaults match the `globset` crate defaults for backwards compatibility.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct GlobSettings {
    /// Match patterns case-insensitively.
    ///
    /// Default: `false`.
    pub case_insensitive: bool,
    /// When `true`, a single `*` does not match path separators (`/`).
    /// This makes `src/*/Test` match only `src/foo/Test`, not `src/foo/bar/Test`.
    /// Use `**` for recursive matching.
    ///
    /// Default: `false`.
    pub literal_separator: bool,
    /// Whether `\` escapes special characters in patterns.
    ///
    /// Default: `true`, except on platforms where `\` is itself a path
    /// separator (e.g., Windows), where the default is `false`.
    pub backslash_escape: bool,
    /// Whether an empty case in alternates is allowed (e.g., `{,a}` matches `""` and `"a"`).
    ///
    /// Default: `false`.
    pub empty_alternates: bool,
}
137
138impl Default for GlobSettings {
139    fn default() -> Self {
140        Self {
141            case_insensitive: false,
142            literal_separator: false,
143            backslash_escape: !std::path::is_separator('\\'),
144            empty_alternates: false,
145        }
146    }
147}
148
149impl<'a> DatabaseConfiguration<'a> {
150    pub fn new(
151        workspace: &'a Path,
152        paths: Vec<&'a str>,
153        includes: Vec<&'a str>,
154        excludes: Vec<Exclusion<'a>>,
155        extensions: Vec<&'a str>,
156    ) -> Self {
157        let paths = paths.into_iter().map(Cow::Borrowed).collect();
158        let includes = includes.into_iter().map(Cow::Borrowed).collect();
159
160        let excludes = excludes
161            .into_iter()
162            .filter_map(|exclusion| match exclusion {
163                Exclusion::Path(p) => Some(if p.is_absolute() {
164                    Exclusion::Path(p)
165                } else {
166                    workspace.join(p).canonicalize().ok().map(Cow::Owned).map(Exclusion::Path)?
167                }),
168                Exclusion::Pattern(pat) => Some(Exclusion::Pattern(pat)),
169            })
170            .collect();
171
172        let extensions = extensions.into_iter().map(Cow::Borrowed).collect();
173
174        Self {
175            workspace: Cow::Borrowed(workspace),
176            paths,
177            includes,
178            excludes,
179            extensions,
180            glob: GlobSettings::default(),
181        }
182    }
183
184    #[must_use]
185    pub fn into_static(self) -> DatabaseConfiguration<'static> {
186        DatabaseConfiguration {
187            workspace: Cow::Owned(self.workspace.into_owned()),
188            paths: self.paths.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
189            includes: self.includes.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
190            excludes: self
191                .excludes
192                .into_iter()
193                .map(|e| match e {
194                    Exclusion::Path(p) => Exclusion::Path(Cow::Owned(p.into_owned())),
195                    Exclusion::Pattern(pat) => Exclusion::Pattern(Cow::Owned(pat.into_owned())),
196                })
197                .collect(),
198            extensions: self.extensions.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
199            glob: self.glob,
200        }
201    }
202}
203
/// Mutable database for managing project files with add/update/delete operations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Database<'a> {
    /// Files keyed by their logical name.
    files: HashMap<Cow<'static, str>, Arc<File>>,
    /// Reverse index from a file's ID to the name it is stored under in `files`.
    id_to_name: HashMap<FileId, Cow<'static, str>>,
    /// Configuration this database was created with.
    pub(crate) configuration: DatabaseConfiguration<'a>,
}
211
/// Immutable, read-optimized snapshot of the database.
///
/// Files live in a single vector; the three maps translate an ID, a logical
/// name, or a filesystem path into a position in that vector.
#[derive(Debug)]
pub struct ReadDatabase {
    /// All files in the snapshot; this order is the iteration order.
    files: Vec<Arc<File>>,
    /// Position in `files` for each file ID.
    id_to_index: HashMap<FileId, usize>,
    /// Position in `files` for each logical file name.
    name_to_index: HashMap<Cow<'static, str>, usize>,
    /// Position in `files` for each file that has a filesystem path.
    path_to_index: HashMap<PathBuf, usize>,
}
220
221impl<'a> Database<'a> {
222    #[must_use]
223    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
224        Self { files: HashMap::default(), id_to_name: HashMap::default(), configuration }
225    }
226
227    #[must_use]
228    pub fn single(file: File, configuration: DatabaseConfiguration<'a>) -> Self {
229        let mut db = Self::new(configuration);
230        db.add(file);
231        db
232    }
233
234    /// Reserves capacity for at least `additional` more files.
235    pub fn reserve(&mut self, additional: usize) {
236        self.files.reserve(additional);
237        self.id_to_name.reserve(additional);
238    }
239
240    pub fn add(&mut self, file: File) -> FileId {
241        let name = file.name.clone();
242        let id = file.id;
243
244        if let Some(old_file) = self.files.insert(name.clone(), Arc::new(file)) {
245            self.id_to_name.remove(&old_file.id);
246        }
247
248        self.id_to_name.insert(id, name);
249
250        id
251    }
252
253    /// Updates a file's content using its stable `FileId`.
254    ///
255    /// This recalculates derived data like file size, line endings, and `FileRevision`.
256    /// If another `ReadDatabase` snapshot holds a reference to the file (preventing in-place
257    /// mutation), a new `Arc<File>` is created with the updated contents.
258    ///
259    /// Returns `true` if a file with the given ID was found and updated.
260    pub fn update(&mut self, id: FileId, new_contents: Cow<'static, str>) -> bool {
261        let Some(name) = self.id_to_name.get(&id) else {
262            return false;
263        };
264
265        let Some(arc) = self.files.get_mut(name) else {
266            return false;
267        };
268
269        if let Some(file) = Arc::get_mut(arc) {
270            file.contents = new_contents;
271            file.size = file.contents.len() as u32;
272            file.lines = line_starts(file.contents.as_ref());
273        } else {
274            // other Arc clones exist (e.g., from a ReadDatabase snapshot).
275            // Create a new File with updated contents and replace the Arc.
276            let old = &**arc;
277            *arc = Arc::new(File::new(old.name.clone(), old.file_type, old.path.clone(), new_contents));
278        }
279
280        true
281    }
282
283    /// Deletes a file from the database using its stable `FileId`.
284    ///
285    /// Returns `true` if a file with the given ID was found and removed.
286    pub fn delete(&mut self, id: FileId) -> bool {
287        if let Some(name) = self.id_to_name.remove(&id) { self.files.remove(&name).is_some() } else { false }
288    }
289
290    /// Commits a [`ChangeLog`], applying all its recorded operations to the database
291    /// and optionally writing them to the filesystem.
292    ///
293    /// # Arguments
294    ///
295    /// * `change_log`: The log of changes to apply.
296    /// * `write_to_disk`: If `true`, changes for files that have a filesystem
297    ///   path will be written to disk in parallel.
298    ///
299    /// # Errors
300    ///
301    /// Returns a [`DatabaseError`] if the log cannot be consumed or if any
302    /// filesystem operation fails.
303    pub fn commit(&mut self, change_log: ChangeLog, write_to_disk: bool) -> Result<(), DatabaseError> {
304        let changes = change_log.into_inner()?;
305        let mut fs_operations = if write_to_disk { Vec::new() } else { Vec::with_capacity(0) };
306
307        for change in changes {
308            match change {
309                Change::Add(file) => {
310                    if write_to_disk && let Some(path) = &file.path {
311                        fs_operations.push(FilesystemOperation::Write(path.clone(), file.contents.clone()));
312                    }
313
314                    self.add(file);
315                }
316                Change::Update(id, contents) => {
317                    if write_to_disk
318                        && let Ok(file) = self.get(&id)
319                        && let Some(path) = &file.path
320                    {
321                        fs_operations.push(FilesystemOperation::Write(path.clone(), contents.clone()));
322                    }
323
324                    self.update(id, contents);
325                }
326                Change::Delete(id) => {
327                    if write_to_disk
328                        && let Ok(file) = self.get(&id)
329                        && let Some(path) = &file.path
330                    {
331                        fs_operations.push(FilesystemOperation::Delete(path.clone()));
332                    }
333
334                    self.delete(id);
335                }
336            }
337        }
338
339        if write_to_disk {
340            fs_operations.into_par_iter().try_for_each(|op| -> Result<(), DatabaseError> { op.execute() })?;
341        }
342
343        Ok(())
344    }
345
346    /// Creates an independent, immutable snapshot of the database.
347    ///
348    /// This is a potentially expensive one-time operation as it **clones** all file
349    /// data. The resulting [`ReadDatabase`] is highly optimized for fast reads and
350    /// guarantees a deterministic iteration order. The original `Database` is not
351    /// consumed and can continue to be used.
352    #[must_use]
353    pub fn read_only(&self) -> ReadDatabase {
354        let mut files_vec: Vec<Arc<File>> = self.files.values().cloned().collect();
355        files_vec.sort_unstable_by_key(|f| f.id);
356
357        let mut id_to_index = HashMap::with_capacity(files_vec.len());
358        let mut name_to_index = HashMap::with_capacity(files_vec.len());
359        let mut path_to_index = HashMap::with_capacity(files_vec.len());
360
361        for (index, file) in files_vec.iter().enumerate() {
362            id_to_index.insert(file.id, index);
363            name_to_index.insert(file.name.clone(), index);
364            if let Some(path) = &file.path {
365                path_to_index.insert(path.clone(), index);
366            }
367        }
368
369        ReadDatabase { files: files_vec, id_to_index, name_to_index, path_to_index }
370    }
371}
372
373impl ReadDatabase {
374    #[must_use]
375    pub fn empty() -> Self {
376        Self {
377            files: Vec::with_capacity(0),
378            id_to_index: HashMap::with_capacity(0),
379            name_to_index: HashMap::with_capacity(0),
380            path_to_index: HashMap::with_capacity(0),
381        }
382    }
383
384    /// Creates a new `ReadDatabase` containing only a single file.
385    ///
386    /// This is a convenience constructor for situations, such as testing or
387    /// single-file tools, where an operation requires a [`DatabaseReader`]
388    /// implementation but only needs to be aware of one file.
389    ///
390    /// # Arguments
391    ///
392    /// * `file`: The single `File` to include in the database.
393    #[must_use]
394    pub fn single(file: File) -> Self {
395        let mut id_to_index = HashMap::with_capacity(1);
396        let mut name_to_index = HashMap::with_capacity(1);
397        let mut path_to_index = HashMap::with_capacity(1);
398
399        id_to_index.insert(file.id, 0);
400        name_to_index.insert(file.name.clone(), 0);
401        if let Some(path) = &file.path {
402            path_to_index.insert(path.clone(), 0);
403        }
404
405        Self { files: vec![Arc::new(file)], id_to_index, name_to_index, path_to_index }
406    }
407}
408
409/// A universal interface for reading data from any database implementation.
410///
411/// This trait provides a common API for querying file data, abstracting over
412/// whether the underlying source is the mutable [`Database`] or the read-optimized
413/// [`ReadDatabase`]. This allows for writing generic code that can operate on either.
414pub trait DatabaseReader {
415    /// Retrieves a file's stable ID using its logical name.
416    fn get_id(&self, name: &str) -> Option<FileId>;
417
418    /// Retrieves a reference to a file using its stable `FileId`.
419    ///
420    /// # Errors
421    ///
422    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
423    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError>;
424
425    /// Retrieves a reference to a file using its stable `FileId`.
426    ///
427    /// # Errors
428    ///
429    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
430    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError>;
431
432    /// Retrieves a reference to a file using its logical name.
433    ///
434    /// # Errors
435    ///
436    /// Returns `DatabaseError::FileNotFound` if no file with the given name exists.
437    fn get_by_name(&self, name: &str) -> Result<Arc<File>, DatabaseError>;
438
439    /// Retrieves a reference to a file by its absolute filesystem path.
440    ///
441    /// # Errors
442    ///
443    /// Returns `DatabaseError::FileNotFound` if no file with the given path exists.
444    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError>;
445
446    /// Returns an iterator over all files in the database.
447    ///
448    /// The order is not guaranteed for `Database`, but is sorted by `FileId`
449    /// for `ReadDatabase`, providing deterministic iteration.
450    fn files(&self) -> impl Iterator<Item = Arc<File>>;
451
452    /// Returns an iterator over all files of a specific `FileType`.
453    fn files_with_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
454        self.files().filter(move |file| file.file_type == file_type)
455    }
456
457    /// Returns an iterator over all files that do not match a specific `FileType`.
458    fn files_without_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
459        self.files().filter(move |file| file.file_type != file_type)
460    }
461
462    /// Returns an iterator over the stable IDs of all files in the database.
463    fn file_ids(&self) -> impl Iterator<Item = FileId> {
464        self.files().map(|file| file.id)
465    }
466
467    /// Returns an iterator over the stable IDs of all files of a specific `FileType`.
468    fn file_ids_with_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
469        self.files_with_type(file_type).map(|file| file.id)
470    }
471
472    /// Returns an iterator over the stable IDs of all files that do not match a specific `FileType`.
473    fn file_ids_without_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
474        self.files_without_type(file_type).map(|file| file.id)
475    }
476
477    /// Returns the total number of files in the database.
478    fn len(&self) -> usize;
479
480    /// Returns `true` if the database contains no files.
481    fn is_empty(&self) -> bool {
482        self.len() == 0
483    }
484}
485
486impl DatabaseReader for Database<'_> {
487    fn get_id(&self, name: &str) -> Option<FileId> {
488        self.files.get(name).map(|f| f.id)
489    }
490
491    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
492        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
493        let file = self.files.get(name).ok_or(DatabaseError::FileNotFound)?;
494
495        Ok(file.clone())
496    }
497
498    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
499        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
500        self.files.get(name).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
501    }
502
503    fn get_by_name(&self, name: &str) -> Result<Arc<File>, DatabaseError> {
504        self.files.get(name).cloned().ok_or(DatabaseError::FileNotFound)
505    }
506
507    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
508        self.files.values().find(|file| file.path.as_deref() == Some(path)).cloned().ok_or(DatabaseError::FileNotFound)
509    }
510
511    fn files(&self) -> impl Iterator<Item = Arc<File>> {
512        self.files.values().cloned()
513    }
514
515    fn len(&self) -> usize {
516        self.files.len()
517    }
518}
519
520impl DatabaseReader for ReadDatabase {
521    fn get_id(&self, name: &str) -> Option<FileId> {
522        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).map(|f| f.id)
523    }
524
525    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
526        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
527
528        self.files.get(*index).cloned().ok_or(DatabaseError::FileNotFound)
529    }
530
531    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
532        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
533
534        self.files.get(*index).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
535    }
536
537    fn get_by_name(&self, name: &str) -> Result<Arc<File>, DatabaseError> {
538        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
539    }
540
541    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
542        self.path_to_index.get(path).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
543    }
544
545    fn files(&self) -> impl Iterator<Item = Arc<File>> {
546        self.files.iter().cloned()
547    }
548
549    fn len(&self) -> usize {
550        self.files.len()
551    }
552}