Skip to main content

mago_database/
lib.rs

1#![allow(clippy::exhaustive_enums)]
2
3//! High-performance file database for PHP projects.
4//!
5//! This crate provides an efficient in-memory database for managing collections of PHP source files.
6//! It offers two complementary database types optimized for different access patterns:
7//!
8//! - [`Database`]: Mutable builder optimized for modifications (add, update, delete)
9//! - [`ReadDatabase`]: Immutable snapshot optimized for high-performance reads
10//!
11//! # Architecture
12//!
13//! The database uses a two-phase approach:
14//!
15//! 1. **Build Phase**: Use [`Database`] to load files, make modifications, and track changes
16//! 2. **Query Phase**: Convert to [`ReadDatabase`] via [`Database::read_only`] for fast lookups
17//!
18//! # Key Features
19//!
20//! - **Fast Lookups**: O(1) average-time access by ID, name, or filesystem path
21//! - **Change Tracking**: Record and batch apply file modifications via [`ChangeLog`]
22//! - **Deterministic Iteration**: [`ReadDatabase`] guarantees consistent iteration order
23//! - **Parallel Operations**: Concurrent file I/O and processing support
24//! - **Type Safety**: Strong typing with stable [`FileId`] handles
25//!
26//! # Common Workflow
27//!
28//! ## Loading Files
29//!
//! Use [`loader::DatabaseLoader`] to scan a project directory.
31//!
32//! The loader handles file discovery, exclusion patterns, and parallel loading.
33//!
34//! ## Querying Files
35//!
//! Both database types implement [`DatabaseReader`] for uniform access.
37//!
38//! ## Modifying Files
39//!
//! Use [`ChangeLog`] to batch modifications.
41//!
42//! Changes can be applied to the database and optionally written to disk in parallel.
43//!
44//! # Performance Characteristics
45//!
46//! ## Database (Mutable)
47//!
48//! - Add/Update/Delete: O(1) average
49//! - Lookup by ID/name: O(1) average
50//! - Iteration: Unordered
51//! - Memory: ~2x file count (maps for bidirectional lookup)
52//!
53//! ## `ReadDatabase` (Immutable)
54//!
55//! - Creation: O(n log n) for sorting
56//! - Lookup by ID/name/path: O(1) average
57//! - Iteration: Deterministic, sorted by `FileId`
58//! - Memory: ~3x file count (vector + 3 index maps)
59//!
60//! # Thread Safety
61//!
62//! [`Database`] is not thread-safe and should be used from a single thread during construction.
63//! [`ReadDatabase`] can be freely shared across threads for concurrent read access.
64
65use std::borrow::Cow;
66use std::path::Path;
67use std::path::PathBuf;
68use std::sync::Arc;
69
70use foldhash::HashMap;
71use foldhash::HashMapExt;
72use rayon::iter::IntoParallelIterator;
73use rayon::iter::ParallelIterator;
74use serde::Deserialize;
75use serde::Serialize;
76
77use crate::change::Change;
78use crate::change::ChangeLog;
79use crate::error::DatabaseError;
80use crate::exclusion::Exclusion;
81use crate::file::File;
82use crate::file::FileId;
83use crate::file::FileType;
84use crate::file::line_starts;
85use crate::operation::FilesystemOperation;
86
87mod utils;
88
89pub mod change;
90pub mod error;
91pub mod exclusion;
92pub mod file;
93pub mod loader;
94pub mod matcher;
95pub mod watcher;
96
97mod operation;
98
99/// Configuration for database loading and watching.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct DatabaseConfiguration<'config> {
102    pub workspace: Cow<'config, Path>,
103    /// Paths or glob patterns for source files.
104    /// Can be directory paths (e.g., "src") or glob patterns (e.g., "src/**/*.php")
105    pub paths: Vec<Cow<'config, [u8]>>,
106    /// Paths or glob patterns for included files.
107    /// Can be directory paths (e.g., "vendor") or glob patterns (e.g., "vendor/**/*.php")
108    pub includes: Vec<Cow<'config, [u8]>>,
109    pub excludes: Vec<Exclusion<'config>>,
110    pub extensions: Vec<Cow<'config, [u8]>>,
111    /// Settings for glob pattern matching behavior.
112    pub glob: GlobSettings,
113}
114
115/// Settings for glob pattern matching behavior.
116///
117/// All defaults match the `globset` crate defaults for backwards compatibility.
118#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
119pub struct GlobSettings {
120    /// Match patterns case-insensitively.
121    ///
122    /// Default: `false`.
123    pub case_insensitive: bool,
124    /// When `true`, a single `*` does not match path separators (`/`).
125    /// This makes `src/*/Test` match only `src/foo/Test`, not `src/foo/bar/Test`.
126    /// Use `**` for recursive matching.
127    ///
128    /// Default: `false`.
129    pub literal_separator: bool,
130    /// Whether `\` escapes special characters in patterns.
131    ///
132    /// Default: `true`.
133    pub backslash_escape: bool,
134    /// Whether an empty case in alternates is allowed (e.g., `{,a}` matches `""` and `"a"`).
135    ///
136    /// Default: `false`.
137    pub empty_alternates: bool,
138}
139
140impl Default for GlobSettings {
141    #[inline]
142    fn default() -> Self {
143        Self {
144            case_insensitive: false,
145            literal_separator: false,
146            backslash_escape: !std::path::is_separator('\\'),
147            empty_alternates: false,
148        }
149    }
150}
151
152impl<'config> DatabaseConfiguration<'config> {
153    #[inline]
154    #[must_use]
155    pub fn new(
156        workspace: &'config Path,
157        paths: Vec<&'config [u8]>,
158        includes: Vec<&'config [u8]>,
159        excludes: Vec<Exclusion<'config>>,
160        extensions: Vec<&'config [u8]>,
161    ) -> Self {
162        let paths = paths.into_iter().map(Cow::Borrowed).collect();
163        let includes = includes.into_iter().map(Cow::Borrowed).collect();
164
165        let excludes = excludes
166            .into_iter()
167            .filter_map(|exclusion| match exclusion {
168                Exclusion::Path(p) => Some(if p.is_absolute() {
169                    Exclusion::Path(p)
170                } else {
171                    workspace.join(p).canonicalize().ok().map(Cow::Owned).map(Exclusion::Path)?
172                }),
173                Exclusion::Pattern(pat) => Some(Exclusion::Pattern(pat)),
174            })
175            .collect();
176
177        let extensions = extensions.into_iter().map(Cow::Borrowed).collect();
178
179        Self {
180            workspace: Cow::Borrowed(workspace),
181            paths,
182            includes,
183            excludes,
184            extensions,
185            glob: GlobSettings::default(),
186        }
187    }
188
189    #[inline]
190    #[must_use]
191    pub fn into_static(self) -> DatabaseConfiguration<'static> {
192        DatabaseConfiguration {
193            workspace: Cow::Owned(self.workspace.into_owned()),
194            paths: self.paths.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
195            includes: self.includes.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
196            excludes: self
197                .excludes
198                .into_iter()
199                .map(|e| match e {
200                    Exclusion::Path(p) => Exclusion::Path(Cow::Owned(p.into_owned())),
201                    Exclusion::Pattern(pat) => Exclusion::Pattern(Cow::Owned(pat.into_owned())),
202                })
203                .collect(),
204            extensions: self.extensions.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
205            glob: self.glob,
206        }
207    }
208}
209
210/// Mutable database for managing project files with add/update/delete operations.
211#[derive(Debug, Clone, Serialize, Deserialize)]
212#[allow(clippy::field_scoped_visibility_modifiers)]
213pub struct Database<'config> {
214    files: HashMap<Cow<'static, [u8]>, Arc<File>>,
215    id_to_name: HashMap<FileId, Cow<'static, [u8]>>,
216    pub(crate) configuration: DatabaseConfiguration<'config>,
217}
218
219/// Immutable, read-optimized snapshot of the database.
220#[derive(Debug)]
221pub struct ReadDatabase {
222    files: Vec<Arc<File>>,
223    id_to_index: HashMap<FileId, usize>,
224    name_to_index: HashMap<Cow<'static, [u8]>, usize>,
225    path_to_index: HashMap<PathBuf, usize>,
226}
227
228impl<'config> Database<'config> {
229    #[inline]
230    #[must_use]
231    pub fn new(configuration: DatabaseConfiguration<'config>) -> Self {
232        Self { files: HashMap::default(), id_to_name: HashMap::default(), configuration }
233    }
234
235    #[inline]
236    #[must_use]
237    pub fn single(file: File, configuration: DatabaseConfiguration<'config>) -> Self {
238        let mut db = Self::new(configuration);
239        db.add(file);
240        db
241    }
242
243    /// Reserves capacity for at least `additional` more files.
244    #[inline]
245    pub fn reserve(&mut self, additional: usize) {
246        self.files.reserve(additional);
247        self.id_to_name.reserve(additional);
248    }
249
250    #[inline]
251    pub fn add(&mut self, file: File) -> FileId {
252        let name = file.name.clone();
253        let id = file.id;
254
255        if let Some(old_file) = self.files.insert(name.clone(), Arc::new(file)) {
256            self.id_to_name.remove(&old_file.id);
257        }
258
259        self.id_to_name.insert(id, name);
260
261        id
262    }
263
264    /// Updates a file's content using its stable `FileId`.
265    ///
266    /// This recalculates derived data like file size, line endings, and `FileRevision`.
267    /// If another `ReadDatabase` snapshot holds a reference to the file (preventing in-place
268    /// mutation), a new `Arc<File>` is created with the updated contents.
269    ///
270    /// Returns `true` if a file with the given ID was found and updated.
271    #[inline]
272    pub fn update(&mut self, id: FileId, new_contents: Cow<'static, [u8]>) -> bool {
273        let Some(name) = self.id_to_name.get(&id) else {
274            return false;
275        };
276
277        let Some(arc) = self.files.get_mut(name) else {
278            return false;
279        };
280
281        if let Some(file) = Arc::get_mut(arc) {
282            file.contents = new_contents;
283            file.size = file.contents.len() as u32;
284            file.lines = line_starts(file.contents.as_ref());
285        } else {
286            // other Arc clones exist (e.g., from a ReadDatabase snapshot).
287            // Create a new File with updated contents and replace the Arc.
288            let old = &**arc;
289            *arc = Arc::new(File::new(old.name.clone(), old.file_type, old.path.clone(), new_contents));
290        }
291
292        true
293    }
294
295    /// Deletes a file from the database using its stable `FileId`.
296    ///
297    /// Returns `true` if a file with the given ID was found and removed.
298    #[inline]
299    pub fn delete(&mut self, id: FileId) -> bool {
300        if let Some(name) = self.id_to_name.remove(&id) { self.files.remove(&name).is_some() } else { false }
301    }
302
303    /// Commits a [`ChangeLog`], applying all its recorded operations to the database
304    /// and optionally writing them to the filesystem.
305    ///
306    /// # Arguments
307    ///
308    /// * `change_log`: The log of changes to apply.
309    /// * `write_to_disk`: If `true`, changes for files that have a filesystem
310    ///   path will be written to disk in parallel.
311    ///
312    /// # Errors
313    ///
314    /// Returns a [`DatabaseError`] if the log cannot be consumed or if any
315    /// filesystem operation fails.
316    #[inline]
317    pub fn commit(&mut self, change_log: ChangeLog, write_to_disk: bool) -> Result<(), DatabaseError> {
318        let changes = change_log.into_inner()?;
319        let mut fs_operations = if write_to_disk { Vec::new() } else { Vec::with_capacity(0) };
320
321        for change in changes {
322            match change {
323                Change::Add(file) => {
324                    if write_to_disk && let Some(path) = &file.path {
325                        fs_operations.push(FilesystemOperation::Write(path.clone(), file.contents.clone()));
326                    }
327
328                    self.add(file);
329                }
330                Change::Update(id, contents) => {
331                    if write_to_disk
332                        && let Ok(file) = self.get(&id)
333                        && let Some(path) = &file.path
334                    {
335                        fs_operations.push(FilesystemOperation::Write(path.clone(), contents.clone()));
336                    }
337
338                    self.update(id, contents);
339                }
340                Change::Delete(id) => {
341                    if write_to_disk
342                        && let Ok(file) = self.get(&id)
343                        && let Some(path) = &file.path
344                    {
345                        fs_operations.push(FilesystemOperation::Delete(path.clone()));
346                    }
347
348                    self.delete(id);
349                }
350            }
351        }
352
353        if write_to_disk {
354            fs_operations.into_par_iter().try_for_each(|op| -> Result<(), DatabaseError> { op.execute() })?;
355        }
356
357        Ok(())
358    }
359
360    /// Creates an independent, immutable snapshot of the database.
361    ///
362    /// This is a potentially expensive one-time operation as it **clones** all file
363    /// data. The resulting [`ReadDatabase`] is highly optimized for fast reads and
364    /// guarantees a deterministic iteration order. The original `Database` is not
365    /// consumed and can continue to be used.
366    #[inline]
367    #[must_use]
368    pub fn read_only(&self) -> ReadDatabase {
369        let mut files_vec: Vec<Arc<File>> = self.files.values().cloned().collect();
370        files_vec.sort_unstable_by_key(|f| f.id);
371
372        let mut id_to_index = HashMap::with_capacity(files_vec.len());
373        let mut name_to_index = HashMap::with_capacity(files_vec.len());
374        let mut path_to_index = HashMap::with_capacity(files_vec.len());
375
376        for (index, file) in files_vec.iter().enumerate() {
377            id_to_index.insert(file.id, index);
378            name_to_index.insert(file.name.clone(), index);
379            if let Some(path) = &file.path {
380                path_to_index.insert(path.clone(), index);
381            }
382        }
383
384        ReadDatabase { files: files_vec, id_to_index, name_to_index, path_to_index }
385    }
386}
387
388impl ReadDatabase {
389    #[inline]
390    #[must_use]
391    pub fn empty() -> Self {
392        Self {
393            files: Vec::with_capacity(0),
394            id_to_index: HashMap::with_capacity(0),
395            name_to_index: HashMap::with_capacity(0),
396            path_to_index: HashMap::with_capacity(0),
397        }
398    }
399
400    /// Creates a new `ReadDatabase` containing only a single file.
401    ///
402    /// This is a convenience constructor for situations, such as testing or
403    /// single-file tools, where an operation requires a [`DatabaseReader`]
404    /// implementation but only needs to be aware of one file.
405    ///
406    /// # Arguments
407    ///
408    /// * `file`: The single `File` to include in the database.
409    #[inline]
410    #[must_use]
411    pub fn single(file: File) -> Self {
412        let mut id_to_index = HashMap::with_capacity(1);
413        let mut name_to_index = HashMap::with_capacity(1);
414        let mut path_to_index = HashMap::with_capacity(1);
415
416        id_to_index.insert(file.id, 0);
417        name_to_index.insert(file.name.clone(), 0);
418        if let Some(path) = &file.path {
419            path_to_index.insert(path.clone(), 0);
420        }
421
422        Self { files: vec![Arc::new(file)], id_to_index, name_to_index, path_to_index }
423    }
424}
425
426/// A universal interface for reading data from any database implementation.
427///
428/// This trait provides a common API for querying file data, abstracting over
429/// whether the underlying source is the mutable [`Database`] or the read-optimized
430/// [`ReadDatabase`]. This allows for writing generic code that can operate on either.
431pub trait DatabaseReader {
432    /// Retrieves a file's stable ID using its logical name.
433    fn get_id(&self, name: &[u8]) -> Option<FileId>;
434
435    /// Retrieves a reference to a file using its stable `FileId`.
436    ///
437    /// # Errors
438    ///
439    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
440    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError>;
441
442    /// Retrieves a reference to a file using its stable `FileId`.
443    ///
444    /// # Errors
445    ///
446    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
447    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError>;
448
449    /// Retrieves a reference to a file using its logical name.
450    ///
451    /// # Errors
452    ///
453    /// Returns `DatabaseError::FileNotFound` if no file with the given name exists.
454    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError>;
455
456    /// Retrieves a reference to a file by its absolute filesystem path.
457    ///
458    /// # Errors
459    ///
460    /// Returns `DatabaseError::FileNotFound` if no file with the given path exists.
461    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError>;
462
463    /// Returns an iterator over all files in the database.
464    ///
465    /// The order is not guaranteed for `Database`, but is sorted by `FileId`
466    /// for `ReadDatabase`, providing deterministic iteration.
467    fn files(&self) -> impl Iterator<Item = Arc<File>>;
468
469    /// Returns an iterator over all files of a specific `FileType`.
470    #[inline]
471    fn files_with_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
472        self.files().filter(move |file| file.file_type == file_type)
473    }
474
475    /// Returns an iterator over all files that do not match a specific `FileType`.
476    #[inline]
477    fn files_without_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
478        self.files().filter(move |file| file.file_type != file_type)
479    }
480
481    /// Returns an iterator over the stable IDs of all files in the database.
482    #[inline]
483    fn file_ids(&self) -> impl Iterator<Item = FileId> {
484        self.files().map(|file| file.id)
485    }
486
487    /// Returns an iterator over the stable IDs of all files of a specific `FileType`.
488    #[inline]
489    fn file_ids_with_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
490        self.files_with_type(file_type).map(|file| file.id)
491    }
492
493    /// Returns an iterator over the stable IDs of all files that do not match a specific `FileType`.
494    #[inline]
495    fn file_ids_without_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
496        self.files_without_type(file_type).map(|file| file.id)
497    }
498
499    /// Returns the total number of files in the database.
500    fn len(&self) -> usize;
501
502    /// Returns `true` if the database contains no files.
503    #[inline]
504    fn is_empty(&self) -> bool {
505        self.len() == 0
506    }
507}
508
509impl DatabaseReader for Database<'_> {
510    #[inline]
511    fn get_id(&self, name: &[u8]) -> Option<FileId> {
512        self.files.get(name).map(|f| f.id)
513    }
514
515    #[inline]
516    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
517        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
518        let file = self.files.get(name).ok_or(DatabaseError::FileNotFound)?;
519
520        Ok(Arc::clone(file))
521    }
522
523    #[inline]
524    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
525        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
526        self.files.get(name).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
527    }
528
529    #[inline]
530    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError> {
531        self.files.get(name).cloned().ok_or(DatabaseError::FileNotFound)
532    }
533
534    #[inline]
535    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
536        self.files.values().find(|file| file.path.as_deref() == Some(path)).cloned().ok_or(DatabaseError::FileNotFound)
537    }
538
539    #[inline]
540    fn files(&self) -> impl Iterator<Item = Arc<File>> {
541        self.files.values().cloned()
542    }
543
544    #[inline]
545    fn len(&self) -> usize {
546        self.files.len()
547    }
548}
549
550impl DatabaseReader for ReadDatabase {
551    #[inline]
552    fn get_id(&self, name: &[u8]) -> Option<FileId> {
553        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).map(|f| f.id)
554    }
555
556    #[inline]
557    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
558        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
559
560        self.files.get(*index).cloned().ok_or(DatabaseError::FileNotFound)
561    }
562
563    #[inline]
564    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
565        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
566
567        self.files.get(*index).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
568    }
569
570    #[inline]
571    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError> {
572        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
573    }
574
575    #[inline]
576    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
577        self.path_to_index.get(path).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
578    }
579
580    #[inline]
581    fn files(&self) -> impl Iterator<Item = Arc<File>> {
582        self.files.iter().cloned()
583    }
584
585    #[inline]
586    fn len(&self) -> usize {
587        self.files.len()
588    }
589}