Skip to main content

mago_database/
lib.rs

1#![allow(clippy::exhaustive_enums)]
2
3//! High-performance file database for PHP projects.
4//!
5//! This crate provides an efficient in-memory database for managing collections of PHP source files.
6//! It offers two complementary database types optimized for different access patterns:
7//!
8//! - [`Database`]: Mutable builder optimized for modifications (add, update, delete)
9//! - [`ReadDatabase`]: Immutable snapshot optimized for high-performance reads
10//!
11//! # Architecture
12//!
13//! The database uses a two-phase approach:
14//!
15//! 1. **Build Phase**: Use [`Database`] to load files, make modifications, and track changes
16//! 2. **Query Phase**: Convert to [`ReadDatabase`] via [`Database::read_only`] for fast lookups
17//!
18//! # Key Features
19//!
20//! - **Fast Lookups**: O(1) average-time access by ID or name; lookups by filesystem path are O(1) on [`ReadDatabase`] and a linear scan on [`Database`]
21//! - **Change Tracking**: Record and batch apply file modifications via [`ChangeLog`]
22//! - **Deterministic Iteration**: [`ReadDatabase`] guarantees consistent iteration order
23//! - **Parallel Operations**: Concurrent file I/O and processing support
24//! - **Type Safety**: Strong typing with stable [`FileId`] handles
25//!
26//! # Common Workflow
27//!
28//! ## Loading Files
29//!
30//! Use [`loader::DatabaseLoader`] to scan a project directory:
31//!
32//! The loader handles file discovery, exclusion patterns, and parallel loading.
33//!
34//! ## Querying Files
35//!
36//! Both database types implement [`DatabaseReader`] for uniform access:
37//!
38//! ## Modifying Files
39//!
40//! Use [`ChangeLog`] to batch modifications:
41//!
42//! Changes can be applied to the database and optionally written to disk in parallel.
43//!
44//! # Performance Characteristics
45//!
46//! ## Database (Mutable)
47//!
48//! - Add/Update/Delete: O(1) average
49//! - Lookup by ID/name: O(1) average
50//! - Iteration: Unordered
51//! - Memory: ~2x file count (maps for bidirectional lookup)
52//!
53//! ## `ReadDatabase` (Immutable)
54//!
55//! - Creation: O(n log n) for sorting
56//! - Lookup by ID/name/path: O(1) average
57//! - Iteration: Deterministic, sorted by `FileId`
58//! - Memory: ~3x file count (vector + 3 index maps)
59//!
60//! # Thread Safety
61//!
62//! [`Database`] is not thread-safe and should be used from a single thread during construction.
63//! [`ReadDatabase`] can be freely shared across threads for concurrent read access.
64
65use std::borrow::Cow;
66use std::path::Path;
67use std::path::PathBuf;
68use std::sync::Arc;
69
70use foldhash::HashMap;
71use foldhash::HashMapExt;
72use rayon::iter::IntoParallelIterator;
73use rayon::iter::ParallelIterator;
74
75use crate::change::Change;
76use crate::change::ChangeLog;
77use crate::error::DatabaseError;
78use crate::exclusion::Exclusion;
79use crate::file::File;
80use crate::file::FileId;
81use crate::file::FileType;
82use crate::file::line_starts;
83use crate::operation::FilesystemOperation;
84
85mod utils;
86
87pub mod change;
88pub mod error;
89pub mod exclusion;
90pub mod file;
91pub mod loader;
92pub mod matcher;
93pub mod membership;
94pub mod watcher;
95
96mod operation;
97
98/// Configuration for database loading and watching.
99#[derive(Debug, Clone)]
100#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
101pub struct DatabaseConfiguration<'config> {
102    pub workspace: Cow<'config, Path>,
103    /// Paths or glob patterns for source files.
104    /// Can be directory paths (e.g., "src") or glob patterns (e.g., "src/**/*.php")
105    pub paths: Vec<Cow<'config, [u8]>>,
106    /// Paths or glob patterns for included files.
107    /// Can be directory paths (e.g., "vendor") or glob patterns (e.g., "vendor/**/*.php")
108    pub includes: Vec<Cow<'config, [u8]>>,
109    pub patches: Vec<Cow<'config, [u8]>>,
110    pub excludes: Vec<Exclusion<'config>>,
111    pub extensions: Vec<Cow<'config, [u8]>>,
112    /// Settings for glob pattern matching behavior.
113    pub glob: GlobSettings,
114}
115
/// Settings for glob pattern matching behavior.
///
/// All defaults match the `globset` crate defaults for backwards compatibility.
#[derive(Debug, Clone, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct GlobSettings {
    /// Match patterns case-insensitively.
    ///
    /// Default: `false`.
    pub case_insensitive: bool,
    /// When `true`, a single `*` does not match path separators (`/`).
    /// This makes `src/*/Test` match only `src/foo/Test`, not `src/foo/bar/Test`.
    /// Use `**` for recursive matching.
    ///
    /// Default: `false`.
    pub literal_separator: bool,
    /// Whether `\` escapes special characters in patterns.
    ///
    /// Default: platform-dependent. `true` where `\` is not a path separator
    /// (e.g., Unix) and `false` where it is (e.g., Windows), because the default
    /// is computed as `!std::path::is_separator('\\')`.
    pub backslash_escape: bool,
    /// Whether an empty case in alternates is allowed (e.g., `{,a}` matches `""` and `"a"`).
    ///
    /// Default: `false`.
    pub empty_alternates: bool,
}

impl Default for GlobSettings {
    /// Returns the default settings.
    ///
    /// Every flag is `false` except `backslash_escape`, which depends on the
    /// target platform's path separators.
    #[inline]
    fn default() -> Self {
        Self {
            case_insensitive: false,
            literal_separator: false,
            // A backslash can only act as an escape character when the platform
            // does not already treat it as a path separator.
            backslash_escape: !std::path::is_separator('\\'),
            empty_alternates: false,
        }
    }
}
153
154impl<'config> DatabaseConfiguration<'config> {
155    #[inline]
156    #[must_use]
157    pub fn new(
158        workspace: &'config Path,
159        paths: Vec<&'config [u8]>,
160        includes: Vec<&'config [u8]>,
161        excludes: Vec<Exclusion<'config>>,
162        extensions: Vec<&'config [u8]>,
163    ) -> Self {
164        let paths = paths.into_iter().map(Cow::Borrowed).collect();
165        let includes = includes.into_iter().map(Cow::Borrowed).collect();
166
167        let excludes = excludes
168            .into_iter()
169            .filter_map(|exclusion| match exclusion {
170                Exclusion::Path(p) => Some(if p.is_absolute() {
171                    Exclusion::Path(p)
172                } else {
173                    workspace.join(p).canonicalize().ok().map(Cow::Owned).map(Exclusion::Path)?
174                }),
175                Exclusion::Pattern(pat) => Some(Exclusion::Pattern(pat)),
176            })
177            .collect();
178
179        let extensions = extensions.into_iter().map(Cow::Borrowed).collect();
180
181        Self {
182            workspace: Cow::Borrowed(workspace),
183            paths,
184            includes,
185            patches: Vec::new(),
186            excludes,
187            extensions,
188            glob: GlobSettings::default(),
189        }
190    }
191
192    #[inline]
193    #[must_use]
194    pub fn into_static(self) -> DatabaseConfiguration<'static> {
195        DatabaseConfiguration {
196            workspace: Cow::Owned(self.workspace.into_owned()),
197            paths: self.paths.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
198            includes: self.includes.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
199            patches: self.patches.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
200            excludes: self
201                .excludes
202                .into_iter()
203                .map(|e| match e {
204                    Exclusion::Path(p) => Exclusion::Path(Cow::Owned(p.into_owned())),
205                    Exclusion::Pattern(pat) => Exclusion::Pattern(Cow::Owned(pat.into_owned())),
206                })
207                .collect(),
208            extensions: self.extensions.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
209            glob: self.glob,
210        }
211    }
212}
213
214/// Mutable database for managing project files with add/update/delete operations.
215#[derive(Debug, Clone)]
216#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
217#[allow(clippy::field_scoped_visibility_modifiers)]
218pub struct Database<'config> {
219    files: HashMap<Cow<'static, [u8]>, Arc<File>>,
220    id_to_name: HashMap<FileId, Cow<'static, [u8]>>,
221    pub(crate) configuration: DatabaseConfiguration<'config>,
222}
223
224/// Immutable, read-optimized snapshot of the database.
225#[derive(Debug)]
226pub struct ReadDatabase {
227    files: Vec<Arc<File>>,
228    id_to_index: HashMap<FileId, usize>,
229    name_to_index: HashMap<Cow<'static, [u8]>, usize>,
230    path_to_index: HashMap<PathBuf, usize>,
231}
232
233impl<'config> Database<'config> {
234    #[inline]
235    #[must_use]
236    pub fn new(configuration: DatabaseConfiguration<'config>) -> Self {
237        Self { files: HashMap::default(), id_to_name: HashMap::default(), configuration }
238    }
239
240    #[inline]
241    #[must_use]
242    pub fn single(file: File, configuration: DatabaseConfiguration<'config>) -> Self {
243        let mut db = Self::new(configuration);
244        db.add(file);
245        db
246    }
247
248    /// Reserves capacity for at least `additional` more files.
249    #[inline]
250    pub fn reserve(&mut self, additional: usize) {
251        self.files.reserve(additional);
252        self.id_to_name.reserve(additional);
253    }
254
255    #[inline]
256    pub fn add(&mut self, file: File) -> FileId {
257        let name = file.name.clone();
258        let id = file.id;
259
260        if let Some(old_file) = self.files.insert(name.clone(), Arc::new(file)) {
261            self.id_to_name.remove(&old_file.id);
262        }
263
264        self.id_to_name.insert(id, name);
265
266        id
267    }
268
269    /// Updates a file's content using its stable `FileId`.
270    ///
271    /// This recalculates derived data like file size, line endings, and `FileRevision`.
272    /// If another `ReadDatabase` snapshot holds a reference to the file (preventing in-place
273    /// mutation), a new `Arc<File>` is created with the updated contents.
274    ///
275    /// Returns `true` if a file with the given ID was found and updated.
276    #[inline]
277    pub fn update(&mut self, id: FileId, new_contents: Cow<'static, [u8]>) -> bool {
278        let Some(name) = self.id_to_name.get(&id) else {
279            return false;
280        };
281
282        let Some(arc) = self.files.get_mut(name) else {
283            return false;
284        };
285
286        if let Some(file) = Arc::get_mut(arc) {
287            file.contents = new_contents;
288            file.size = file.contents.len() as u32;
289            file.lines = line_starts(file.contents.as_ref());
290        } else {
291            // other Arc clones exist (e.g., from a ReadDatabase snapshot).
292            // Create a new File with updated contents and replace the Arc.
293            let old = &**arc;
294            *arc = Arc::new(File::new(old.name.clone(), old.file_type, old.path.clone(), new_contents));
295        }
296
297        true
298    }
299
300    /// Deletes a file from the database using its stable `FileId`.
301    ///
302    /// Returns `true` if a file with the given ID was found and removed.
303    #[inline]
304    pub fn delete(&mut self, id: FileId) -> bool {
305        if let Some(name) = self.id_to_name.remove(&id) { self.files.remove(&name).is_some() } else { false }
306    }
307
308    /// Commits a [`ChangeLog`], applying all its recorded operations to the database
309    /// and optionally writing them to the filesystem.
310    ///
311    /// # Arguments
312    ///
313    /// * `change_log`: The log of changes to apply.
314    /// * `write_to_disk`: If `true`, changes for files that have a filesystem
315    ///   path will be written to disk in parallel.
316    ///
317    /// # Errors
318    ///
319    /// Returns a [`DatabaseError`] if the log cannot be consumed or if any
320    /// filesystem operation fails.
321    #[inline]
322    pub fn commit(&mut self, change_log: ChangeLog, write_to_disk: bool) -> Result<(), DatabaseError> {
323        let changes = change_log.into_inner()?;
324        let mut fs_operations = Vec::new();
325
326        for change in changes {
327            match change {
328                Change::Add(file) => {
329                    if write_to_disk && let Some(path) = &file.path {
330                        fs_operations.push(FilesystemOperation::Write(path.clone(), file.contents.clone()));
331                    }
332
333                    self.add(file);
334                }
335                Change::Update(id, contents) => {
336                    if write_to_disk
337                        && let Ok(file) = self.get(&id)
338                        && let Some(path) = &file.path
339                    {
340                        fs_operations.push(FilesystemOperation::Write(path.clone(), contents.clone()));
341                    }
342
343                    self.update(id, contents);
344                }
345                Change::Delete(id) => {
346                    if write_to_disk
347                        && let Ok(file) = self.get(&id)
348                        && let Some(path) = &file.path
349                    {
350                        fs_operations.push(FilesystemOperation::Delete(path.clone()));
351                    }
352
353                    self.delete(id);
354                }
355            }
356        }
357
358        if write_to_disk {
359            fs_operations.into_par_iter().try_for_each(|op| -> Result<(), DatabaseError> { op.execute() })?;
360        }
361
362        Ok(())
363    }
364
365    /// Creates an independent, immutable snapshot of the database.
366    ///
367    /// This is a potentially expensive one-time operation as it **clones** all file
368    /// data. The resulting [`ReadDatabase`] is highly optimized for fast reads and
369    /// guarantees a deterministic iteration order. The original `Database` is not
370    /// consumed and can continue to be used.
371    #[inline]
372    #[must_use]
373    pub fn read_only(&self) -> ReadDatabase {
374        let mut files_vec: Vec<Arc<File>> = self.files.values().cloned().collect();
375        files_vec.sort_unstable_by_key(|f| f.id);
376
377        let mut id_to_index = HashMap::with_capacity(files_vec.len());
378        let mut name_to_index = HashMap::with_capacity(files_vec.len());
379        let mut path_to_index = HashMap::with_capacity(files_vec.len());
380
381        for (index, file) in files_vec.iter().enumerate() {
382            id_to_index.insert(file.id, index);
383            name_to_index.insert(file.name.clone(), index);
384            if let Some(path) = &file.path {
385                path_to_index.insert(path.clone(), index);
386            }
387        }
388
389        ReadDatabase { files: files_vec, id_to_index, name_to_index, path_to_index }
390    }
391}
392
393impl ReadDatabase {
394    #[inline]
395    #[must_use]
396    pub fn empty() -> Self {
397        Self {
398            files: Vec::new(),
399            id_to_index: HashMap::new(),
400            name_to_index: HashMap::new(),
401            path_to_index: HashMap::new(),
402        }
403    }
404
405    /// Creates a new `ReadDatabase` containing only a single file.
406    ///
407    /// This is a convenience constructor for situations, such as testing or
408    /// single-file tools, where an operation requires a [`DatabaseReader`]
409    /// implementation but only needs to be aware of one file.
410    ///
411    /// # Arguments
412    ///
413    /// * `file`: The single `File` to include in the database.
414    #[inline]
415    #[must_use]
416    pub fn single(file: File) -> Self {
417        let mut id_to_index = HashMap::with_capacity(1);
418        let mut name_to_index = HashMap::with_capacity(1);
419        let mut path_to_index = HashMap::with_capacity(1);
420
421        id_to_index.insert(file.id, 0);
422        name_to_index.insert(file.name.clone(), 0);
423        if let Some(path) = &file.path {
424            path_to_index.insert(path.clone(), 0);
425        }
426
427        Self { files: vec![Arc::new(file)], id_to_index, name_to_index, path_to_index }
428    }
429}
430
431/// A universal interface for reading data from any database implementation.
432///
433/// This trait provides a common API for querying file data, abstracting over
434/// whether the underlying source is the mutable [`Database`] or the read-optimized
435/// [`ReadDatabase`]. This allows for writing generic code that can operate on either.
436pub trait DatabaseReader {
437    /// Retrieves a file's stable ID using its logical name.
438    fn get_id(&self, name: &[u8]) -> Option<FileId>;
439
440    /// Retrieves a reference to a file using its stable `FileId`.
441    ///
442    /// # Errors
443    ///
444    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
445    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError>;
446
447    /// Retrieves a reference to a file using its stable `FileId`.
448    ///
449    /// # Errors
450    ///
451    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
452    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError>;
453
454    /// Retrieves a reference to a file using its logical name.
455    ///
456    /// # Errors
457    ///
458    /// Returns `DatabaseError::FileNotFound` if no file with the given name exists.
459    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError>;
460
461    /// Retrieves a reference to a file by its absolute filesystem path.
462    ///
463    /// # Errors
464    ///
465    /// Returns `DatabaseError::FileNotFound` if no file with the given path exists.
466    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError>;
467
468    /// Returns an iterator over all files in the database.
469    ///
470    /// The order is not guaranteed for `Database`, but is sorted by `FileId`
471    /// for `ReadDatabase`, providing deterministic iteration.
472    fn files(&self) -> impl Iterator<Item = Arc<File>>;
473
474    /// Returns an iterator over all files of a specific `FileType`.
475    #[inline]
476    fn files_with_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
477        self.files().filter(move |file| file.file_type == file_type)
478    }
479
480    /// Returns an iterator over all files that do not match a specific `FileType`.
481    #[inline]
482    fn files_without_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
483        self.files().filter(move |file| file.file_type != file_type)
484    }
485
486    /// Returns an iterator over the stable IDs of all files in the database.
487    #[inline]
488    fn file_ids(&self) -> impl Iterator<Item = FileId> {
489        self.files().map(|file| file.id)
490    }
491
492    /// Returns an iterator over the stable IDs of all files of a specific `FileType`.
493    #[inline]
494    fn file_ids_with_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
495        self.files_with_type(file_type).map(|file| file.id)
496    }
497
498    /// Returns an iterator over the stable IDs of all files that do not match a specific `FileType`.
499    #[inline]
500    fn file_ids_without_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
501        self.files_without_type(file_type).map(|file| file.id)
502    }
503
504    /// Returns the total number of files in the database.
505    fn len(&self) -> usize;
506
507    /// Returns `true` if the database contains no files.
508    #[inline]
509    fn is_empty(&self) -> bool {
510        self.len() == 0
511    }
512}
513
514impl DatabaseReader for Database<'_> {
515    #[inline]
516    fn get_id(&self, name: &[u8]) -> Option<FileId> {
517        self.files.get(name).map(|f| f.id)
518    }
519
520    #[inline]
521    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
522        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
523        let file = self.files.get(name).ok_or(DatabaseError::FileNotFound)?;
524
525        Ok(Arc::clone(file))
526    }
527
528    #[inline]
529    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
530        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
531        self.files.get(name).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
532    }
533
534    #[inline]
535    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError> {
536        self.files.get(name).cloned().ok_or(DatabaseError::FileNotFound)
537    }
538
539    #[inline]
540    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
541        self.files.values().find(|file| file.path.as_deref() == Some(path)).cloned().ok_or(DatabaseError::FileNotFound)
542    }
543
544    #[inline]
545    fn files(&self) -> impl Iterator<Item = Arc<File>> {
546        self.files.values().cloned()
547    }
548
549    #[inline]
550    fn len(&self) -> usize {
551        self.files.len()
552    }
553}
554
555impl DatabaseReader for ReadDatabase {
556    #[inline]
557    fn get_id(&self, name: &[u8]) -> Option<FileId> {
558        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).map(|f| f.id)
559    }
560
561    #[inline]
562    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
563        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
564
565        self.files.get(*index).cloned().ok_or(DatabaseError::FileNotFound)
566    }
567
568    #[inline]
569    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
570        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
571
572        self.files.get(*index).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
573    }
574
575    #[inline]
576    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError> {
577        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
578    }
579
580    #[inline]
581    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
582        self.path_to_index.get(path).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
583    }
584
585    #[inline]
586    fn files(&self) -> impl Iterator<Item = Arc<File>> {
587        self.files.iter().cloned()
588    }
589
590    #[inline]
591    fn len(&self) -> usize {
592        self.files.len()
593    }
594}