Skip to main content

mago_database/
lib.rs

1#![allow(clippy::exhaustive_enums)]
2
3//! High-performance file database for PHP projects.
4//!
5//! This crate provides an efficient in-memory database for managing collections of PHP source files.
6//! It offers two complementary database types optimized for different access patterns:
7//!
8//! - [`Database`]: Mutable builder optimized for modifications (add, update, delete)
9//! - [`ReadDatabase`]: Immutable snapshot optimized for high-performance reads
10//!
11//! # Architecture
12//!
13//! The database uses a two-phase approach:
14//!
15//! 1. **Build Phase**: Use [`Database`] to load files, make modifications, and track changes
16//! 2. **Query Phase**: Convert to [`ReadDatabase`] via [`Database::read_only`] for fast lookups
17//!
18//! # Key Features
19//!
20//! - **Fast Lookups**: O(1) average-time access by ID, name, or filesystem path
21//! - **Change Tracking**: Record and batch apply file modifications via [`ChangeLog`]
22//! - **Deterministic Iteration**: [`ReadDatabase`] guarantees consistent iteration order
23//! - **Parallel Operations**: Concurrent file I/O and processing support
24//! - **Type Safety**: Strong typing with stable [`FileId`] handles
25//!
26//! # Common Workflow
27//!
28//! ## Loading Files
29//!
//! Use [`loader::DatabaseLoader`] to scan a project directory. The loader handles
//! file discovery, exclusion patterns, and parallel loading.
33//!
34//! ## Querying Files
35//!
//! Both database types implement [`DatabaseReader`] for uniform access, so generic
//! code can query either one through the same methods.
37//!
38//! ## Modifying Files
39//!
//! Use [`ChangeLog`] to batch modifications. Recorded changes are applied to the
//! database via [`Database::commit`] and can optionally be written to disk in parallel.
43//!
44//! # Performance Characteristics
45//!
46//! ## Database (Mutable)
47//!
48//! - Add/Update/Delete: O(1) average
49//! - Lookup by ID/name: O(1) average
50//! - Iteration: Unordered
51//! - Memory: ~2x file count (maps for bidirectional lookup)
52//!
53//! ## `ReadDatabase` (Immutable)
54//!
55//! - Creation: O(n log n) for sorting
56//! - Lookup by ID/name/path: O(1) average
57//! - Iteration: Deterministic, sorted by `FileId`
58//! - Memory: ~3x file count (vector + 3 index maps)
59//!
60//! # Thread Safety
61//!
62//! [`Database`] is not thread-safe and should be used from a single thread during construction.
63//! [`ReadDatabase`] can be freely shared across threads for concurrent read access.
64
65use std::borrow::Cow;
66use std::path::Path;
67use std::path::PathBuf;
68use std::sync::Arc;
69
70use foldhash::HashMap;
71use foldhash::HashMapExt;
72use rayon::iter::IntoParallelIterator;
73use rayon::iter::ParallelIterator;
74use serde::Deserialize;
75use serde::Serialize;
76
77use crate::change::Change;
78use crate::change::ChangeLog;
79use crate::error::DatabaseError;
80use crate::exclusion::Exclusion;
81use crate::file::File;
82use crate::file::FileId;
83use crate::file::FileType;
84use crate::file::line_starts;
85use crate::operation::FilesystemOperation;
86
87mod utils;
88
89pub mod change;
90pub mod error;
91pub mod exclusion;
92pub mod file;
93pub mod loader;
94pub mod matcher;
95pub mod membership;
96pub mod watcher;
97
98mod operation;
99
100/// Configuration for database loading and watching.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct DatabaseConfiguration<'config> {
103    pub workspace: Cow<'config, Path>,
104    /// Paths or glob patterns for source files.
105    /// Can be directory paths (e.g., "src") or glob patterns (e.g., "src/**/*.php")
106    pub paths: Vec<Cow<'config, [u8]>>,
107    /// Paths or glob patterns for included files.
108    /// Can be directory paths (e.g., "vendor") or glob patterns (e.g., "vendor/**/*.php")
109    pub includes: Vec<Cow<'config, [u8]>>,
110    pub patches: Vec<Cow<'config, [u8]>>,
111    pub excludes: Vec<Exclusion<'config>>,
112    pub extensions: Vec<Cow<'config, [u8]>>,
113    /// Settings for glob pattern matching behavior.
114    pub glob: GlobSettings,
115}
116
117/// Settings for glob pattern matching behavior.
118///
119/// All defaults match the `globset` crate defaults for backwards compatibility.
120#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
121pub struct GlobSettings {
122    /// Match patterns case-insensitively.
123    ///
124    /// Default: `false`.
125    pub case_insensitive: bool,
126    /// When `true`, a single `*` does not match path separators (`/`).
127    /// This makes `src/*/Test` match only `src/foo/Test`, not `src/foo/bar/Test`.
128    /// Use `**` for recursive matching.
129    ///
130    /// Default: `false`.
131    pub literal_separator: bool,
132    /// Whether `\` escapes special characters in patterns.
133    ///
134    /// Default: `true`.
135    pub backslash_escape: bool,
136    /// Whether an empty case in alternates is allowed (e.g., `{,a}` matches `""` and `"a"`).
137    ///
138    /// Default: `false`.
139    pub empty_alternates: bool,
140}
141
142impl Default for GlobSettings {
143    #[inline]
144    fn default() -> Self {
145        Self {
146            case_insensitive: false,
147            literal_separator: false,
148            backslash_escape: !std::path::is_separator('\\'),
149            empty_alternates: false,
150        }
151    }
152}
153
154impl<'config> DatabaseConfiguration<'config> {
155    #[inline]
156    #[must_use]
157    pub fn new(
158        workspace: &'config Path,
159        paths: Vec<&'config [u8]>,
160        includes: Vec<&'config [u8]>,
161        excludes: Vec<Exclusion<'config>>,
162        extensions: Vec<&'config [u8]>,
163    ) -> Self {
164        let paths = paths.into_iter().map(Cow::Borrowed).collect();
165        let includes = includes.into_iter().map(Cow::Borrowed).collect();
166
167        let excludes = excludes
168            .into_iter()
169            .filter_map(|exclusion| match exclusion {
170                Exclusion::Path(p) => Some(if p.is_absolute() {
171                    Exclusion::Path(p)
172                } else {
173                    workspace.join(p).canonicalize().ok().map(Cow::Owned).map(Exclusion::Path)?
174                }),
175                Exclusion::Pattern(pat) => Some(Exclusion::Pattern(pat)),
176            })
177            .collect();
178
179        let extensions = extensions.into_iter().map(Cow::Borrowed).collect();
180
181        Self {
182            workspace: Cow::Borrowed(workspace),
183            paths,
184            includes,
185            patches: Vec::new(),
186            excludes,
187            extensions,
188            glob: GlobSettings::default(),
189        }
190    }
191
192    #[inline]
193    #[must_use]
194    pub fn into_static(self) -> DatabaseConfiguration<'static> {
195        DatabaseConfiguration {
196            workspace: Cow::Owned(self.workspace.into_owned()),
197            paths: self.paths.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
198            includes: self.includes.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
199            patches: self.patches.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
200            excludes: self
201                .excludes
202                .into_iter()
203                .map(|e| match e {
204                    Exclusion::Path(p) => Exclusion::Path(Cow::Owned(p.into_owned())),
205                    Exclusion::Pattern(pat) => Exclusion::Pattern(Cow::Owned(pat.into_owned())),
206                })
207                .collect(),
208            extensions: self.extensions.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
209            glob: self.glob,
210        }
211    }
212}
213
214/// Mutable database for managing project files with add/update/delete operations.
215#[derive(Debug, Clone, Serialize, Deserialize)]
216#[allow(clippy::field_scoped_visibility_modifiers)]
217pub struct Database<'config> {
218    files: HashMap<Cow<'static, [u8]>, Arc<File>>,
219    id_to_name: HashMap<FileId, Cow<'static, [u8]>>,
220    pub(crate) configuration: DatabaseConfiguration<'config>,
221}
222
223/// Immutable, read-optimized snapshot of the database.
224#[derive(Debug)]
225pub struct ReadDatabase {
226    files: Vec<Arc<File>>,
227    id_to_index: HashMap<FileId, usize>,
228    name_to_index: HashMap<Cow<'static, [u8]>, usize>,
229    path_to_index: HashMap<PathBuf, usize>,
230}
231
232impl<'config> Database<'config> {
233    #[inline]
234    #[must_use]
235    pub fn new(configuration: DatabaseConfiguration<'config>) -> Self {
236        Self { files: HashMap::default(), id_to_name: HashMap::default(), configuration }
237    }
238
239    #[inline]
240    #[must_use]
241    pub fn single(file: File, configuration: DatabaseConfiguration<'config>) -> Self {
242        let mut db = Self::new(configuration);
243        db.add(file);
244        db
245    }
246
247    /// Reserves capacity for at least `additional` more files.
248    #[inline]
249    pub fn reserve(&mut self, additional: usize) {
250        self.files.reserve(additional);
251        self.id_to_name.reserve(additional);
252    }
253
254    #[inline]
255    pub fn add(&mut self, file: File) -> FileId {
256        let name = file.name.clone();
257        let id = file.id;
258
259        if let Some(old_file) = self.files.insert(name.clone(), Arc::new(file)) {
260            self.id_to_name.remove(&old_file.id);
261        }
262
263        self.id_to_name.insert(id, name);
264
265        id
266    }
267
268    /// Updates a file's content using its stable `FileId`.
269    ///
270    /// This recalculates derived data like file size, line endings, and `FileRevision`.
271    /// If another `ReadDatabase` snapshot holds a reference to the file (preventing in-place
272    /// mutation), a new `Arc<File>` is created with the updated contents.
273    ///
274    /// Returns `true` if a file with the given ID was found and updated.
275    #[inline]
276    pub fn update(&mut self, id: FileId, new_contents: Cow<'static, [u8]>) -> bool {
277        let Some(name) = self.id_to_name.get(&id) else {
278            return false;
279        };
280
281        let Some(arc) = self.files.get_mut(name) else {
282            return false;
283        };
284
285        if let Some(file) = Arc::get_mut(arc) {
286            file.contents = new_contents;
287            file.size = file.contents.len() as u32;
288            file.lines = line_starts(file.contents.as_ref());
289        } else {
290            // other Arc clones exist (e.g., from a ReadDatabase snapshot).
291            // Create a new File with updated contents and replace the Arc.
292            let old = &**arc;
293            *arc = Arc::new(File::new(old.name.clone(), old.file_type, old.path.clone(), new_contents));
294        }
295
296        true
297    }
298
299    /// Deletes a file from the database using its stable `FileId`.
300    ///
301    /// Returns `true` if a file with the given ID was found and removed.
302    #[inline]
303    pub fn delete(&mut self, id: FileId) -> bool {
304        if let Some(name) = self.id_to_name.remove(&id) { self.files.remove(&name).is_some() } else { false }
305    }
306
307    /// Commits a [`ChangeLog`], applying all its recorded operations to the database
308    /// and optionally writing them to the filesystem.
309    ///
310    /// # Arguments
311    ///
312    /// * `change_log`: The log of changes to apply.
313    /// * `write_to_disk`: If `true`, changes for files that have a filesystem
314    ///   path will be written to disk in parallel.
315    ///
316    /// # Errors
317    ///
318    /// Returns a [`DatabaseError`] if the log cannot be consumed or if any
319    /// filesystem operation fails.
320    #[inline]
321    pub fn commit(&mut self, change_log: ChangeLog, write_to_disk: bool) -> Result<(), DatabaseError> {
322        let changes = change_log.into_inner()?;
323        let mut fs_operations = if write_to_disk { Vec::new() } else { Vec::with_capacity(0) };
324
325        for change in changes {
326            match change {
327                Change::Add(file) => {
328                    if write_to_disk && let Some(path) = &file.path {
329                        fs_operations.push(FilesystemOperation::Write(path.clone(), file.contents.clone()));
330                    }
331
332                    self.add(file);
333                }
334                Change::Update(id, contents) => {
335                    if write_to_disk
336                        && let Ok(file) = self.get(&id)
337                        && let Some(path) = &file.path
338                    {
339                        fs_operations.push(FilesystemOperation::Write(path.clone(), contents.clone()));
340                    }
341
342                    self.update(id, contents);
343                }
344                Change::Delete(id) => {
345                    if write_to_disk
346                        && let Ok(file) = self.get(&id)
347                        && let Some(path) = &file.path
348                    {
349                        fs_operations.push(FilesystemOperation::Delete(path.clone()));
350                    }
351
352                    self.delete(id);
353                }
354            }
355        }
356
357        if write_to_disk {
358            fs_operations.into_par_iter().try_for_each(|op| -> Result<(), DatabaseError> { op.execute() })?;
359        }
360
361        Ok(())
362    }
363
364    /// Creates an independent, immutable snapshot of the database.
365    ///
366    /// This is a potentially expensive one-time operation as it **clones** all file
367    /// data. The resulting [`ReadDatabase`] is highly optimized for fast reads and
368    /// guarantees a deterministic iteration order. The original `Database` is not
369    /// consumed and can continue to be used.
370    #[inline]
371    #[must_use]
372    pub fn read_only(&self) -> ReadDatabase {
373        let mut files_vec: Vec<Arc<File>> = self.files.values().cloned().collect();
374        files_vec.sort_unstable_by_key(|f| f.id);
375
376        let mut id_to_index = HashMap::with_capacity(files_vec.len());
377        let mut name_to_index = HashMap::with_capacity(files_vec.len());
378        let mut path_to_index = HashMap::with_capacity(files_vec.len());
379
380        for (index, file) in files_vec.iter().enumerate() {
381            id_to_index.insert(file.id, index);
382            name_to_index.insert(file.name.clone(), index);
383            if let Some(path) = &file.path {
384                path_to_index.insert(path.clone(), index);
385            }
386        }
387
388        ReadDatabase { files: files_vec, id_to_index, name_to_index, path_to_index }
389    }
390}
391
392impl ReadDatabase {
393    #[inline]
394    #[must_use]
395    pub fn empty() -> Self {
396        Self {
397            files: Vec::with_capacity(0),
398            id_to_index: HashMap::with_capacity(0),
399            name_to_index: HashMap::with_capacity(0),
400            path_to_index: HashMap::with_capacity(0),
401        }
402    }
403
404    /// Creates a new `ReadDatabase` containing only a single file.
405    ///
406    /// This is a convenience constructor for situations, such as testing or
407    /// single-file tools, where an operation requires a [`DatabaseReader`]
408    /// implementation but only needs to be aware of one file.
409    ///
410    /// # Arguments
411    ///
412    /// * `file`: The single `File` to include in the database.
413    #[inline]
414    #[must_use]
415    pub fn single(file: File) -> Self {
416        let mut id_to_index = HashMap::with_capacity(1);
417        let mut name_to_index = HashMap::with_capacity(1);
418        let mut path_to_index = HashMap::with_capacity(1);
419
420        id_to_index.insert(file.id, 0);
421        name_to_index.insert(file.name.clone(), 0);
422        if let Some(path) = &file.path {
423            path_to_index.insert(path.clone(), 0);
424        }
425
426        Self { files: vec![Arc::new(file)], id_to_index, name_to_index, path_to_index }
427    }
428}
429
430/// A universal interface for reading data from any database implementation.
431///
432/// This trait provides a common API for querying file data, abstracting over
433/// whether the underlying source is the mutable [`Database`] or the read-optimized
434/// [`ReadDatabase`]. This allows for writing generic code that can operate on either.
435pub trait DatabaseReader {
436    /// Retrieves a file's stable ID using its logical name.
437    fn get_id(&self, name: &[u8]) -> Option<FileId>;
438
439    /// Retrieves a reference to a file using its stable `FileId`.
440    ///
441    /// # Errors
442    ///
443    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
444    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError>;
445
446    /// Retrieves a reference to a file using its stable `FileId`.
447    ///
448    /// # Errors
449    ///
450    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
451    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError>;
452
453    /// Retrieves a reference to a file using its logical name.
454    ///
455    /// # Errors
456    ///
457    /// Returns `DatabaseError::FileNotFound` if no file with the given name exists.
458    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError>;
459
460    /// Retrieves a reference to a file by its absolute filesystem path.
461    ///
462    /// # Errors
463    ///
464    /// Returns `DatabaseError::FileNotFound` if no file with the given path exists.
465    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError>;
466
467    /// Returns an iterator over all files in the database.
468    ///
469    /// The order is not guaranteed for `Database`, but is sorted by `FileId`
470    /// for `ReadDatabase`, providing deterministic iteration.
471    fn files(&self) -> impl Iterator<Item = Arc<File>>;
472
473    /// Returns an iterator over all files of a specific `FileType`.
474    #[inline]
475    fn files_with_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
476        self.files().filter(move |file| file.file_type == file_type)
477    }
478
479    /// Returns an iterator over all files that do not match a specific `FileType`.
480    #[inline]
481    fn files_without_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
482        self.files().filter(move |file| file.file_type != file_type)
483    }
484
485    /// Returns an iterator over the stable IDs of all files in the database.
486    #[inline]
487    fn file_ids(&self) -> impl Iterator<Item = FileId> {
488        self.files().map(|file| file.id)
489    }
490
491    /// Returns an iterator over the stable IDs of all files of a specific `FileType`.
492    #[inline]
493    fn file_ids_with_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
494        self.files_with_type(file_type).map(|file| file.id)
495    }
496
497    /// Returns an iterator over the stable IDs of all files that do not match a specific `FileType`.
498    #[inline]
499    fn file_ids_without_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
500        self.files_without_type(file_type).map(|file| file.id)
501    }
502
503    /// Returns the total number of files in the database.
504    fn len(&self) -> usize;
505
506    /// Returns `true` if the database contains no files.
507    #[inline]
508    fn is_empty(&self) -> bool {
509        self.len() == 0
510    }
511}
512
513impl DatabaseReader for Database<'_> {
514    #[inline]
515    fn get_id(&self, name: &[u8]) -> Option<FileId> {
516        self.files.get(name).map(|f| f.id)
517    }
518
519    #[inline]
520    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
521        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
522        let file = self.files.get(name).ok_or(DatabaseError::FileNotFound)?;
523
524        Ok(Arc::clone(file))
525    }
526
527    #[inline]
528    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
529        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
530        self.files.get(name).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
531    }
532
533    #[inline]
534    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError> {
535        self.files.get(name).cloned().ok_or(DatabaseError::FileNotFound)
536    }
537
538    #[inline]
539    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
540        self.files.values().find(|file| file.path.as_deref() == Some(path)).cloned().ok_or(DatabaseError::FileNotFound)
541    }
542
543    #[inline]
544    fn files(&self) -> impl Iterator<Item = Arc<File>> {
545        self.files.values().cloned()
546    }
547
548    #[inline]
549    fn len(&self) -> usize {
550        self.files.len()
551    }
552}
553
554impl DatabaseReader for ReadDatabase {
555    #[inline]
556    fn get_id(&self, name: &[u8]) -> Option<FileId> {
557        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).map(|f| f.id)
558    }
559
560    #[inline]
561    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
562        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
563
564        self.files.get(*index).cloned().ok_or(DatabaseError::FileNotFound)
565    }
566
567    #[inline]
568    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
569        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
570
571        self.files.get(*index).map(std::convert::AsRef::as_ref).ok_or(DatabaseError::FileNotFound)
572    }
573
574    #[inline]
575    fn get_by_name(&self, name: &[u8]) -> Result<Arc<File>, DatabaseError> {
576        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
577    }
578
579    #[inline]
580    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
581        self.path_to_index.get(path).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
582    }
583
584    #[inline]
585    fn files(&self) -> impl Iterator<Item = Arc<File>> {
586        self.files.iter().cloned()
587    }
588
589    #[inline]
590    fn len(&self) -> usize {
591        self.files.len()
592    }
593}