mago_database/
lib.rs

1//! High-performance file database for PHP projects.
2//!
3//! This crate provides an efficient in-memory database for managing collections of PHP source files.
4//! It offers two complementary database types optimized for different access patterns:
5//!
6//! - [`Database`]: Mutable builder optimized for modifications (add, update, delete)
7//! - [`ReadDatabase`]: Immutable snapshot optimized for high-performance reads
8//!
9//! # Architecture
10//!
11//! The database uses a two-phase approach:
12//!
13//! 1. **Build Phase**: Use [`Database`] to load files, make modifications, and track changes
14//! 2. **Query Phase**: Convert to [`ReadDatabase`] via [`Database::read_only`] for fast lookups
15//!
16//! # Key Features
17//!
18//! - **Fast Lookups**: O(1) average-time access by ID, name, or filesystem path
19//! - **Change Tracking**: Record and batch apply file modifications via [`ChangeLog`]
20//! - **Deterministic Iteration**: [`ReadDatabase`] guarantees consistent iteration order
21//! - **Parallel Operations**: Concurrent file I/O and processing support
22//! - **Type Safety**: Strong typing with stable [`FileId`] handles
23//!
24//! # Common Workflow
25//!
26//! ## Loading Files
27//!
//! Use [`loader::DatabaseLoader`] to scan a project directory.
29//!
30//! The loader handles file discovery, exclusion patterns, and parallel loading.
31//!
32//! ## Querying Files
33//!
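//! Both database types implement [`DatabaseReader`] for uniform access, for example through
//! [`DatabaseReader::get`], [`DatabaseReader::get_by_name`], and [`DatabaseReader::files`].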
35//!
36//! ## Modifying Files
37//!
//! Use [`ChangeLog`] to batch modifications, then apply them with [`Database::commit`].
39//!
40//! Changes can be applied to the database and optionally written to disk in parallel.
41//!
42//! # Performance Characteristics
43//!
44//! ## Database (Mutable)
45//!
46//! - Add/Update/Delete: O(1) average
47//! - Lookup by ID/name: O(1) average
48//! - Iteration: Unordered
49//! - Memory: ~2x file count (maps for bidirectional lookup)
50//!
51//! ## ReadDatabase (Immutable)
52//!
53//! - Creation: O(n log n) for sorting
54//! - Lookup by ID/name/path: O(1) average
55//! - Iteration: Deterministic, sorted by FileId
56//! - Memory: ~3x file count (vector + 3 index maps)
57//!
58//! # Thread Safety
59//!
60//! [`Database`] is not thread-safe and should be used from a single thread during construction.
61//! [`ReadDatabase`] can be freely shared across threads for concurrent read access.
62
63use std::borrow::Cow;
64use std::path::Path;
65use std::path::PathBuf;
66use std::sync::Arc;
67
68use ahash::HashMap;
69use ahash::HashMapExt;
70use rayon::iter::IntoParallelIterator;
71use rayon::iter::ParallelIterator;
72use serde::Deserialize;
73use serde::Serialize;
74
75use crate::change::Change;
76use crate::change::ChangeLog;
77use crate::error::DatabaseError;
78use crate::exclusion::Exclusion;
79use crate::file::File;
80use crate::file::FileId;
81use crate::file::FileType;
82use crate::file::line_starts;
83use crate::operation::FilesystemOperation;
84
85mod utils;
86
87pub mod change;
88pub mod error;
89pub mod exclusion;
90pub mod file;
91pub mod loader;
92pub mod watcher;
93
94mod operation;
95
96/// Configuration for database loading and watching.
97#[derive(Debug, Clone, Serialize, Deserialize)]
98pub struct DatabaseConfiguration<'a> {
99    pub workspace: Cow<'a, Path>,
100    pub paths: Vec<Cow<'a, Path>>,
101    pub includes: Vec<Cow<'a, Path>>,
102    pub excludes: Vec<Exclusion<'a>>,
103    pub extensions: Vec<Cow<'a, str>>,
104}
105
106impl<'a> DatabaseConfiguration<'a> {
107    pub fn new(
108        workspace: &'a Path,
109        paths: Vec<&'a Path>,
110        includes: Vec<&'a Path>,
111        excludes: Vec<Exclusion<'a>>,
112        extensions: Vec<&'a str>,
113    ) -> Self {
114        let paths = canonicalize_paths(workspace, paths);
115        let includes = canonicalize_paths(workspace, includes);
116
117        let excludes = excludes
118            .into_iter()
119            .filter_map(|exclusion| match exclusion {
120                Exclusion::Path(p) => Some(if p.is_absolute() {
121                    Exclusion::Path(p)
122                } else {
123                    workspace.join(p).canonicalize().ok().map(Cow::Owned).map(Exclusion::Path)?
124                }),
125                Exclusion::Pattern(pat) => Some(Exclusion::Pattern(pat)),
126            })
127            .collect();
128
129        let extensions = extensions.into_iter().map(Cow::Borrowed).collect();
130
131        Self { workspace: Cow::Borrowed(workspace), paths, includes, excludes, extensions }
132    }
133
134    pub fn into_static(self) -> DatabaseConfiguration<'static> {
135        DatabaseConfiguration {
136            workspace: Cow::Owned(self.workspace.into_owned()),
137            paths: self.paths.into_iter().map(|p| Cow::Owned(p.into_owned())).collect(),
138            includes: self.includes.into_iter().map(|p| Cow::Owned(p.into_owned())).collect(),
139            excludes: self
140                .excludes
141                .into_iter()
142                .map(|e| match e {
143                    Exclusion::Path(p) => Exclusion::Path(Cow::Owned(p.into_owned())),
144                    Exclusion::Pattern(pat) => Exclusion::Pattern(Cow::Owned(pat.into_owned())),
145                })
146                .collect(),
147            extensions: self.extensions.into_iter().map(|s| Cow::Owned(s.into_owned())).collect(),
148        }
149    }
150}
151
152fn canonicalize_paths<'a>(workspace: &'a Path, paths: Vec<&'a Path>) -> Vec<Cow<'a, Path>> {
153    paths
154        .into_iter()
155        .filter_map(|p| {
156            Some(if p.is_absolute() {
157                Cow::Borrowed(p)
158            } else {
159                workspace
160                    .join(p)
161                    .canonicalize()
162                    .inspect_err(|_| tracing::warn!("Ignoring invalid or non-existent path `{}`", p.display()))
163                    .ok()
164                    .map(Cow::Owned)?
165            })
166        })
167        .collect()
168}
169
170/// Mutable database for managing project files with add/update/delete operations.
171#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct Database<'a> {
173    files: HashMap<Cow<'static, str>, Arc<File>>,
174    id_to_name: HashMap<FileId, Cow<'static, str>>,
175    pub(crate) configuration: DatabaseConfiguration<'a>,
176}
177
178/// Immutable, read-optimized snapshot of the database.
179#[derive(Debug)]
180pub struct ReadDatabase {
181    files: Vec<Arc<File>>,
182    id_to_index: HashMap<FileId, usize>,
183    name_to_index: HashMap<Cow<'static, str>, usize>,
184    path_to_index: HashMap<PathBuf, usize>,
185}
186
187impl<'a> Database<'a> {
188    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
189        Self { files: HashMap::default(), id_to_name: HashMap::default(), configuration }
190    }
191
192    pub fn single(file: File, configuration: DatabaseConfiguration<'a>) -> Self {
193        let mut db = Self::new(configuration);
194        db.add(file);
195        db
196    }
197
198    pub fn add(&mut self, file: File) -> FileId {
199        let name = file.name.clone();
200        let id = file.id;
201
202        if let Some(old_file) = self.files.insert(name.clone(), Arc::new(file)) {
203            self.id_to_name.remove(&old_file.id);
204        }
205
206        self.id_to_name.insert(id, name);
207
208        id
209    }
210
211    /// Updates a file's content in-place using its stable `FileId`.
212    ///
213    /// This recalculates derived data like file size, line endings, and `FileRevision`.
214    /// Returns `true` if a file with the given ID was found and updated.
215    pub fn update(&mut self, id: FileId, new_contents: Cow<'static, str>) -> bool {
216        if let Some(name) = self.id_to_name.get(&id)
217            && let Some(file) = self.files.get_mut(name)
218            && let Some(file) = Arc::get_mut(file)
219        {
220            file.contents = new_contents;
221            file.size = file.contents.len() as u32;
222            file.lines = line_starts(file.contents.as_ref()).collect();
223            return true;
224        }
225        false
226    }
227
228    /// Deletes a file from the database using its stable `FileId`.
229    ///
230    /// Returns `true` if a file with the given ID was found and removed.
231    pub fn delete(&mut self, id: FileId) -> bool {
232        if let Some(name) = self.id_to_name.remove(&id) { self.files.remove(&name).is_some() } else { false }
233    }
234
235    /// Commits a [`ChangeLog`], applying all its recorded operations to the database
236    /// and optionally writing them to the filesystem.
237    ///
238    /// # Arguments
239    ///
240    /// * `change_log`: The log of changes to apply.
241    /// * `write_to_disk`: If `true`, changes for files that have a filesystem
242    ///   path will be written to disk in parallel.
243    ///
244    /// # Errors
245    ///
246    /// Returns a [`DatabaseError`] if the log cannot be consumed or if any
247    /// filesystem operation fails.
248    pub fn commit(&mut self, change_log: ChangeLog, write_to_disk: bool) -> Result<(), DatabaseError> {
249        let changes = change_log.into_inner()?;
250        let mut fs_operations = if write_to_disk { Vec::new() } else { Vec::with_capacity(0) };
251
252        for change in changes {
253            match change {
254                Change::Add(file) => {
255                    if write_to_disk && let Some(path) = &file.path {
256                        fs_operations.push(FilesystemOperation::Write(path.clone(), file.contents.clone()));
257                    }
258
259                    self.add(file);
260                }
261                Change::Update(id, contents) => {
262                    if write_to_disk
263                        && let Ok(file) = self.get(&id)
264                        && let Some(path) = &file.path
265                    {
266                        fs_operations.push(FilesystemOperation::Write(path.clone(), contents.clone()));
267                    }
268
269                    self.update(id, contents);
270                }
271                Change::Delete(id) => {
272                    if write_to_disk
273                        && let Ok(file) = self.get(&id)
274                        && let Some(path) = &file.path
275                    {
276                        fs_operations.push(FilesystemOperation::Delete(path.clone()));
277                    }
278
279                    self.delete(id);
280                }
281            }
282        }
283
284        if write_to_disk {
285            fs_operations.into_par_iter().try_for_each(|op| -> Result<(), DatabaseError> { op.execute() })?;
286        }
287
288        Ok(())
289    }
290
291    /// Creates an independent, immutable snapshot of the database.
292    ///
293    /// This is a potentially expensive one-time operation as it **clones** all file
294    /// data. The resulting [`ReadDatabase`] is highly optimized for fast reads and
295    /// guarantees a deterministic iteration order. The original `Database` is not
296    /// consumed and can continue to be used.
297    pub fn read_only(&self) -> ReadDatabase {
298        let mut files_vec: Vec<Arc<File>> = self.files.values().cloned().collect();
299        files_vec.sort_unstable_by_key(|f| f.id);
300
301        let mut id_to_index = HashMap::with_capacity(files_vec.len());
302        let mut name_to_index = HashMap::with_capacity(files_vec.len());
303        let mut path_to_index = HashMap::with_capacity(files_vec.len());
304
305        for (index, file) in files_vec.iter().enumerate() {
306            id_to_index.insert(file.id, index);
307            name_to_index.insert(file.name.clone(), index);
308            if let Some(path) = &file.path {
309                path_to_index.insert(path.clone(), index);
310            }
311        }
312
313        ReadDatabase { files: files_vec, id_to_index, name_to_index, path_to_index }
314    }
315}
316
317impl ReadDatabase {
318    pub fn empty() -> Self {
319        Self {
320            files: Vec::with_capacity(0),
321            id_to_index: HashMap::with_capacity(0),
322            name_to_index: HashMap::with_capacity(0),
323            path_to_index: HashMap::with_capacity(0),
324        }
325    }
326
327    /// Creates a new `ReadDatabase` containing only a single file.
328    ///
329    /// This is a convenience constructor for situations, such as testing or
330    /// single-file tools, where an operation requires a [`DatabaseReader`]
331    /// implementation but only needs to be aware of one file.
332    ///
333    /// # Arguments
334    ///
335    /// * `file`: The single `File` to include in the database.
336    pub fn single(file: File) -> Self {
337        let mut id_to_index = HashMap::with_capacity(1);
338        let mut name_to_index = HashMap::with_capacity(1);
339        let mut path_to_index = HashMap::with_capacity(1);
340
341        id_to_index.insert(file.id, 0);
342        name_to_index.insert(file.name.clone(), 0);
343        if let Some(path) = &file.path {
344            path_to_index.insert(path.clone(), 0);
345        }
346
347        Self { files: vec![Arc::new(file)], id_to_index, name_to_index, path_to_index }
348    }
349}
350
351/// A universal interface for reading data from any database implementation.
352///
353/// This trait provides a common API for querying file data, abstracting over
354/// whether the underlying source is the mutable [`Database`] or the read-optimized
355/// [`ReadDatabase`]. This allows for writing generic code that can operate on either.
356pub trait DatabaseReader {
357    /// Retrieves a file's stable ID using its logical name.
358    fn get_id(&self, name: &str) -> Option<FileId>;
359
360    /// Retrieves a reference to a file using its stable `FileId`.
361    ///
362    /// # Errors
363    ///
364    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
365    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError>;
366
367    /// Retrieves a reference to a file using its stable `FileId`.
368    ///
369    /// # Errors
370    ///
371    /// Returns `DatabaseError::FileNotFound` if no file with the given ID exists.
372    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError>;
373
374    /// Retrieves a reference to a file using its logical name.
375    ///
376    /// # Errors
377    ///
378    /// Returns `DatabaseError::FileNotFound` if no file with the given name exists.
379    fn get_by_name(&self, name: &str) -> Result<Arc<File>, DatabaseError>;
380
381    /// Retrieves a reference to a file by its absolute filesystem path.
382    ///
383    /// # Errors
384    ///
385    /// Returns `DatabaseError::FileNotFound` if no file with the given path exists.
386    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError>;
387
388    /// Returns an iterator over all files in the database.
389    ///
390    /// The order is not guaranteed for `Database`, but is sorted by `FileId`
391    /// for `ReadDatabase`, providing deterministic iteration.
392    fn files(&self) -> impl Iterator<Item = Arc<File>>;
393
394    /// Returns an iterator over all files of a specific `FileType`.
395    fn files_with_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
396        self.files().filter(move |file| file.file_type == file_type)
397    }
398
399    /// Returns an iterator over all files that do not match a specific `FileType`.
400    fn files_without_type(&self, file_type: FileType) -> impl Iterator<Item = Arc<File>> {
401        self.files().filter(move |file| file.file_type != file_type)
402    }
403
404    /// Returns an iterator over the stable IDs of all files in the database.
405    fn file_ids(&self) -> impl Iterator<Item = FileId> {
406        self.files().map(|file| file.id)
407    }
408
409    /// Returns an iterator over the stable IDs of all files of a specific `FileType`.
410    fn file_ids_with_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
411        self.files_with_type(file_type).map(|file| file.id)
412    }
413
414    /// Returns an iterator over the stable IDs of all files that do not match a specific `FileType`.
415    fn file_ids_without_type(&self, file_type: FileType) -> impl Iterator<Item = FileId> {
416        self.files_without_type(file_type).map(|file| file.id)
417    }
418
419    /// Returns the total number of files in the database.
420    fn len(&self) -> usize;
421
422    /// Returns `true` if the database contains no files.
423    fn is_empty(&self) -> bool {
424        self.len() == 0
425    }
426}
427
428impl<'a> DatabaseReader for Database<'a> {
429    fn get_id(&self, name: &str) -> Option<FileId> {
430        self.files.get(name).map(|f| f.id)
431    }
432
433    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
434        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
435        let file = self.files.get(name).ok_or(DatabaseError::FileNotFound)?;
436
437        Ok(file.clone())
438    }
439
440    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
441        let name = self.id_to_name.get(id).ok_or(DatabaseError::FileNotFound)?;
442        self.files.get(name).map(|file| file.as_ref()).ok_or(DatabaseError::FileNotFound)
443    }
444
445    fn get_by_name(&self, name: &str) -> Result<Arc<File>, DatabaseError> {
446        self.files.get(name).cloned().ok_or(DatabaseError::FileNotFound)
447    }
448
449    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
450        self.files.values().find(|file| file.path.as_deref() == Some(path)).cloned().ok_or(DatabaseError::FileNotFound)
451    }
452
453    fn files(&self) -> impl Iterator<Item = Arc<File>> {
454        self.files.values().cloned()
455    }
456
457    fn len(&self) -> usize {
458        self.files.len()
459    }
460}
461
462impl DatabaseReader for ReadDatabase {
463    fn get_id(&self, name: &str) -> Option<FileId> {
464        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).map(|f| f.id)
465    }
466
467    fn get(&self, id: &FileId) -> Result<Arc<File>, DatabaseError> {
468        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
469
470        self.files.get(*index).cloned().ok_or(DatabaseError::FileNotFound)
471    }
472
473    fn get_ref(&self, id: &FileId) -> Result<&File, DatabaseError> {
474        let index = self.id_to_index.get(id).ok_or(DatabaseError::FileNotFound)?;
475
476        self.files.get(*index).map(|file| file.as_ref()).ok_or(DatabaseError::FileNotFound)
477    }
478
479    fn get_by_name(&self, name: &str) -> Result<Arc<File>, DatabaseError> {
480        self.name_to_index.get(name).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
481    }
482
483    fn get_by_path(&self, path: &Path) -> Result<Arc<File>, DatabaseError> {
484        self.path_to_index.get(path).and_then(|&i| self.files.get(i)).cloned().ok_or(DatabaseError::FileNotFound)
485    }
486
487    fn files(&self) -> impl Iterator<Item = Arc<File>> {
488        self.files.iter().cloned()
489    }
490
491    fn len(&self) -> usize {
492        self.files.len()
493    }
494}