Skip to main content

mago_database/
file.rs

1use std::borrow::Cow;
2use std::hash::DefaultHasher;
3use std::hash::Hash;
4use std::hash::Hasher;
5use std::path::Path;
6use std::path::PathBuf;
7
8use serde::Deserialize;
9use serde::Serialize;
10
11use crate::error::DatabaseError;
12use crate::utils::read_file;
13
14/// A stable, unique identifier for a file.
15///
16/// This ID is generated by hashing the file's logical name, ensuring it remains
17/// consistent across application runs and is unaffected by content modifications.
18#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
19#[repr(transparent)]
20pub struct FileId(u64);
21
22/// Distinguishes between the origins of source files.
23#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
24#[repr(u8)]
25pub enum FileType {
26    /// The file is part of the primary project source code.
27    /// These files typically reside on the filesystem and are actively developed.
28    Host,
29
30    /// The file belongs to a third-party dependency (e.g., a Composer package).
31    /// These files exist on the filesystem within the project (e.g., in `vendor/`)
32    /// but are not considered part of the primary source code.
33    Vendored,
34
35    /// The file represents a built-in language construct (e.g., a core PHP function or class).
36    /// These "files" do not exist on the filesystem and their content is typically
37    /// provided as pre-defined stubs for analysis.
38    Builtin,
39
40    /// The file is a user-provided patch that overrides type information for vendored or
41    /// built-in code with corrected PHPDoc / type declarations.
42    /// Like vendored files, patches are not actively analyzed, linted, or formatted,
43    /// but their metadata takes precedence over both vendored and built-in definitions.
44    Patch,
45}
46
47/// A file that's either stored on the host system's file system or in the vendored file system.
48///
49/// This struct encapsulates all the necessary information about a file, including its content,
50/// location, and metadata for change detection.
51#[derive(Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
52pub struct File {
53    /// A stable, unique identifier for the file, generated from its logical name.
54    /// This ID persists across application runs and content modifications.
55    pub id: FileId,
56
57    /// The logical name of the file, typically the path relative to the root of the project.
58    pub name: Cow<'static, [u8]>,
59
60    /// The absolute path of the file on the host's filesystem, if it exists there.
61    /// This will be `None` for vendored files that don't have a physical counterpart.
62    pub path: Option<PathBuf>,
63
64    /// The type of the file, indicating its origin.
65    pub file_type: FileType,
66
67    /// The contents of the file, if available.
68    pub contents: Cow<'static, [u8]>,
69
70    /// The size of the file's contents in bytes.
71    pub size: u32,
72
73    /// A vector containing the starting byte offsets of each line in `contents`.
74    /// The first line always starts at offset 0. This is useful for quickly
75    /// navigating to a specific line number without scanning the whole file.
76    pub lines: Vec<u32>,
77}
78
79pub trait HasFileId {
80    /// Returns the unique identifier of the file.
81    fn file_id(&self) -> FileId;
82}
83
84impl File {
85    /// Creates a new `File` instance from its name, type, path, and contents.
86    ///
87    /// It automatically calculates the size, and line start offsets.
88    #[inline]
89    #[must_use]
90    pub fn new(
91        name: Cow<'static, [u8]>,
92        file_type: FileType,
93        path: Option<PathBuf>,
94        contents: Cow<'static, [u8]>,
95    ) -> Self {
96        let id = FileId::new(&name);
97        let size = contents.len() as u32;
98        let lines = line_starts(contents.as_ref());
99
100        Self { id, name, path, file_type, contents, size, lines }
101    }
102
103    /// Creates a new `File` instance by reading its contents from the filesystem.
104    ///
105    /// This is the primary factory function for creating a `File` from a disk source.
106    /// It handles determining the file's logical name relative to the workspace,
107    /// reading its contents, and robustly handling non-UTF-8 text via lossy conversion.
108    ///
109    /// # Arguments
110    ///
111    /// * `workspace`: The root directory of the project, used to calculate the logical name.
112    /// * `path`: The absolute path to the file to read from disk.
113    /// * `file_type`: The [`FileType`] to assign to the created file.
114    ///
115    /// # Errors
116    ///
117    /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
118    #[inline(always)]
119    pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
120        read_file(workspace, path, file_type)
121    }
122
123    /// Creates an ephemeral, in-memory `File` from a name and content.
124    ///
125    /// This is a convenience method for situations like testing or formatting where
126    /// a full file context (e.g., a real path) is not required. It defaults to
127    /// `FileType::Host` and a `path` of `None`.
128    #[inline]
129    #[must_use]
130    pub fn ephemeral(name: Cow<'static, [u8]>, contents: Cow<'static, [u8]>) -> Self {
131        Self::new(name, FileType::Host, None, contents)
132    }
133
134    /// Retrieve the line number for the given byte offset.
135    ///
136    /// # Parameters
137    ///
138    /// - `offset`: The byte offset to retrieve the line number for.
139    ///
140    /// # Returns
141    ///
142    /// The line number for the given byte offset (0-based index).
143    #[inline]
144    #[must_use]
145    pub fn line_number(&self, offset: u32) -> u32 {
146        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1) as u32
147    }
148
149    /// Retrieve the byte offset for the start of the given line.
150    ///
151    /// # Parameters
152    ///
153    /// - `line`: The line number to retrieve the start offset for.
154    ///
155    /// # Returns
156    ///
157    /// The byte offset for the start of the given line (0-based index).
158    #[inline]
159    #[must_use]
160    pub fn get_line_start_offset(&self, line: u32) -> Option<u32> {
161        self.lines.get(line as usize).copied()
162    }
163
164    /// Retrieve the byte offset for the end of the given line.
165    ///
166    /// # Parameters
167    ///
168    /// - `line`: The line number to retrieve the end offset for.
169    ///
170    /// # Returns
171    ///
172    /// The byte offset for the end of the given line (0-based index).
173    #[inline]
174    #[must_use]
175    pub fn get_line_end_offset(&self, line: u32) -> Option<u32> {
176        match self.lines.get(line as usize + 1) {
177            Some(&end) => Some(end - 1),
178            None if line as usize == self.lines.len() - 1 => Some(self.size),
179            _ => None,
180        }
181    }
182
183    /// Retrieve the column number for the given byte offset.
184    ///
185    /// # Parameters
186    ///
187    /// - `offset`: The byte offset to retrieve the column number for.
188    ///
189    /// # Returns
190    ///
191    /// The column number for the given byte offset (0-based index).
192    #[inline]
193    #[must_use]
194    pub fn column_number(&self, offset: u32) -> u32 {
195        let line = self.line_number(offset) as usize;
196
197        offset - self.lines[line]
198    }
199}
200
201impl FileType {
202    /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
203    #[inline]
204    #[must_use]
205    pub const fn is_host(self) -> bool {
206        matches!(self, FileType::Host)
207    }
208
209    /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
210    #[inline]
211    #[must_use]
212    pub const fn is_vendored(self) -> bool {
213        matches!(self, FileType::Vendored)
214    }
215
216    /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
217    #[inline]
218    #[must_use]
219    pub const fn is_builtin(self) -> bool {
220        matches!(self, FileType::Builtin)
221    }
222
223    /// Returns `true` if the file is a patch, meaning it overrides type information for vendored or built-in code.
224    #[must_use]
225    pub const fn is_patch(self) -> bool {
226        matches!(self, FileType::Patch)
227    }
228}
229
230impl FileId {
231    #[inline]
232    #[must_use]
233    pub fn new(logical_name: &[u8]) -> Self {
234        let mut hasher = DefaultHasher::new();
235        logical_name.hash(&mut hasher);
236        Self(hasher.finish())
237    }
238
239    #[inline]
240    #[must_use]
241    pub const fn zero() -> Self {
242        Self(0)
243    }
244
245    #[inline]
246    #[must_use]
247    pub const fn is_zero(self) -> bool {
248        self.0 == 0
249    }
250
251    #[inline]
252    #[must_use]
253    pub fn as_u64(self) -> u64 {
254        self.0
255    }
256}
257
258impl HasFileId for File {
259    #[inline]
260    fn file_id(&self) -> FileId {
261        self.id
262    }
263}
264
265impl std::fmt::Display for FileId {
266    #[inline]
267    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
268        write!(f, "{}", self.0)
269    }
270}
271
272/// Returns a vec over the starting byte offsets of each line in `source`.
273#[inline]
274pub(crate) fn line_starts(source: &[u8]) -> Vec<u32> {
275    // Heuristic: On the test corpus, the mean length is about 30 bytes, the median is 23.
276    // Since the whole vec will be small, we prefer slight over-allocation to avoid re-allocations
277    // in the common case
278    const LINE_WIDTH_HEURISTIC: usize = 20;
279
280    // Pre-allocate to avoid calling `realloc` thousands of times per file.
281    let mut lines = Vec::with_capacity(source.len() / LINE_WIDTH_HEURISTIC);
282    lines.push(0);
283
284    // Detect line ending style from the first \r or \n.  Real files use one
285    // convention throughout, so we never need to handle mixed \r\n / bare \r.
286    match memchr::memchr2(b'\r', b'\n', source) {
287        // No line endings: single-line file, nothing more to push.
288        None => {}
289        // Old Mac (\r only): first line-ending char is a bare \r.
290        Some(cr) if source[cr] == b'\r' && source.get(cr + 1) != Some(&b'\n') => {
291            for pos in memchr::memchr_iter(b'\r', source) {
292                lines.push((pos + 1) as u32);
293            }
294        }
295        // Unix (\n only) or Windows (\r\n): \n marks every line start.
296        _ => {
297            for pos in memchr::memchr_iter(b'\n', source) {
298                lines.push((pos + 1) as u32);
299            }
300        }
301    }
302
303    lines
304}