Skip to main content

mago_database/
file.rs

1use std::borrow::Cow;
2use std::hash::DefaultHasher;
3use std::hash::Hash;
4use std::hash::Hasher;
5use std::path::Path;
6use std::path::PathBuf;
7
8use serde::Deserialize;
9use serde::Serialize;
10
11use crate::error::DatabaseError;
12use crate::utils::read_file;
13
/// A stable identifier for a file.
///
/// This ID is generated by hashing the file's logical name (see `FileId::new`), so it
/// depends only on that name: modifying the file's contents never changes its ID, and
/// two files with the same logical name share the same ID.
///
/// The identifier is a thin wrapper around a single `u64` (`#[repr(transparent)]`),
/// which makes it cheap to copy, compare, order, and use as a map key. Uniqueness
/// relies on the 64-bit hash not colliding for the set of names in use.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[repr(transparent)]
pub struct FileId(u64);
21
/// Distinguishes between the origins of source files.
///
/// The enum is stored as a single byte (`#[repr(u8)]`). Because the comparison traits
/// are derived, variants order by declaration: `Host < Vendored < Builtin`.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[repr(u8)]
pub enum FileType {
    /// The file is part of the primary project source code.
    /// These files typically reside on the filesystem and are actively developed.
    Host,

    /// The file belongs to a third-party dependency (e.g., a Composer package).
    /// These files exist on the filesystem within the project (e.g., in `vendor/`)
    /// but are not considered part of the primary source code.
    Vendored,

    /// The file represents a built-in language construct (e.g., a core PHP function or class).
    /// These "files" do not exist on the filesystem and their content is typically
    /// provided as pre-defined stubs for analysis.
    Builtin,
}
40
/// A source file known to the database, together with its contents.
///
/// A file may originate from the host project, from a vendored dependency, or from
/// built-in stubs; see [`FileType`]. This struct bundles the file's identity, its
/// optional location on disk, its full text, and a pre-computed table of line starts
/// used to translate byte offsets into line and column numbers.
///
/// Equality and hashing are derived, so they take every field into account,
/// including the full `contents`.
#[derive(Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct File {
    /// A stable identifier for the file, generated from its logical name.
    /// It does not depend on the file's contents.
    pub id: FileId,

    /// The logical name of the file, typically the path relative to the root of the project.
    pub name: Cow<'static, str>,

    /// The absolute path of the file on the host's filesystem, if it exists there.
    /// This is `None` for files without a physical counterpart, such as the
    /// in-memory files created by `File::ephemeral`.
    pub path: Option<PathBuf>,

    /// The type of the file, indicating its origin.
    pub file_type: FileType,

    /// The full text of the file.
    pub contents: Cow<'static, str>,

    /// The size of the file's contents in bytes.
    ///
    /// `File::new` sets this to `contents.len()` cast to `u32`.
    pub size: u32,

    /// A vector containing the starting byte offsets of each line in `contents`.
    /// The first line always starts at offset 0. This is useful for quickly
    /// navigating to a specific line number without scanning the whole file.
    pub lines: Vec<u32>,
}
72
/// Implemented by types that can report the [`FileId`] of the file they belong to.
pub trait HasFileId {
    /// Returns the unique identifier of the file.
    fn file_id(&self) -> FileId;
}
77
78impl File {
79    /// Creates a new `File` instance from its name, type, path, and contents.
80    ///
81    /// It automatically calculates the size, and line start offsets.
82    #[inline]
83    #[must_use]
84    pub fn new(
85        name: Cow<'static, str>,
86        file_type: FileType,
87        path: Option<PathBuf>,
88        contents: Cow<'static, str>,
89    ) -> Self {
90        let id = FileId::new(&name);
91        let size = contents.len() as u32;
92        let lines = line_starts(contents.as_ref());
93
94        Self { id, name, path, file_type, contents, size, lines }
95    }
96
97    /// Creates a new `File` instance by reading its contents from the filesystem.
98    ///
99    /// This is the primary factory function for creating a `File` from a disk source.
100    /// It handles determining the file's logical name relative to the workspace,
101    /// reading its contents, and robustly handling non-UTF-8 text via lossy conversion.
102    ///
103    /// # Arguments
104    ///
105    /// * `workspace`: The root directory of the project, used to calculate the logical name.
106    /// * `path`: The absolute path to the file to read from disk.
107    /// * `file_type`: The [`FileType`] to assign to the created file.
108    ///
109    /// # Errors
110    ///
111    /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
112    #[inline(always)]
113    pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
114        read_file(workspace, path, file_type)
115    }
116
117    /// Creates an ephemeral, in-memory `File` from a name and content.
118    ///
119    /// This is a convenience method for situations like testing or formatting where
120    /// a full file context (e.g., a real path) is not required. It defaults to
121    /// `FileType::Host` and a `path` of `None`.
122    #[inline]
123    #[must_use]
124    pub fn ephemeral(name: Cow<'static, str>, contents: Cow<'static, str>) -> Self {
125        Self::new(name, FileType::Host, None, contents)
126    }
127
128    /// Retrieve the line number for the given byte offset.
129    ///
130    /// # Parameters
131    ///
132    /// - `offset`: The byte offset to retrieve the line number for.
133    ///
134    /// # Returns
135    ///
136    /// The line number for the given byte offset (0-based index).
137    #[inline]
138    #[must_use]
139    pub fn line_number(&self, offset: u32) -> u32 {
140        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1) as u32
141    }
142
143    /// Retrieve the byte offset for the start of the given line.
144    ///
145    /// # Parameters
146    ///
147    /// - `line`: The line number to retrieve the start offset for.
148    ///
149    /// # Returns
150    ///
151    /// The byte offset for the start of the given line (0-based index).
152    #[inline]
153    #[must_use]
154    pub fn get_line_start_offset(&self, line: u32) -> Option<u32> {
155        self.lines.get(line as usize).copied()
156    }
157
158    /// Retrieve the byte offset for the end of the given line.
159    ///
160    /// # Parameters
161    ///
162    /// - `line`: The line number to retrieve the end offset for.
163    ///
164    /// # Returns
165    ///
166    /// The byte offset for the end of the given line (0-based index).
167    #[inline]
168    #[must_use]
169    pub fn get_line_end_offset(&self, line: u32) -> Option<u32> {
170        match self.lines.get(line as usize + 1) {
171            Some(&end) => Some(end - 1),
172            None if line as usize == self.lines.len() - 1 => Some(self.size),
173            _ => None,
174        }
175    }
176
177    /// Retrieve the column number for the given byte offset.
178    ///
179    /// # Parameters
180    ///
181    /// - `offset`: The byte offset to retrieve the column number for.
182    ///
183    /// # Returns
184    ///
185    /// The column number for the given byte offset (0-based index).
186    #[inline]
187    #[must_use]
188    pub fn column_number(&self, offset: u32) -> u32 {
189        let line = self.line_number(offset) as usize;
190
191        offset - self.lines[line]
192    }
193}
194
195impl FileType {
196    /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
197    #[inline]
198    #[must_use]
199    pub const fn is_host(self) -> bool {
200        matches!(self, FileType::Host)
201    }
202
203    /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
204    #[inline]
205    #[must_use]
206    pub const fn is_vendored(self) -> bool {
207        matches!(self, FileType::Vendored)
208    }
209
210    /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
211    #[inline]
212    #[must_use]
213    pub const fn is_builtin(self) -> bool {
214        matches!(self, FileType::Builtin)
215    }
216}
217
218impl FileId {
219    #[inline]
220    #[must_use]
221    pub fn new(logical_name: &str) -> Self {
222        let mut hasher = DefaultHasher::new();
223        logical_name.hash(&mut hasher);
224        Self(hasher.finish())
225    }
226
227    #[inline]
228    #[must_use]
229    pub const fn zero() -> Self {
230        Self(0)
231    }
232
233    #[inline]
234    #[must_use]
235    pub const fn is_zero(self) -> bool {
236        self.0 == 0
237    }
238
239    #[inline]
240    #[must_use]
241    pub fn as_u64(self) -> u64 {
242        self.0
243    }
244}
245
246impl HasFileId for File {
247    #[inline]
248    fn file_id(&self) -> FileId {
249        self.id
250    }
251}
252
253impl std::fmt::Display for FileId {
254    #[inline]
255    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
256        write!(f, "{}", self.0)
257    }
258}
259
260/// Returns a vec over the starting byte offsets of each line in `source`.
261#[inline]
262pub(crate) fn line_starts(source: &str) -> Vec<u32> {
263    // Heuristic: On the test corpus, the mean length is about 30 bytes, the median is 23.
264    // Since the whole vec will be small, we prefer slight over-allocation to avoid re-allocations
265    // in the common case
266    const LINE_WIDTH_HEURISTIC: usize = 20;
267
268    let bytes = source.as_bytes();
269
270    // Pre-allocate to avoid calling `realloc` thousands of times per file.
271    let mut lines = Vec::with_capacity(bytes.len() / LINE_WIDTH_HEURISTIC);
272    lines.push(0);
273
274    // Detect line ending style from the first \r or \n.  Real files use one
275    // convention throughout, so we never need to handle mixed \r\n / bare \r.
276    match memchr::memchr2(b'\r', b'\n', bytes) {
277        // No line endings: single-line file, nothing more to push.
278        None => {}
279        // Old Mac (\r only): first line-ending char is a bare \r.
280        Some(cr) if bytes[cr] == b'\r' && bytes.get(cr + 1) != Some(&b'\n') => {
281            for pos in memchr::memchr_iter(b'\r', bytes) {
282                lines.push((pos + 1) as u32);
283            }
284        }
285        // Unix (\n only) or Windows (\r\n): \n marks every line start.
286        _ => {
287            for pos in memchr::memchr_iter(b'\n', bytes) {
288                lines.push((pos + 1) as u32);
289            }
290        }
291    }
292
293    lines
294}