mago_database/file.rs
1use std::borrow::Cow;
2use std::hash::DefaultHasher;
3use std::hash::Hash;
4use std::hash::Hasher;
5use std::path::Path;
6use std::path::PathBuf;
7
8use serde::Deserialize;
9use serde::Serialize;
10
11use crate::error::DatabaseError;
12use crate::utils::read_file;
13
14/// A stable, unique identifier for a file.
15///
16/// This ID is generated by hashing the file's logical name, ensuring it remains
17/// consistent across application runs and is unaffected by content modifications.
18#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
19#[repr(transparent)]
20pub struct FileId(u64);
21
22/// Distinguishes between the origins of source files.
23#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
24#[repr(u8)]
25pub enum FileType {
26 /// The file is part of the primary project source code.
27 /// These files typically reside on the filesystem and are actively developed.
28 Host,
29
30 /// The file belongs to a third-party dependency (e.g., a Composer package).
31 /// These files exist on the filesystem within the project (e.g., in `vendor/`)
32 /// but are not considered part of the primary source code.
33 Vendored,
34
35 /// The file represents a built-in language construct (e.g., a core PHP function or class).
36 /// These "files" do not exist on the filesystem and their content is typically
37 /// provided as pre-defined stubs for analysis.
38 Builtin,
39
40 /// The file is a user-provided patch that overrides type information for vendored or
41 /// built-in code with corrected PHPDoc / type declarations.
42 /// Like vendored files, patches are not actively analyzed, linted, or formatted,
43 /// but their metadata takes precedence over both vendored and built-in definitions.
44 Patch,
45}
46
47/// A file that's either stored on the host system's file system or in the vendored file system.
48///
49/// This struct encapsulates all the necessary information about a file, including its content,
50/// location, and metadata for change detection.
51#[derive(Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
52pub struct File {
53 /// A stable, unique identifier for the file, generated from its logical name.
54 /// This ID persists across application runs and content modifications.
55 pub id: FileId,
56
57 /// The logical name of the file, typically the path relative to the root of the project.
58 pub name: Cow<'static, [u8]>,
59
60 /// The absolute path of the file on the host's filesystem, if it exists there.
61 /// This will be `None` for vendored files that don't have a physical counterpart.
62 pub path: Option<PathBuf>,
63
64 /// The type of the file, indicating its origin.
65 pub file_type: FileType,
66
67 /// The contents of the file, if available.
68 pub contents: Cow<'static, [u8]>,
69
70 /// The size of the file's contents in bytes.
71 pub size: u32,
72
73 /// A vector containing the starting byte offsets of each line in `contents`.
74 /// The first line always starts at offset 0. This is useful for quickly
75 /// navigating to a specific line number without scanning the whole file.
76 pub lines: Vec<u32>,
77}
78
79pub trait HasFileId {
80 /// Returns the unique identifier of the file.
81 fn file_id(&self) -> FileId;
82}
83
84impl File {
85 /// Creates a new `File` instance from its name, type, path, and contents.
86 ///
87 /// It automatically calculates the size, and line start offsets.
88 #[inline]
89 #[must_use]
90 pub fn new(
91 name: Cow<'static, [u8]>,
92 file_type: FileType,
93 path: Option<PathBuf>,
94 contents: Cow<'static, [u8]>,
95 ) -> Self {
96 let id = FileId::new(&name);
97 let size = contents.len() as u32;
98 let lines = line_starts(contents.as_ref());
99
100 Self { id, name, path, file_type, contents, size, lines }
101 }
102
103 /// Creates a new `File` instance by reading its contents from the filesystem.
104 ///
105 /// This is the primary factory function for creating a `File` from a disk source.
106 /// It handles determining the file's logical name relative to the workspace,
107 /// reading its contents, and robustly handling non-UTF-8 text via lossy conversion.
108 ///
109 /// # Arguments
110 ///
111 /// * `workspace`: The root directory of the project, used to calculate the logical name.
112 /// * `path`: The absolute path to the file to read from disk.
113 /// * `file_type`: The [`FileType`] to assign to the created file.
114 ///
115 /// # Errors
116 ///
117 /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
118 #[inline(always)]
119 pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
120 read_file(workspace, path, file_type)
121 }
122
123 /// Creates an ephemeral, in-memory `File` from a name and content.
124 ///
125 /// This is a convenience method for situations like testing or formatting where
126 /// a full file context (e.g., a real path) is not required. It defaults to
127 /// `FileType::Host` and a `path` of `None`.
128 #[inline]
129 #[must_use]
130 pub fn ephemeral(name: Cow<'static, [u8]>, contents: Cow<'static, [u8]>) -> Self {
131 Self::new(name, FileType::Host, None, contents)
132 }
133
134 /// Retrieve the line number for the given byte offset.
135 ///
136 /// # Parameters
137 ///
138 /// - `offset`: The byte offset to retrieve the line number for.
139 ///
140 /// # Returns
141 ///
142 /// The line number for the given byte offset (0-based index).
143 #[inline]
144 #[must_use]
145 pub fn line_number(&self, offset: u32) -> u32 {
146 self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1) as u32
147 }
148
149 /// Retrieve the byte offset for the start of the given line.
150 ///
151 /// # Parameters
152 ///
153 /// - `line`: The line number to retrieve the start offset for.
154 ///
155 /// # Returns
156 ///
157 /// The byte offset for the start of the given line (0-based index).
158 #[inline]
159 #[must_use]
160 pub fn get_line_start_offset(&self, line: u32) -> Option<u32> {
161 self.lines.get(line as usize).copied()
162 }
163
164 /// Retrieve the byte offset for the end of the given line.
165 ///
166 /// # Parameters
167 ///
168 /// - `line`: The line number to retrieve the end offset for.
169 ///
170 /// # Returns
171 ///
172 /// The byte offset for the end of the given line (0-based index).
173 #[inline]
174 #[must_use]
175 pub fn get_line_end_offset(&self, line: u32) -> Option<u32> {
176 match self.lines.get(line as usize + 1) {
177 Some(&end) => Some(end - 1),
178 None if line as usize == self.lines.len() - 1 => Some(self.size),
179 _ => None,
180 }
181 }
182
183 /// Retrieve the column number for the given byte offset.
184 ///
185 /// # Parameters
186 ///
187 /// - `offset`: The byte offset to retrieve the column number for.
188 ///
189 /// # Returns
190 ///
191 /// The column number for the given byte offset (0-based index).
192 #[inline]
193 #[must_use]
194 pub fn column_number(&self, offset: u32) -> u32 {
195 let line = self.line_number(offset) as usize;
196
197 offset - self.lines[line]
198 }
199}
200
201impl FileType {
202 /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
203 #[inline]
204 #[must_use]
205 pub const fn is_host(self) -> bool {
206 matches!(self, FileType::Host)
207 }
208
209 /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
210 #[inline]
211 #[must_use]
212 pub const fn is_vendored(self) -> bool {
213 matches!(self, FileType::Vendored)
214 }
215
216 /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
217 #[inline]
218 #[must_use]
219 pub const fn is_builtin(self) -> bool {
220 matches!(self, FileType::Builtin)
221 }
222
223 /// Returns `true` if the file is a patch, meaning it overrides type information for vendored or built-in code.
224 #[must_use]
225 pub const fn is_patch(self) -> bool {
226 matches!(self, FileType::Patch)
227 }
228}
229
230impl FileId {
231 #[inline]
232 #[must_use]
233 pub fn new(logical_name: &[u8]) -> Self {
234 let mut hasher = DefaultHasher::new();
235 logical_name.hash(&mut hasher);
236 Self(hasher.finish())
237 }
238
239 #[inline]
240 #[must_use]
241 pub const fn zero() -> Self {
242 Self(0)
243 }
244
245 #[inline]
246 #[must_use]
247 pub const fn is_zero(self) -> bool {
248 self.0 == 0
249 }
250
251 #[inline]
252 #[must_use]
253 pub fn as_u64(self) -> u64 {
254 self.0
255 }
256}
257
258impl HasFileId for File {
259 #[inline]
260 fn file_id(&self) -> FileId {
261 self.id
262 }
263}
264
265impl std::fmt::Display for FileId {
266 #[inline]
267 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
268 write!(f, "{}", self.0)
269 }
270}
271
272/// Returns a vec over the starting byte offsets of each line in `source`.
273#[inline]
274pub(crate) fn line_starts(source: &[u8]) -> Vec<u32> {
275 // Heuristic: On the test corpus, the mean length is about 30 bytes, the median is 23.
276 // Since the whole vec will be small, we prefer slight over-allocation to avoid re-allocations
277 // in the common case
278 const LINE_WIDTH_HEURISTIC: usize = 20;
279
280 // Pre-allocate to avoid calling `realloc` thousands of times per file.
281 let mut lines = Vec::with_capacity(source.len() / LINE_WIDTH_HEURISTIC);
282 lines.push(0);
283
284 // Detect line ending style from the first \r or \n. Real files use one
285 // convention throughout, so we never need to handle mixed \r\n / bare \r.
286 match memchr::memchr2(b'\r', b'\n', source) {
287 // No line endings: single-line file, nothing more to push.
288 None => {}
289 // Old Mac (\r only): first line-ending char is a bare \r.
290 Some(cr) if source[cr] == b'\r' && source.get(cr + 1) != Some(&b'\n') => {
291 for pos in memchr::memchr_iter(b'\r', source) {
292 lines.push((pos + 1) as u32);
293 }
294 }
295 // Unix (\n only) or Windows (\r\n): \n marks every line start.
296 _ => {
297 for pos in memchr::memchr_iter(b'\n', source) {
298 lines.push((pos + 1) as u32);
299 }
300 }
301 }
302
303 lines
304}