mago_database/file.rs
1use std::borrow::Cow;
2use std::hash::DefaultHasher;
3use std::hash::Hash;
4use std::hash::Hasher;
5use std::path::Path;
6use std::path::PathBuf;
7
8use serde::Deserialize;
9use serde::Serialize;
10
11use crate::error::DatabaseError;
12use crate::utils::read_file;
13
14/// A stable, unique identifier for a file.
15///
16/// This ID is generated by hashing the file's logical name, ensuring it remains
17/// consistent across application runs and is unaffected by content modifications.
18#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
19#[repr(transparent)]
20pub struct FileId(u64);
21
22/// Distinguishes between the origins of source files.
23#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
24#[repr(u8)]
25pub enum FileType {
26 /// The file is part of the primary project source code.
27 /// These files typically reside on the filesystem and are actively developed.
28 Host,
29
30 /// The file belongs to a third-party dependency (e.g., a Composer package).
31 /// These files exist on the filesystem within the project (e.g., in `vendor/`)
32 /// but are not considered part of the primary source code.
33 Vendored,
34
35 /// The file represents a built-in language construct (e.g., a core PHP function or class).
36 /// These "files" do not exist on the filesystem and their content is typically
37 /// provided as pre-defined stubs for analysis.
38 Builtin,
39}
40
41/// A file that's either stored on the host system's file system or in the vendored file system.
42///
43/// This struct encapsulates all the necessary information about a file, including its content,
44/// location, and metadata for change detection.
45#[derive(Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
46pub struct File {
47 /// A stable, unique identifier for the file, generated from its logical name.
48 /// This ID persists across application runs and content modifications.
49 pub id: FileId,
50
51 /// The logical name of the file, typically the path relative to the root of the project.
52 pub name: Cow<'static, [u8]>,
53
54 /// The absolute path of the file on the host's filesystem, if it exists there.
55 /// This will be `None` for vendored files that don't have a physical counterpart.
56 pub path: Option<PathBuf>,
57
58 /// The type of the file, indicating its origin.
59 pub file_type: FileType,
60
61 /// The contents of the file, if available.
62 pub contents: Cow<'static, [u8]>,
63
64 /// The size of the file's contents in bytes.
65 pub size: u32,
66
67 /// A vector containing the starting byte offsets of each line in `contents`.
68 /// The first line always starts at offset 0. This is useful for quickly
69 /// navigating to a specific line number without scanning the whole file.
70 pub lines: Vec<u32>,
71}
72
73pub trait HasFileId {
74 /// Returns the unique identifier of the file.
75 fn file_id(&self) -> FileId;
76}
77
78impl File {
79 /// Creates a new `File` instance from its name, type, path, and contents.
80 ///
81 /// It automatically calculates the size, and line start offsets.
82 #[inline]
83 #[must_use]
84 pub fn new(
85 name: Cow<'static, [u8]>,
86 file_type: FileType,
87 path: Option<PathBuf>,
88 contents: Cow<'static, [u8]>,
89 ) -> Self {
90 let id = FileId::new(&name);
91 let size = contents.len() as u32;
92 let lines = line_starts(contents.as_ref());
93
94 Self { id, name, path, file_type, contents, size, lines }
95 }
96
97 /// Creates a new `File` instance by reading its contents from the filesystem.
98 ///
99 /// This is the primary factory function for creating a `File` from a disk source.
100 /// It handles determining the file's logical name relative to the workspace,
101 /// reading its contents, and robustly handling non-UTF-8 text via lossy conversion.
102 ///
103 /// # Arguments
104 ///
105 /// * `workspace`: The root directory of the project, used to calculate the logical name.
106 /// * `path`: The absolute path to the file to read from disk.
107 /// * `file_type`: The [`FileType`] to assign to the created file.
108 ///
109 /// # Errors
110 ///
111 /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
112 #[inline(always)]
113 pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
114 read_file(workspace, path, file_type)
115 }
116
117 /// Creates an ephemeral, in-memory `File` from a name and content.
118 ///
119 /// This is a convenience method for situations like testing or formatting where
120 /// a full file context (e.g., a real path) is not required. It defaults to
121 /// `FileType::Host` and a `path` of `None`.
122 #[inline]
123 #[must_use]
124 pub fn ephemeral(name: Cow<'static, [u8]>, contents: Cow<'static, [u8]>) -> Self {
125 Self::new(name, FileType::Host, None, contents)
126 }
127
128 /// Retrieve the line number for the given byte offset.
129 ///
130 /// # Parameters
131 ///
132 /// - `offset`: The byte offset to retrieve the line number for.
133 ///
134 /// # Returns
135 ///
136 /// The line number for the given byte offset (0-based index).
137 #[inline]
138 #[must_use]
139 pub fn line_number(&self, offset: u32) -> u32 {
140 self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1) as u32
141 }
142
143 /// Retrieve the byte offset for the start of the given line.
144 ///
145 /// # Parameters
146 ///
147 /// - `line`: The line number to retrieve the start offset for.
148 ///
149 /// # Returns
150 ///
151 /// The byte offset for the start of the given line (0-based index).
152 #[inline]
153 #[must_use]
154 pub fn get_line_start_offset(&self, line: u32) -> Option<u32> {
155 self.lines.get(line as usize).copied()
156 }
157
158 /// Retrieve the byte offset for the end of the given line.
159 ///
160 /// # Parameters
161 ///
162 /// - `line`: The line number to retrieve the end offset for.
163 ///
164 /// # Returns
165 ///
166 /// The byte offset for the end of the given line (0-based index).
167 #[inline]
168 #[must_use]
169 pub fn get_line_end_offset(&self, line: u32) -> Option<u32> {
170 match self.lines.get(line as usize + 1) {
171 Some(&end) => Some(end - 1),
172 None if line as usize == self.lines.len() - 1 => Some(self.size),
173 _ => None,
174 }
175 }
176
177 /// Retrieve the column number for the given byte offset.
178 ///
179 /// # Parameters
180 ///
181 /// - `offset`: The byte offset to retrieve the column number for.
182 ///
183 /// # Returns
184 ///
185 /// The column number for the given byte offset (0-based index).
186 #[inline]
187 #[must_use]
188 pub fn column_number(&self, offset: u32) -> u32 {
189 let line = self.line_number(offset) as usize;
190
191 offset - self.lines[line]
192 }
193}
194
195impl FileType {
196 /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
197 #[inline]
198 #[must_use]
199 pub const fn is_host(self) -> bool {
200 matches!(self, FileType::Host)
201 }
202
203 /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
204 #[inline]
205 #[must_use]
206 pub const fn is_vendored(self) -> bool {
207 matches!(self, FileType::Vendored)
208 }
209
210 /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
211 #[inline]
212 #[must_use]
213 pub const fn is_builtin(self) -> bool {
214 matches!(self, FileType::Builtin)
215 }
216}
217
218impl FileId {
219 #[inline]
220 #[must_use]
221 pub fn new(logical_name: &[u8]) -> Self {
222 let mut hasher = DefaultHasher::new();
223 logical_name.hash(&mut hasher);
224 Self(hasher.finish())
225 }
226
227 #[inline]
228 #[must_use]
229 pub const fn zero() -> Self {
230 Self(0)
231 }
232
233 #[inline]
234 #[must_use]
235 pub const fn is_zero(self) -> bool {
236 self.0 == 0
237 }
238
239 #[inline]
240 #[must_use]
241 pub fn as_u64(self) -> u64 {
242 self.0
243 }
244}
245
246impl HasFileId for File {
247 #[inline]
248 fn file_id(&self) -> FileId {
249 self.id
250 }
251}
252
253impl std::fmt::Display for FileId {
254 #[inline]
255 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
256 write!(f, "{}", self.0)
257 }
258}
259
260/// Returns a vec over the starting byte offsets of each line in `source`.
261#[inline]
262pub(crate) fn line_starts(source: &[u8]) -> Vec<u32> {
263 // Heuristic: On the test corpus, the mean length is about 30 bytes, the median is 23.
264 // Since the whole vec will be small, we prefer slight over-allocation to avoid re-allocations
265 // in the common case
266 const LINE_WIDTH_HEURISTIC: usize = 20;
267
268 // Pre-allocate to avoid calling `realloc` thousands of times per file.
269 let mut lines = Vec::with_capacity(source.len() / LINE_WIDTH_HEURISTIC);
270 lines.push(0);
271
272 // Detect line ending style from the first \r or \n. Real files use one
273 // convention throughout, so we never need to handle mixed \r\n / bare \r.
274 match memchr::memchr2(b'\r', b'\n', source) {
275 // No line endings: single-line file, nothing more to push.
276 None => {}
277 // Old Mac (\r only): first line-ending char is a bare \r.
278 Some(cr) if source[cr] == b'\r' && source.get(cr + 1) != Some(&b'\n') => {
279 for pos in memchr::memchr_iter(b'\r', source) {
280 lines.push((pos + 1) as u32);
281 }
282 }
283 // Unix (\n only) or Windows (\r\n): \n marks every line start.
284 _ => {
285 for pos in memchr::memchr_iter(b'\n', source) {
286 lines.push((pos + 1) as u32);
287 }
288 }
289 }
290
291 lines
292}