mago_database/file.rs
1use std::borrow::Cow;
2use std::hash::DefaultHasher;
3use std::hash::Hash;
4use std::hash::Hasher;
5use std::path::Path;
6use std::path::PathBuf;
7
8use serde::Deserialize;
9use serde::Serialize;
10
11use crate::error::DatabaseError;
12use crate::utils::read_file;
13
14/// A stable, unique identifier for a file.
15///
16/// This ID is generated by hashing the file's logical name, ensuring it remains
17/// consistent across application runs and is unaffected by content modifications.
18#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
19#[repr(transparent)]
20pub struct FileId(u64);
21
22/// Distinguishes between the origins of source files.
23#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
24#[repr(u8)]
25pub enum FileType {
26 /// The file is part of the primary project source code.
27 /// These files typically reside on the filesystem and are actively developed.
28 Host,
29
30 /// The file belongs to a third-party dependency (e.g., a Composer package).
31 /// These files exist on the filesystem within the project (e.g., in `vendor/`)
32 /// but are not considered part of the primary source code.
33 Vendored,
34
35 /// The file represents a built-in language construct (e.g., a core PHP function or class).
36 /// These "files" do not exist on the filesystem and their content is typically
37 /// provided as pre-defined stubs for analysis.
38 Builtin,
39}
40
41/// A file that's either stored on the host system's file system or in the vendored file system.
42///
43/// This struct encapsulates all the necessary information about a file, including its content,
44/// location, and metadata for change detection.
45#[derive(Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
46pub struct File {
47 /// A stable, unique identifier for the file, generated from its logical name.
48 /// This ID persists across application runs and content modifications.
49 pub id: FileId,
50
51 /// The logical name of the file, typically the path relative to the root of the project.
52 pub name: Cow<'static, str>,
53
54 /// The absolute path of the file on the host's filesystem, if it exists there.
55 /// This will be `None` for vendored files that don't have a physical counterpart.
56 pub path: Option<PathBuf>,
57
58 /// The type of the file, indicating its origin.
59 pub file_type: FileType,
60
61 /// The contents of the file, if available.
62 pub contents: Cow<'static, str>,
63
64 /// The size of the file's contents in bytes.
65 pub size: u32,
66
67 /// A vector containing the starting byte offsets of each line in `contents`.
68 /// The first line always starts at offset 0. This is useful for quickly
69 /// navigating to a specific line number without scanning the whole file.
70 pub lines: Vec<u32>,
71}
72
73pub trait HasFileId {
74 /// Returns the unique identifier of the file.
75 fn file_id(&self) -> FileId;
76}
77
78impl File {
79 /// Creates a new `File` instance from its name, type, path, and contents.
80 ///
81 /// It automatically calculates the size, and line start offsets.
82 #[must_use]
83 pub fn new(
84 name: Cow<'static, str>,
85 file_type: FileType,
86 path: Option<PathBuf>,
87 contents: Cow<'static, str>,
88 ) -> Self {
89 let id = FileId::new(&name);
90 let size = contents.len() as u32;
91 let lines = line_starts(contents.as_ref());
92
93 Self { id, name, path, file_type, contents, size, lines }
94 }
95
96 /// Creates a new `File` instance by reading its contents from the filesystem.
97 ///
98 /// This is the primary factory function for creating a `File` from a disk source.
99 /// It handles determining the file's logical name relative to the workspace,
100 /// reading its contents, and robustly handling non-UTF-8 text via lossy conversion.
101 ///
102 /// # Arguments
103 ///
104 /// * `workspace`: The root directory of the project, used to calculate the logical name.
105 /// * `path`: The absolute path to the file to read from disk.
106 /// * `file_type`: The [`FileType`] to assign to the created file.
107 ///
108 /// # Errors
109 ///
110 /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
111 #[inline(always)]
112 pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
113 read_file(workspace, path, file_type)
114 }
115
116 /// Creates an ephemeral, in-memory `File` from a name and content.
117 ///
118 /// This is a convenience method for situations like testing or formatting where
119 /// a full file context (e.g., a real path) is not required. It defaults to
120 /// `FileType::Host` and a `path` of `None`.
121 #[must_use]
122 pub fn ephemeral(name: Cow<'static, str>, contents: Cow<'static, str>) -> Self {
123 Self::new(name, FileType::Host, None, contents)
124 }
125
126 /// Retrieve the line number for the given byte offset.
127 ///
128 /// # Parameters
129 ///
130 /// - `offset`: The byte offset to retrieve the line number for.
131 ///
132 /// # Returns
133 ///
134 /// The line number for the given byte offset (0-based index).
135 #[inline]
136 #[must_use]
137 pub fn line_number(&self, offset: u32) -> u32 {
138 self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1) as u32
139 }
140
141 /// Retrieve the byte offset for the start of the given line.
142 ///
143 /// # Parameters
144 ///
145 /// - `line`: The line number to retrieve the start offset for.
146 ///
147 /// # Returns
148 ///
149 /// The byte offset for the start of the given line (0-based index).
150 #[must_use]
151 pub fn get_line_start_offset(&self, line: u32) -> Option<u32> {
152 self.lines.get(line as usize).copied()
153 }
154
155 /// Retrieve the byte offset for the end of the given line.
156 ///
157 /// # Parameters
158 ///
159 /// - `line`: The line number to retrieve the end offset for.
160 ///
161 /// # Returns
162 ///
163 /// The byte offset for the end of the given line (0-based index).
164 #[must_use]
165 pub fn get_line_end_offset(&self, line: u32) -> Option<u32> {
166 match self.lines.get(line as usize + 1) {
167 Some(&end) => Some(end - 1),
168 None if line as usize == self.lines.len() - 1 => Some(self.size),
169 _ => None,
170 }
171 }
172
173 /// Retrieve the column number for the given byte offset.
174 ///
175 /// # Parameters
176 ///
177 /// - `offset`: The byte offset to retrieve the column number for.
178 ///
179 /// # Returns
180 ///
181 /// The column number for the given byte offset (0-based index).
182 #[inline]
183 #[must_use]
184 pub fn column_number(&self, offset: u32) -> u32 {
185 let line_start =
186 self.lines.binary_search(&offset).unwrap_or_else(|next_line| self.lines[next_line - 1] as usize);
187
188 offset - line_start as u32
189 }
190}
191
192impl FileType {
193 /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
194 #[must_use]
195 pub const fn is_host(self) -> bool {
196 matches!(self, FileType::Host)
197 }
198
199 /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
200 #[must_use]
201 pub const fn is_vendored(self) -> bool {
202 matches!(self, FileType::Vendored)
203 }
204
205 /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
206 #[must_use]
207 pub const fn is_builtin(self) -> bool {
208 matches!(self, FileType::Builtin)
209 }
210}
211
212impl FileId {
213 #[must_use]
214 pub fn new(logical_name: &str) -> Self {
215 let mut hasher = DefaultHasher::new();
216 logical_name.hash(&mut hasher);
217 Self(hasher.finish())
218 }
219
220 #[must_use]
221 pub const fn zero() -> Self {
222 Self(0)
223 }
224
225 #[must_use]
226 pub const fn is_zero(self) -> bool {
227 self.0 == 0
228 }
229
230 #[must_use]
231 pub fn as_u64(self) -> u64 {
232 self.0
233 }
234}
235
236impl HasFileId for File {
237 fn file_id(&self) -> FileId {
238 self.id
239 }
240}
241
242impl std::fmt::Display for FileId {
243 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
244 write!(f, "{}", self.0)
245 }
246}
247
248/// Returns a vec over the starting byte offsets of each line in `source`.
249#[inline]
250pub(crate) fn line_starts(source: &str) -> Vec<u32> {
251 // Heuristic: Average line of code is ~40 bytes.
252 const LINE_WIDTH_HEURISTIC: usize = 40;
253
254 let bytes = source.as_bytes();
255
256 // Pre-allocate to avoid calling `realloc` thousands of times per file.
257 let mut lines = Vec::with_capacity(bytes.len() / LINE_WIDTH_HEURISTIC);
258 lines.push(0);
259
260 for pos in memchr::memchr_iter(b'\n', bytes) {
261 let next_start = if pos > 0 && bytes[pos - 1] == b'\r' { pos } else { pos + 1 };
262
263 lines.push(next_start as u32);
264 }
265
266 lines
267}