mago_database/
file.rs

1use std::hash::DefaultHasher;
2use std::hash::Hash;
3use std::hash::Hasher;
4use std::path::Path;
5use std::path::PathBuf;
6
7use serde::Deserialize;
8use serde::Serialize;
9
10use crate::error::DatabaseError;
11use crate::utils::read_file;
12
13/// A stable, unique identifier for a file.
14///
15/// This ID is generated by hashing the file's logical name, ensuring it remains
16/// consistent across application runs and is unaffected by content modifications.
17#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
18#[repr(transparent)]
19pub struct FileId(u64);
20
21/// Distinguishes between the origins of source files.
22#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
23#[repr(u8)]
24pub enum FileType {
25    /// The file is part of the primary project source code.
26    /// These files typically reside on the filesystem and are actively developed.
27    Host,
28
29    /// The file belongs to a third-party dependency (e.g., a Composer package).
30    /// These files exist on the filesystem within the project (e.g., in `vendor/`)
31    /// but are not considered part of the primary source code.
32    Vendored,
33
34    /// The file represents a built-in language construct (e.g., a core PHP function or class).
35    /// These "files" do not exist on the filesystem and their content is typically
36    /// provided as pre-defined stubs for analysis.
37    Builtin,
38}
39
40/// A file that's either stored on the host system's file system or in the vendored file system.
41///
42/// This struct encapsulates all the necessary information about a file, including its content,
43/// location, and metadata for change detection.
44#[derive(Clone, Debug, Eq, PartialEq, Hash)]
45pub struct File {
46    /// A stable, unique identifier for the file, generated from its logical name.
47    /// This ID persists across application runs and content modifications.
48    pub id: FileId,
49
50    /// The logical name of the file, typically the path relative to the root of the project.
51    pub name: String,
52
53    /// The absolute path of the file on the host's filesystem, if it exists there.
54    /// This will be `None` for vendored files that don't have a physical counterpart.
55    pub path: Option<PathBuf>,
56
57    /// The type of the file, indicating its origin.
58    pub file_type: FileType,
59
60    /// The contents of the file, if available.
61    pub contents: String,
62
63    /// The size of the file's contents in bytes.
64    pub size: usize,
65
66    /// A vector containing the starting byte offsets of each line in `contents`.
67    /// The first line always starts at offset 0. This is useful for quickly
68    /// navigating to a specific line number without scanning the whole file.
69    pub lines: Vec<usize>,
70}
71
72pub trait HasFileId {
73    /// Returns the unique identifier of the file.
74    fn file_id(&self) -> FileId;
75}
76
77impl File {
78    /// Creates a new `File` instance from its name, type, path, and contents.
79    ///
80    /// It automatically calculates the size, and line start offsets.
81    pub fn new(name: String, file_type: FileType, path: Option<PathBuf>, contents: String) -> Self {
82        let id = FileId::new(&name);
83        let size = contents.len();
84        let lines = line_starts(&contents).collect::<Vec<_>>();
85
86        Self { id, name, path, file_type, contents, size, lines }
87    }
88
89    /// Creates a new `File` instance by reading its contents from the filesystem.
90    ///
91    /// This is the primary factory function for creating a `File` from a disk source.
92    /// It handles determining the file's logical name relative to the workspace,
93    /// reading its contents, and robustly handling non-UTF-8 text via lossy conversion.
94    ///
95    /// # Arguments
96    ///
97    /// * `workspace`: The root directory of the project, used to calculate the logical name.
98    /// * `path`: The absolute path to the file to read from disk.
99    /// * `file_type`: The [`FileType`] to assign to the created file.
100    ///
101    /// # Errors
102    ///
103    /// Returns a [`DatabaseError::IOError`] if the file cannot be read from the disk.
104    pub fn read(workspace: &Path, path: &Path, file_type: FileType) -> Result<Self, DatabaseError> {
105        read_file(workspace, path, file_type)
106    }
107
108    /// Creates an ephemeral, in-memory `File` from a name and content.
109    ///
110    /// This is a convenience method for situations like testing or formatting where
111    /// a full file context (e.g., a real path) is not required. It defaults to
112    /// `FileType::Host` and a `path` of `None`.
113    pub fn ephemeral(name: String, contents: String) -> Self {
114        Self::new(name, FileType::Host, None, contents)
115    }
116
117    /// Retrieve the line number for the given byte offset.
118    ///
119    /// # Parameters
120    ///
121    /// - `offset`: The byte offset to retrieve the line number for.
122    ///
123    /// # Returns
124    ///
125    /// The line number for the given byte offset (0-based index).
126    #[inline]
127    pub fn line_number(&self, offset: usize) -> usize {
128        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1)
129    }
130
131    /// Retrieve the byte offset for the start of the given line.
132    ///
133    /// # Parameters
134    ///
135    /// - `line`: The line number to retrieve the start offset for.
136    ///
137    /// # Returns
138    ///
139    /// The byte offset for the start of the given line (0-based index).
140    pub fn get_line_start_offset(&self, line: usize) -> Option<usize> {
141        self.lines.get(line).copied()
142    }
143
144    /// Retrieve the byte offset for the end of the given line.
145    ///
146    /// # Parameters
147    ///
148    /// - `line`: The line number to retrieve the end offset for.
149    ///
150    /// # Returns
151    ///
152    /// The byte offset for the end of the given line (0-based index).
153    pub fn get_line_end_offset(&self, line: usize) -> Option<usize> {
154        match self.lines.get(line + 1) {
155            Some(&end) => Some(end - 1),
156            None if line == self.lines.len() - 1 => Some(self.size),
157            _ => None,
158        }
159    }
160
161    /// Retrieve the column number for the given byte offset.
162    ///
163    /// # Parameters
164    ///
165    /// - `offset`: The byte offset to retrieve the column number for.
166    ///
167    /// # Returns
168    ///
169    /// The column number for the given byte offset (0-based index).
170    #[inline]
171    pub fn column_number(&self, offset: usize) -> usize {
172        let line_start = self.lines.binary_search(&offset).unwrap_or_else(|next_line| self.lines[next_line - 1]);
173
174        offset - line_start
175    }
176}
177
178impl FileType {
179    /// Returns `true` if the file is a host file, meaning it is part of the project's source code.
180    pub const fn is_host(self) -> bool {
181        matches!(self, FileType::Host)
182    }
183
184    /// Returns `true` if the file is a vendored file, meaning it comes from an external library or dependency.
185    pub const fn is_vendored(self) -> bool {
186        matches!(self, FileType::Vendored)
187    }
188
189    /// Returns `true` if the file is a built-in file, meaning it represents a core language construct.
190    pub const fn is_builtin(self) -> bool {
191        matches!(self, FileType::Builtin)
192    }
193}
194
195impl HasFileId for File {
196    fn file_id(&self) -> FileId {
197        self.id
198    }
199}
200
201impl FileId {
202    pub fn new(logical_name: &str) -> Self {
203        let mut hasher = DefaultHasher::new();
204        logical_name.hash(&mut hasher);
205        Self(hasher.finish())
206    }
207}
208
209impl FileId {
210    pub const fn zero() -> Self {
211        Self(0)
212    }
213
214    pub const fn is_zero(self) -> bool {
215        self.0 == 0
216    }
217
218    #[must_use]
219    pub fn as_u64(self) -> u64 {
220        self.0
221    }
222}
223
224impl std::fmt::Display for FileId {
225    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
226        write!(f, "{}", self.0)
227    }
228}
229
230/// Returns an iterator over the starting byte offsets of each line in `source`.
231#[inline]
232pub(crate) fn line_starts(source: &str) -> impl Iterator<Item = usize> + '_ {
233    let bytes = source.as_bytes();
234
235    std::iter::once(0)
236        .chain(memchr::memchr_iter(b'\n', bytes).map(|i| if i > 0 && bytes[i - 1] == b'\r' { i } else { i + 1 }))
237}