mago_source/
lib.rs

1use std::path::PathBuf;
2use std::sync::Arc;
3
4use ahash::HashMap;
5use parking_lot::RwLock;
6use serde::Deserialize;
7use serde::Serialize;
8
9use mago_interner::StringIdentifier;
10use mago_interner::ThreadedInterner;
11
12use crate::error::SourceError;
13
14pub mod error;
15
/// Represents the category of the source for a PHP construct.
///
/// This enum categorizes the origin of a source, based on where it is defined.
/// The categories are useful for distinguishing between user-written code, vendor-provided libraries,
/// and built-in PHP features.
///
/// The derived `Default` is `UserDefined` (see the `#[default]` attribute below), and the derived
/// ordering follows declaration order: `BuiltIn < External < UserDefined`.
///
/// # Variants
///
/// - `BuiltIn`: Represents a construct that is part of PHP's core or extension libraries.
/// - `External`: Represents a construct defined in a vendor-provided or third-party library.
/// - `UserDefined`: Represents a construct written by the user or part of the current project.
#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub enum SourceCategory {
    /// Represents a PHP construct that is part of the PHP core or extension libraries.
    BuiltIn,

    /// Represents a PHP construct defined in vendor-provided or third-party libraries.
    External,

    /// Represents a PHP construct written by the user or part of the current project.
    #[default]
    UserDefined,
}
39
/// A unique identifier for a source.
///
/// Field `0` is the interned name of the source and field `1` is the [`SourceCategory`] it was
/// registered under. Two identifiers are equal only when both the name and the category match.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[repr(C)]
pub struct SourceIdentifier(pub StringIdentifier, pub SourceCategory);
44
/// Represents a source file.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct Source {
    /// The identifier (interned name and category) of this source.
    pub identifier: SourceIdentifier,
    /// The file path backing this source, or `None` when it was created from in-memory content.
    pub path: Option<PathBuf>,
    /// The interned content of the source.
    pub content: StringIdentifier,
    /// The length of the content, in bytes.
    pub size: usize,
    /// The byte offset at which each line starts, in ascending order; the first entry is `0`.
    pub lines: Vec<usize>,
}
54
/// Trait for items that have a source.
pub trait HasSource {
    /// Returns the identifier of the source this item is associated with.
    fn source(&self) -> SourceIdentifier;
}
59
/// Internal structure to store source information before full loading.
///
/// An entry starts out either with a path and no content (`SourceManager::insert_path`) or with
/// content and no path (`SourceManager::insert_content`). `SourceManager::load` fills in the
/// content of path-backed entries, and `SourceManager::write` replaces it.
#[derive(Debug)]
struct SourceEntry {
    /// The file path (if any).
    path: Option<PathBuf>,
    /// The content, if already loaded, plus its size and line-start positions.
    ///
    /// The tuple is `(interned content, content length in bytes, byte offset of each line start)`.
    content: Option<(StringIdentifier, usize, Vec<usize>)>,
}
68
/// Internal container for our maps. We keep two maps:
///  - one from SourceIdentifier → SourceEntry
///  - an auxiliary index from interned name → SourceIdentifier
///
/// The name index is keyed by name only, so when the same name is registered under several
/// categories it holds whichever identifier was inserted into it last.
#[derive(Debug)]
struct SourceManagerInner {
    /// Every registered source, keyed by its full identifier (name and category).
    sources: HashMap<SourceIdentifier, SourceEntry>,
    /// Lookup from an interned source name to an identifier registered under that name.
    sources_by_name: HashMap<StringIdentifier, SourceIdentifier>,
}
77
/// A manager for sources.
///
/// This version replaces DashMap with a single inner structure protected by a
/// high-performance `RwLock` (from the parking_lot crate) and uses `ahash::HashMap` for speed.
///
/// The maps live behind an `Arc`, so clones of a manager share the same set of sources.
#[derive(Clone, Debug)]
pub struct SourceManager {
    /// The interner used for source names and content.
    interner: ThreadedInterner,
    /// Inner maps protected by a lock.
    inner: Arc<RwLock<SourceManagerInner>>,
}
89
90/// Methods for SourceCategory.
91impl SourceCategory {
92    #[inline(always)]
93    pub const fn is_built_in(&self) -> bool {
94        matches!(self, Self::BuiltIn)
95    }
96
97    #[inline(always)]
98    pub const fn is_external(&self) -> bool {
99        matches!(self, Self::External)
100    }
101
102    #[inline(always)]
103    pub const fn is_user_defined(&self) -> bool {
104        matches!(self, Self::UserDefined)
105    }
106}
107
108/// Methods for SourceIdentifier.
109impl SourceIdentifier {
110    #[inline(always)]
111    pub fn dummy() -> Self {
112        Self(StringIdentifier::empty(), SourceCategory::UserDefined)
113    }
114
115    /// Returns the interned string identifier.
116    #[inline(always)]
117    pub const fn value(&self) -> StringIdentifier {
118        self.0
119    }
120
121    /// Returns the source category.
122    #[inline(always)]
123    pub const fn category(&self) -> SourceCategory {
124        self.1
125    }
126}
127/// Methods for Source.
128impl Source {
129    /// Creates a [`Source`] from a single piece of `content` without needing
130    /// a full [`SourceManager`].
131    ///
132    /// This is particularly useful for quick parsing or one-off analyses
133    /// where you do not need to manage multiple sources.
134    ///
135    /// # Arguments
136    ///
137    /// * `interner` - A reference to a [`ThreadedInterner`] used to intern
138    ///   the `content` and store string identifiers.
139    /// * `name` - A logical identifier for this source, such as `"inline"`
140    ///   or `"my_script.php"`.
141    /// * `content` - The actual PHP (or other) code string.
142    #[inline(always)]
143    pub fn standalone(interner: &ThreadedInterner, name: &str, content: &str) -> Self {
144        let lines: Vec<_> = line_starts(content).collect();
145        let size = content.len();
146        let content_id = interner.intern(content);
147
148        Self {
149            identifier: SourceIdentifier(interner.intern(name), SourceCategory::UserDefined),
150            path: None,
151            content: content_id,
152            size,
153            lines,
154        }
155    }
156
157    /// Retrieve the line number for the given byte offset.
158    ///
159    /// # Parameters
160    ///
161    /// - `offset`: The byte offset to retrieve the line number for.
162    ///
163    /// # Returns
164    ///
165    /// The line number for the given byte offset (0-based index).
166    #[inline(always)]
167    pub fn line_number(&self, offset: usize) -> usize {
168        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1)
169    }
170
171    /// Retrieve the byte offset for the start of the given line.
172    ///
173    /// # Parameters
174    ///
175    /// - `line`: The line number to retrieve the start offset for.
176    ///
177    /// # Returns
178    ///
179    /// The byte offset for the start of the given line (0-based index).
180    pub fn get_line_start_offset(&self, line: usize) -> Option<usize> {
181        self.lines.get(line).copied()
182    }
183
184    /// Retrieve the column number for the given byte offset.
185    ///
186    /// # Parameters
187    ///
188    /// - `offset`: The byte offset to retrieve the column number for.
189    ///
190    /// # Returns
191    ///
192    /// The column number for the given byte offset (0-based index).
193    #[inline(always)]
194    pub fn column_number(&self, offset: usize) -> usize {
195        let line_start = self.lines.binary_search(&offset).unwrap_or_else(|next_line| self.lines[next_line - 1]);
196
197        offset - line_start
198    }
199}
200
201impl SourceManager {
202    /// Creates a new source manager.
203    #[inline(always)]
204    pub fn new(interner: ThreadedInterner) -> Self {
205        Self {
206            interner,
207            inner: Arc::new(RwLock::new(SourceManagerInner {
208                sources: HashMap::default(),
209                sources_by_name: HashMap::default(),
210            })),
211        }
212    }
213
214    /// Inserts a source with the given name and file path.
215    #[inline(always)]
216    pub fn insert_path(&self, name: impl AsRef<str>, path: PathBuf, category: SourceCategory) -> SourceIdentifier {
217        let name_str = name.as_ref();
218        let name_id = self.interner.intern(name_str);
219        let source_id = SourceIdentifier(name_id, category);
220
221        {
222            let inner = self.inner.read();
223            if inner.sources.contains_key(&source_id) {
224                return source_id;
225            }
226        }
227
228        let mut inner = self.inner.write();
229        // Double-check to avoid duplicate insertion.
230        if inner.sources.contains_key(&source_id) {
231            return source_id;
232        }
233        inner.sources.insert(source_id, SourceEntry { path: Some(path), content: None });
234        inner.sources_by_name.insert(name_id, source_id);
235        source_id
236    }
237
238    /// Inserts a source with the given name and content.
239    #[inline(always)]
240    pub fn insert_content(
241        &self,
242        name: impl AsRef<str>,
243        content: impl AsRef<str>,
244        category: SourceCategory,
245    ) -> SourceIdentifier {
246        let name_str = name.as_ref();
247        let content_str = content.as_ref();
248        let name_id = self.interner.intern(name_str);
249
250        {
251            let inner = self.inner.read();
252            if let Some(&source_id) = inner.sources_by_name.get(&name_id) {
253                return source_id;
254            }
255        }
256
257        let lines: Vec<_> = line_starts(content_str).collect();
258        let size = content_str.len();
259        let content_id = self.interner.intern(content_str);
260        let source_id = SourceIdentifier(name_id, category);
261
262        let mut inner = self.inner.write();
263        if let Some(&existing) = inner.sources_by_name.get(&name_id) {
264            return existing;
265        }
266        inner.sources.insert(source_id, SourceEntry { path: None, content: Some((content_id, size, lines)) });
267        inner.sources_by_name.insert(name_id, source_id);
268        source_id
269    }
270
271    /// Returns whether the manager contains a source with the given identifier.
272    #[inline(always)]
273    pub fn contains(&self, source_id: &SourceIdentifier) -> bool {
274        let inner = self.inner.read();
275        inner.sources.contains_key(source_id)
276    }
277
278    /// Returns all source identifiers.
279    #[inline(always)]
280    pub fn source_ids(&self) -> Vec<SourceIdentifier> {
281        let inner = self.inner.read();
282        inner.sources.keys().cloned().collect()
283    }
284
285    /// Returns source identifiers for the given category.
286    #[inline(always)]
287    pub fn source_ids_for_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
288        let inner = self.inner.read();
289        inner.sources.keys().filter(|id| id.category() == category).cloned().collect()
290    }
291
292    /// Returns source identifiers for categories other than the given one.
293    #[inline(always)]
294    pub fn source_ids_except_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
295        let inner = self.inner.read();
296        inner.sources.keys().filter(|id| id.category() != category).cloned().collect()
297    }
298
299    /// Loads the source for the given identifier.
300    ///
301    /// If the source content is already loaded, it is returned immediately.
302    /// Otherwise the file is read from disk, processed, and cached.
303    #[inline(always)]
304    pub fn load(&self, source_id: &SourceIdentifier) -> Result<Source, SourceError> {
305        // First, try to read without locking for update.
306        {
307            let inner = self.inner.read();
308            if let Some(entry) = inner.sources.get(source_id) {
309                if let Some((content, size, ref lines)) = entry.content {
310                    return Ok(Source {
311                        identifier: *source_id,
312                        path: entry.path.clone(),
313                        content,
314                        size,
315                        lines: lines.clone(),
316                    });
317                }
318            }
319        }
320
321        // Retrieve the file path (must exist if content is not loaded).
322        let path = {
323            let inner = self.inner.read();
324            let entry = inner.sources.get(source_id).ok_or(SourceError::UnavailableSource(*source_id))?;
325
326            entry.path.clone().ok_or(SourceError::UnavailableSource(*source_id))?
327        };
328
329        // Perform file I/O outside the lock.
330        let bytes = std::fs::read(&path).map_err(SourceError::IOError)?;
331        let content_str = match String::from_utf8(bytes) {
332            Ok(s) => s,
333            Err(err) => {
334                let s = err.into_bytes();
335                let s = String::from_utf8_lossy(&s).into_owned();
336                tracing::warn!("Source '{}' contains invalid UTF-8 sequence; behavior is undefined.", path.display());
337                s
338            }
339        };
340        let lines: Vec<_> = line_starts(&content_str).collect();
341        let size = content_str.len();
342        let content_id = self.interner.intern(&content_str);
343
344        // Update the entry under a write lock.
345        {
346            let mut inner = self.inner.write();
347            if let Some(entry) = inner.sources.get_mut(source_id) {
348                // Check again in case another thread updated it meanwhile.
349                if entry.content.is_none() {
350                    entry.content = Some((content_id, size, lines.clone()));
351                }
352                Ok(Source { identifier: *source_id, path: entry.path.clone(), content: content_id, size, lines })
353            } else {
354                Err(SourceError::UnavailableSource(*source_id))
355            }
356        }
357    }
358
359    /// Writes updated content for the source with the given identifier.
360    #[inline(always)]
361    pub fn write(&self, source_id: SourceIdentifier, new_content: impl AsRef<str>) -> Result<(), SourceError> {
362        let new_content_str = new_content.as_ref();
363        let new_content_id = self.interner.intern(new_content_str);
364        let new_lines: Vec<_> = line_starts(new_content_str).collect();
365        let new_size = new_content_str.len();
366
367        let path_opt = {
368            let mut inner = self.inner.write();
369            let entry = inner.sources.get_mut(&source_id).ok_or(SourceError::UnavailableSource(source_id))?;
370            if let Some((old_content, _, _)) = entry.content {
371                if old_content == new_content_id {
372                    return Ok(());
373                }
374            }
375            entry.content = Some((new_content_id, new_size, new_lines));
376            entry.path.clone()
377        };
378
379        // If the source has an associated file, update it on disk.
380        if let Some(ref path) = path_opt {
381            std::fs::write(path, self.interner.lookup(&new_content_id)).map_err(SourceError::IOError)?;
382        }
383
384        Ok(())
385    }
386
387    /// Returns the number of sources.
388    #[inline(always)]
389    pub fn len(&self) -> usize {
390        let inner = self.inner.read();
391        inner.sources.len()
392    }
393
394    /// Returns true if there are no sources.
395    #[inline(always)]
396    pub fn is_empty(&self) -> bool {
397        let inner = self.inner.read();
398        inner.sources.is_empty()
399    }
400}
401
402impl<T: HasSource> HasSource for Box<T> {
403    #[inline(always)]
404    fn source(&self) -> SourceIdentifier {
405        self.as_ref().source()
406    }
407}
408
409/// Returns an iterator over the starting byte offsets of each line in `source`.
410#[inline(always)]
411fn line_starts(source: &str) -> impl Iterator<Item = usize> + '_ {
412    std::iter::once(0).chain(memchr::memchr_iter(b'\n', source.as_bytes()).map(|i| i + 1))
413}