mago_source/
lib.rs

1use std::path::PathBuf;
2use std::sync::Arc;
3
4use ahash::HashMap;
5use parking_lot::RwLock;
6use serde::Deserialize;
7use serde::Serialize;
8
9use mago_interner::StringIdentifier;
10use mago_interner::ThreadedInterner;
11
12use crate::error::SourceError;
13
14pub mod error;
15
16/// Represents the category of the source for a PHP construct.
17///
18/// This enum categorizes the origin of a source, based on where it is are defined.
19/// The categories are useful for distinguishing between user-written code, vendor-provided libraries,
20/// and built-in PHP features.
21///
22/// # Variants
23///
24/// - `BuiltIn`: Represents a construct that is part of PHP's core or extension libraries.
25/// - `External`: Represents a construct defined in a vendor-provided or third-party library.
26/// - `UserDefined`: Represents a construct written by the user or part of the current project.
27#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
28pub enum SourceCategory {
29    /// Represents a PHP construct that is part of the PHP core or extension libraries.
30    BuiltIn,
31
32    /// Represents a PHP construct defined in vendor-provided or third-party libraries.
33    External,
34
35    /// Represents a PHP construct written by the user or part of the current project.
36    #[default]
37    UserDefined,
38}
39
40/// A unique identifier for a source.
41#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
42#[repr(C)]
43pub struct SourceIdentifier(pub StringIdentifier, pub SourceCategory);
44
45/// Represents a source file.
46#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
47pub struct Source {
48    pub identifier: SourceIdentifier,
49    pub path: Option<PathBuf>,
50    pub content: StringIdentifier,
51    pub size: usize,
52    pub lines: Vec<usize>,
53}
54
55/// Trait for items that have a source.
56pub trait HasSource {
57    fn source(&self) -> SourceIdentifier;
58}
59
60/// Internal structure to store source information before full loading.
61#[derive(Debug)]
62struct SourceEntry {
63    /// The file path (if any).
64    path: Option<PathBuf>,
65    /// The content, if already loaded, plus its size and line-start positions.
66    content: Option<(StringIdentifier, usize, Vec<usize>)>,
67}
68
69/// Internal container for our maps. We keep two maps:
70///  - one from SourceIdentifier → SourceEntry
71///  - an auxiliary index from interned name → SourceIdentifier
72#[derive(Debug)]
73struct SourceManagerInner {
74    sources: HashMap<SourceIdentifier, SourceEntry>,
75    sources_by_name: HashMap<StringIdentifier, SourceIdentifier>,
76}
77
78/// A manager for sources.
79///
80/// This version replaces DashMap with a single inner structure protected by a
81/// high-performance `RwLock` (from the parking_lot crate) and uses AHashMap for speed.
82#[derive(Clone, Debug)]
83pub struct SourceManager {
84    /// The interner used for source names and content.
85    interner: ThreadedInterner,
86    /// Inner maps protected by a lock.
87    inner: Arc<RwLock<SourceManagerInner>>,
88}
89
90/// Methods for SourceCategory.
91impl SourceCategory {
92    #[inline]
93    pub const fn is_built_in(&self) -> bool {
94        matches!(self, Self::BuiltIn)
95    }
96
97    #[inline]
98    pub const fn is_external(&self) -> bool {
99        matches!(self, Self::External)
100    }
101
102    #[inline]
103    pub const fn is_user_defined(&self) -> bool {
104        matches!(self, Self::UserDefined)
105    }
106
107    #[inline]
108    pub const fn as_str(&self) -> &'static str {
109        match self {
110            Self::BuiltIn => "built-in",
111            Self::External => "external",
112            Self::UserDefined => "user defined",
113        }
114    }
115}
116
117/// Methods for SourceIdentifier.
118impl SourceIdentifier {
119    #[inline]
120    pub fn dummy() -> Self {
121        Self(StringIdentifier::empty(), SourceCategory::UserDefined)
122    }
123
124    /// Returns the interned string identifier.
125    #[inline]
126    pub const fn value(&self) -> StringIdentifier {
127        self.0
128    }
129
130    /// Returns the source category.
131    #[inline]
132    pub const fn category(&self) -> SourceCategory {
133        self.1
134    }
135}
136/// Methods for Source.
137impl Source {
138    /// Creates a [`Source`] from a single piece of `content` without needing
139    /// a full [`SourceManager`].
140    ///
141    /// This is particularly useful for quick parsing or one-off analyses
142    /// where you do not need to manage multiple sources.
143    ///
144    /// # Arguments
145    ///
146    /// * `interner` - A reference to a [`ThreadedInterner`] used to intern
147    ///   the `content` and store string identifiers.
148    /// * `name` - A logical identifier for this source, such as `"inline"`
149    ///   or `"my_script.php"`.
150    /// * `content` - The actual PHP (or other) code string.
151    #[inline]
152    pub fn standalone(interner: &ThreadedInterner, name: &str, content: &str) -> Self {
153        let lines: Vec<_> = line_starts(content).collect();
154        let size = content.len();
155        let content_id = interner.intern(content);
156
157        Self {
158            identifier: SourceIdentifier(interner.intern(name), SourceCategory::UserDefined),
159            path: None,
160            content: content_id,
161            size,
162            lines,
163        }
164    }
165
166    /// Retrieve the line number for the given byte offset.
167    ///
168    /// # Parameters
169    ///
170    /// - `offset`: The byte offset to retrieve the line number for.
171    ///
172    /// # Returns
173    ///
174    /// The line number for the given byte offset (0-based index).
175    #[inline]
176    pub fn line_number(&self, offset: usize) -> usize {
177        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1)
178    }
179
180    /// Retrieve the byte offset for the start of the given line.
181    ///
182    /// # Parameters
183    ///
184    /// - `line`: The line number to retrieve the start offset for.
185    ///
186    /// # Returns
187    ///
188    /// The byte offset for the start of the given line (0-based index).
189    pub fn get_line_start_offset(&self, line: usize) -> Option<usize> {
190        self.lines.get(line).copied()
191    }
192
193    /// Retrieve the byte offset for the end of the given line.
194    ///
195    /// # Parameters
196    ///
197    /// - `line`: The line number to retrieve the end offset for.
198    ///
199    /// # Returns
200    ///
201    /// The byte offset for the end of the given line (0-based index).
202    pub fn get_line_end_offset(&self, line: usize) -> Option<usize> {
203        match self.lines.get(line + 1) {
204            Some(&end) => Some(end - 1),
205            None if line == self.lines.len() - 1 => Some(self.size),
206            _ => None,
207        }
208    }
209
210    /// Retrieve the column number for the given byte offset.
211    ///
212    /// # Parameters
213    ///
214    /// - `offset`: The byte offset to retrieve the column number for.
215    ///
216    /// # Returns
217    ///
218    /// The column number for the given byte offset (0-based index).
219    #[inline]
220    pub fn column_number(&self, offset: usize) -> usize {
221        let line_start = self.lines.binary_search(&offset).unwrap_or_else(|next_line| self.lines[next_line - 1]);
222
223        offset - line_start
224    }
225}
226
227impl SourceManager {
228    /// Creates a new source manager.
229    #[inline]
230    pub fn new(interner: ThreadedInterner) -> Self {
231        Self {
232            interner,
233            inner: Arc::new(RwLock::new(SourceManagerInner {
234                sources: HashMap::default(),
235                sources_by_name: HashMap::default(),
236            })),
237        }
238    }
239
240    /// Inserts a source with the given name and file path.
241    #[inline]
242    pub fn insert_path(&self, name: impl AsRef<str>, path: PathBuf, category: SourceCategory) -> SourceIdentifier {
243        let name_str = name.as_ref();
244        let name_id = self.interner.intern(name_str);
245        let source_id = SourceIdentifier(name_id, category);
246
247        {
248            let inner = self.inner.read();
249            if inner.sources.contains_key(&source_id) {
250                return source_id;
251            }
252        }
253
254        let mut inner = self.inner.write();
255        // Double-check to avoid duplicate insertion.
256        if inner.sources.contains_key(&source_id) {
257            return source_id;
258        }
259        inner.sources.insert(source_id, SourceEntry { path: Some(path), content: None });
260        inner.sources_by_name.insert(name_id, source_id);
261        source_id
262    }
263
264    /// Inserts a source with the given name and content.
265    #[inline]
266    pub fn insert_content(
267        &self,
268        name: impl AsRef<str>,
269        content: impl AsRef<str>,
270        category: SourceCategory,
271    ) -> SourceIdentifier {
272        let name_str = name.as_ref();
273        let content_str = content.as_ref();
274        let name_id = self.interner.intern(name_str);
275
276        {
277            let inner = self.inner.read();
278            if let Some(&source_id) = inner.sources_by_name.get(&name_id) {
279                return source_id;
280            }
281        }
282
283        let lines: Vec<_> = line_starts(content_str).collect();
284        let size = content_str.len();
285        let content_id = self.interner.intern(content_str);
286        let source_id = SourceIdentifier(name_id, category);
287
288        let mut inner = self.inner.write();
289        if let Some(&existing) = inner.sources_by_name.get(&name_id) {
290            return existing;
291        }
292        inner.sources.insert(source_id, SourceEntry { path: None, content: Some((content_id, size, lines)) });
293        inner.sources_by_name.insert(name_id, source_id);
294        source_id
295    }
296
297    /// Returns whether the manager contains a source with the given identifier.
298    #[inline]
299    pub fn contains(&self, source_id: &SourceIdentifier) -> bool {
300        let inner = self.inner.read();
301        inner.sources.contains_key(source_id)
302    }
303
304    /// Returns all source identifiers.
305    #[inline]
306    pub fn source_ids(&self) -> Vec<SourceIdentifier> {
307        let inner = self.inner.read();
308        inner.sources.keys().cloned().collect()
309    }
310
311    /// Returns source identifiers for the given category.
312    #[inline]
313    pub fn source_ids_for_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
314        let inner = self.inner.read();
315        inner.sources.keys().filter(|id| id.category() == category).cloned().collect()
316    }
317
318    /// Returns source identifiers for categories other than the given one.
319    #[inline]
320    pub fn source_ids_except_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
321        let inner = self.inner.read();
322        inner.sources.keys().filter(|id| id.category() != category).cloned().collect()
323    }
324
325    /// Loads the source for the given identifier.
326    ///
327    /// If the source content is already loaded, it is returned immediately.
328    /// Otherwise the file is read from disk, processed, and cached.
329    #[inline]
330    pub fn load(&self, source_id: &SourceIdentifier) -> Result<Source, SourceError> {
331        let path = {
332            let inner = self.inner.read();
333            let entry = inner.sources.get(source_id).ok_or(SourceError::UnavailableSource(*source_id))?;
334
335            // First, try to read without locking for update.
336            if let Some((content, size, ref lines)) = entry.content {
337                return Ok(Source {
338                    identifier: *source_id,
339                    path: entry.path.clone(),
340                    content,
341                    size,
342                    lines: lines.clone(),
343                });
344            }
345
346            // Retrieve the file path (must exist if content is not loaded).
347            entry.path.clone().ok_or(SourceError::UnavailableSource(*source_id))?
348        };
349
350        // Perform file I/O outside the lock.
351        let bytes = std::fs::read(&path).map_err(SourceError::IOError)?;
352        let content_str = match String::from_utf8(bytes) {
353            Ok(s) => s,
354            Err(err) => {
355                let s = err.into_bytes();
356                let s = String::from_utf8_lossy(&s).into_owned();
357                if source_id.category().is_user_defined() {
358                    tracing::debug!(
359                        "Source '{}' contains invalid UTF-8 sequence; behavior is undefined.",
360                        path.display()
361                    );
362                } else {
363                    tracing::info!(
364                        "Source '{}' contains invalid UTF-8 sequence; behavior is undefined.",
365                        path.display()
366                    );
367                }
368
369                s
370            }
371        };
372        let lines: Vec<_> = line_starts(&content_str).collect();
373        let size = content_str.len();
374        let content_id = self.interner.intern(&content_str);
375
376        // Update the entry under a write lock.
377        {
378            let mut inner = self.inner.write();
379            if let Some(entry) = inner.sources.get_mut(source_id) {
380                // Check again in case another thread updated it meanwhile.
381                if entry.content.is_none() {
382                    entry.content = Some((content_id, size, lines.clone()));
383                }
384                Ok(Source { identifier: *source_id, path: entry.path.clone(), content: content_id, size, lines })
385            } else {
386                Err(SourceError::UnavailableSource(*source_id))
387            }
388        }
389    }
390
391    /// Writes updated content for the source with the given identifier.
392    #[inline]
393    pub fn write(&self, source_id: SourceIdentifier, new_content: impl AsRef<str>) -> Result<(), SourceError> {
394        let new_content_str = new_content.as_ref();
395        let new_content_id = self.interner.intern(new_content_str);
396        let new_lines: Vec<_> = line_starts(new_content_str).collect();
397        let new_size = new_content_str.len();
398
399        let path_opt = {
400            let mut inner = self.inner.write();
401            let entry = inner.sources.get_mut(&source_id).ok_or(SourceError::UnavailableSource(source_id))?;
402            if let Some((old_content, _, _)) = entry.content
403                && old_content == new_content_id
404            {
405                return Ok(());
406            }
407            entry.content = Some((new_content_id, new_size, new_lines));
408            entry.path.clone()
409        };
410
411        // If the source has an associated file, update it on disk.
412        if let Some(ref path) = path_opt {
413            std::fs::write(path, self.interner.lookup(&new_content_id)).map_err(SourceError::IOError)?;
414        }
415
416        Ok(())
417    }
418
419    /// Returns the number of sources.
420    #[inline]
421    pub fn len(&self) -> usize {
422        let inner = self.inner.read();
423        inner.sources.len()
424    }
425
426    /// Returns true if there are no sources.
427    #[inline]
428    pub fn is_empty(&self) -> bool {
429        let inner = self.inner.read();
430        inner.sources.is_empty()
431    }
432}
433
434impl<T: HasSource> HasSource for Box<T> {
435    #[inline]
436    fn source(&self) -> SourceIdentifier {
437        self.as_ref().source()
438    }
439}
440
441/// Returns an iterator over the starting byte offsets of each line in `source`.
442#[inline]
443fn line_starts(source: &str) -> impl Iterator<Item = usize> + '_ {
444    let bytes = source.as_bytes();
445
446    std::iter::once(0)
447        .chain(memchr::memchr_iter(b'\n', bytes).map(|i| if i > 0 && bytes[i - 1] == b'\r' { i } else { i + 1 }))
448}