mago_source/
lib.rs

1use std::path::PathBuf;
2use std::sync::Arc;
3
4use ahash::HashMap;
5use parking_lot::RwLock;
6use serde::Deserialize;
7use serde::Serialize;
8
9use mago_interner::StringIdentifier;
10use mago_interner::ThreadedInterner;
11
12use crate::error::SourceError;
13
14pub mod error;
15
16/// Represents the category of the source for a PHP construct.
17///
18/// This enum categorizes the origin of a source, based on where it is are defined.
19/// The categories are useful for distinguishing between user-written code, vendor-provided libraries,
20/// and built-in PHP features.
21///
22/// # Variants
23///
24/// - `BuiltIn`: Represents a construct that is part of PHP's core or extension libraries.
25/// - `External`: Represents a construct defined in a vendor-provided or third-party library.
26/// - `UserDefined`: Represents a construct written by the user or part of the current project.
27#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
28pub enum SourceCategory {
29    /// Represents a PHP construct that is part of the PHP core or extension libraries.
30    BuiltIn,
31
32    /// Represents a PHP construct defined in vendor-provided or third-party libraries.
33    External,
34
35    /// Represents a PHP construct written by the user or part of the current project.
36    #[default]
37    UserDefined,
38}
39
40/// A unique identifier for a source.
41#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
42#[repr(C)]
43pub struct SourceIdentifier(pub StringIdentifier, pub SourceCategory);
44
45/// Represents a source file.
46#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
47pub struct Source {
48    pub identifier: SourceIdentifier,
49    pub path: Option<PathBuf>,
50    pub content: StringIdentifier,
51    pub size: usize,
52    pub lines: Vec<usize>,
53}
54
55/// Trait for items that have a source.
56pub trait HasSource {
57    fn source(&self) -> SourceIdentifier;
58}
59
60/// Internal structure to store source information before full loading.
61#[derive(Debug)]
62struct SourceEntry {
63    /// The file path (if any).
64    path: Option<PathBuf>,
65    /// The content, if already loaded, plus its size and line-start positions.
66    content: Option<(StringIdentifier, usize, Vec<usize>)>,
67}
68
69/// Internal container for our maps. We keep two maps:
70///  - one from SourceIdentifier → SourceEntry
71///  - an auxiliary index from interned name → SourceIdentifier
72#[derive(Debug)]
73struct SourceManagerInner {
74    sources: HashMap<SourceIdentifier, SourceEntry>,
75    sources_by_name: HashMap<StringIdentifier, SourceIdentifier>,
76}
77
78/// A manager for sources.
79///
80/// This version replaces DashMap with a single inner structure protected by a
81/// high-performance `RwLock` (from the parking_lot crate) and uses AHashMap for speed.
82#[derive(Clone, Debug)]
83pub struct SourceManager {
84    /// The interner used for source names and content.
85    interner: ThreadedInterner,
86    /// Inner maps protected by a lock.
87    inner: Arc<RwLock<SourceManagerInner>>,
88}
89
90/// Methods for SourceCategory.
91impl SourceCategory {
92    #[inline(always)]
93    pub const fn is_built_in(&self) -> bool {
94        matches!(self, Self::BuiltIn)
95    }
96
97    #[inline(always)]
98    pub const fn is_external(&self) -> bool {
99        matches!(self, Self::External)
100    }
101
102    #[inline(always)]
103    pub const fn is_user_defined(&self) -> bool {
104        matches!(self, Self::UserDefined)
105    }
106}
107
108/// Methods for SourceIdentifier.
109impl SourceIdentifier {
110    #[inline(always)]
111    pub fn dummy() -> Self {
112        Self(StringIdentifier::empty(), SourceCategory::UserDefined)
113    }
114
115    /// Returns the interned string identifier.
116    #[inline(always)]
117    pub const fn value(&self) -> StringIdentifier {
118        self.0
119    }
120
121    /// Returns the source category.
122    #[inline(always)]
123    pub const fn category(&self) -> SourceCategory {
124        self.1
125    }
126}
127/// Methods for Source.
128impl Source {
129    /// Creates a [`Source`] from a single piece of `content` without needing
130    /// a full [`SourceManager`].
131    ///
132    /// This is particularly useful for quick parsing or one-off analyses
133    /// where you do not need to manage multiple sources.
134    ///
135    /// # Arguments
136    ///
137    /// * `interner` - A reference to a [`ThreadedInterner`] used to intern
138    ///   the `content` and store string identifiers.
139    /// * `name` - A logical identifier for this source, such as `"inline"`
140    ///   or `"my_script.php"`.
141    /// * `content` - The actual PHP (or other) code string.
142    #[inline(always)]
143    pub fn standalone(interner: &ThreadedInterner, name: &str, content: &str) -> Self {
144        let lines: Vec<_> = line_starts(content).collect();
145        let size = content.len();
146        let content_id = interner.intern(content);
147
148        Self {
149            identifier: SourceIdentifier(interner.intern(name), SourceCategory::UserDefined),
150            path: None,
151            content: content_id,
152            size,
153            lines,
154        }
155    }
156
157    /// Retrieve the line number for the given byte offset.
158    ///
159    /// # Parameters
160    ///
161    /// - `offset`: The byte offset to retrieve the line number for.
162    ///
163    /// # Returns
164    ///
165    /// The line number for the given byte offset (0-based index).
166    #[inline(always)]
167    pub fn line_number(&self, offset: usize) -> usize {
168        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1)
169    }
170
171    /// Retrieve the byte offset for the start of the given line.
172    ///
173    /// # Parameters
174    ///
175    /// - `line`: The line number to retrieve the start offset for.
176    ///
177    /// # Returns
178    ///
179    /// The byte offset for the start of the given line (0-based index).
180    pub fn get_line_start_offset(&self, line: usize) -> Option<usize> {
181        self.lines.get(line).copied()
182    }
183
184    /// Retrieve the byte offset for the end of the given line.
185    ///
186    /// # Parameters
187    ///
188    /// - `line`: The line number to retrieve the end offset for.
189    ///
190    /// # Returns
191    ///
192    /// The byte offset for the end of the given line (0-based index).
193    pub fn get_line_end_offset(&self, line: usize) -> Option<usize> {
194        match self.lines.get(line + 1) {
195            Some(&end) => Some(end - 1),
196            None if line == self.lines.len() - 1 => Some(self.size),
197            _ => None,
198        }
199    }
200
201    /// Retrieve the column number for the given byte offset.
202    ///
203    /// # Parameters
204    ///
205    /// - `offset`: The byte offset to retrieve the column number for.
206    ///
207    /// # Returns
208    ///
209    /// The column number for the given byte offset (0-based index).
210    #[inline(always)]
211    pub fn column_number(&self, offset: usize) -> usize {
212        let line_start = self.lines.binary_search(&offset).unwrap_or_else(|next_line| self.lines[next_line - 1]);
213
214        offset - line_start
215    }
216}
217
218impl SourceManager {
219    /// Creates a new source manager.
220    #[inline(always)]
221    pub fn new(interner: ThreadedInterner) -> Self {
222        Self {
223            interner,
224            inner: Arc::new(RwLock::new(SourceManagerInner {
225                sources: HashMap::default(),
226                sources_by_name: HashMap::default(),
227            })),
228        }
229    }
230
231    /// Inserts a source with the given name and file path.
232    #[inline(always)]
233    pub fn insert_path(&self, name: impl AsRef<str>, path: PathBuf, category: SourceCategory) -> SourceIdentifier {
234        let name_str = name.as_ref();
235        let name_id = self.interner.intern(name_str);
236        let source_id = SourceIdentifier(name_id, category);
237
238        {
239            let inner = self.inner.read();
240            if inner.sources.contains_key(&source_id) {
241                return source_id;
242            }
243        }
244
245        let mut inner = self.inner.write();
246        // Double-check to avoid duplicate insertion.
247        if inner.sources.contains_key(&source_id) {
248            return source_id;
249        }
250        inner.sources.insert(source_id, SourceEntry { path: Some(path), content: None });
251        inner.sources_by_name.insert(name_id, source_id);
252        source_id
253    }
254
255    /// Inserts a source with the given name and content.
256    #[inline(always)]
257    pub fn insert_content(
258        &self,
259        name: impl AsRef<str>,
260        content: impl AsRef<str>,
261        category: SourceCategory,
262    ) -> SourceIdentifier {
263        let name_str = name.as_ref();
264        let content_str = content.as_ref();
265        let name_id = self.interner.intern(name_str);
266
267        {
268            let inner = self.inner.read();
269            if let Some(&source_id) = inner.sources_by_name.get(&name_id) {
270                return source_id;
271            }
272        }
273
274        let lines: Vec<_> = line_starts(content_str).collect();
275        let size = content_str.len();
276        let content_id = self.interner.intern(content_str);
277        let source_id = SourceIdentifier(name_id, category);
278
279        let mut inner = self.inner.write();
280        if let Some(&existing) = inner.sources_by_name.get(&name_id) {
281            return existing;
282        }
283        inner.sources.insert(source_id, SourceEntry { path: None, content: Some((content_id, size, lines)) });
284        inner.sources_by_name.insert(name_id, source_id);
285        source_id
286    }
287
288    /// Returns whether the manager contains a source with the given identifier.
289    #[inline(always)]
290    pub fn contains(&self, source_id: &SourceIdentifier) -> bool {
291        let inner = self.inner.read();
292        inner.sources.contains_key(source_id)
293    }
294
295    /// Returns all source identifiers.
296    #[inline(always)]
297    pub fn source_ids(&self) -> Vec<SourceIdentifier> {
298        let inner = self.inner.read();
299        inner.sources.keys().cloned().collect()
300    }
301
302    /// Returns source identifiers for the given category.
303    #[inline(always)]
304    pub fn source_ids_for_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
305        let inner = self.inner.read();
306        inner.sources.keys().filter(|id| id.category() == category).cloned().collect()
307    }
308
309    /// Returns source identifiers for categories other than the given one.
310    #[inline(always)]
311    pub fn source_ids_except_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
312        let inner = self.inner.read();
313        inner.sources.keys().filter(|id| id.category() != category).cloned().collect()
314    }
315
316    /// Loads the source for the given identifier.
317    ///
318    /// If the source content is already loaded, it is returned immediately.
319    /// Otherwise the file is read from disk, processed, and cached.
320    #[inline(always)]
321    pub fn load(&self, source_id: &SourceIdentifier) -> Result<Source, SourceError> {
322        // First, try to read without locking for update.
323        {
324            let inner = self.inner.read();
325            if let Some(entry) = inner.sources.get(source_id) {
326                if let Some((content, size, ref lines)) = entry.content {
327                    return Ok(Source {
328                        identifier: *source_id,
329                        path: entry.path.clone(),
330                        content,
331                        size,
332                        lines: lines.clone(),
333                    });
334                }
335            }
336        }
337
338        // Retrieve the file path (must exist if content is not loaded).
339        let path = {
340            let inner = self.inner.read();
341            let entry = inner.sources.get(source_id).ok_or(SourceError::UnavailableSource(*source_id))?;
342
343            entry.path.clone().ok_or(SourceError::UnavailableSource(*source_id))?
344        };
345
346        // Perform file I/O outside the lock.
347        let bytes = std::fs::read(&path).map_err(SourceError::IOError)?;
348        let content_str = match String::from_utf8(bytes) {
349            Ok(s) => s,
350            Err(err) => {
351                let s = err.into_bytes();
352                let s = String::from_utf8_lossy(&s).into_owned();
353                tracing::warn!("Source '{}' contains invalid UTF-8 sequence; behavior is undefined.", path.display());
354                s
355            }
356        };
357        let lines: Vec<_> = line_starts(&content_str).collect();
358        let size = content_str.len();
359        let content_id = self.interner.intern(&content_str);
360
361        // Update the entry under a write lock.
362        {
363            let mut inner = self.inner.write();
364            if let Some(entry) = inner.sources.get_mut(source_id) {
365                // Check again in case another thread updated it meanwhile.
366                if entry.content.is_none() {
367                    entry.content = Some((content_id, size, lines.clone()));
368                }
369                Ok(Source { identifier: *source_id, path: entry.path.clone(), content: content_id, size, lines })
370            } else {
371                Err(SourceError::UnavailableSource(*source_id))
372            }
373        }
374    }
375
376    /// Writes updated content for the source with the given identifier.
377    #[inline(always)]
378    pub fn write(&self, source_id: SourceIdentifier, new_content: impl AsRef<str>) -> Result<(), SourceError> {
379        let new_content_str = new_content.as_ref();
380        let new_content_id = self.interner.intern(new_content_str);
381        let new_lines: Vec<_> = line_starts(new_content_str).collect();
382        let new_size = new_content_str.len();
383
384        let path_opt = {
385            let mut inner = self.inner.write();
386            let entry = inner.sources.get_mut(&source_id).ok_or(SourceError::UnavailableSource(source_id))?;
387            if let Some((old_content, _, _)) = entry.content {
388                if old_content == new_content_id {
389                    return Ok(());
390                }
391            }
392            entry.content = Some((new_content_id, new_size, new_lines));
393            entry.path.clone()
394        };
395
396        // If the source has an associated file, update it on disk.
397        if let Some(ref path) = path_opt {
398            std::fs::write(path, self.interner.lookup(&new_content_id)).map_err(SourceError::IOError)?;
399        }
400
401        Ok(())
402    }
403
404    /// Returns the number of sources.
405    #[inline(always)]
406    pub fn len(&self) -> usize {
407        let inner = self.inner.read();
408        inner.sources.len()
409    }
410
411    /// Returns true if there are no sources.
412    #[inline(always)]
413    pub fn is_empty(&self) -> bool {
414        let inner = self.inner.read();
415        inner.sources.is_empty()
416    }
417}
418
419impl<T: HasSource> HasSource for Box<T> {
420    #[inline(always)]
421    fn source(&self) -> SourceIdentifier {
422        self.as_ref().source()
423    }
424}
425
426/// Returns an iterator over the starting byte offsets of each line in `source`.
427#[inline(always)]
428fn line_starts(source: &str) -> impl Iterator<Item = usize> + '_ {
429    let bytes = source.as_bytes();
430
431    std::iter::once(0)
432        .chain(memchr::memchr_iter(b'\n', bytes).map(|i| if i > 0 && bytes[i - 1] == b'\r' { i } else { i + 1 }))
433}