mago_source/
lib.rs

1use std::path::PathBuf;
2use std::sync::Arc;
3
4use ahash::HashMap;
5use parking_lot::RwLock;
6use serde::Deserialize;
7use serde::Serialize;
8
9use mago_interner::StringIdentifier;
10use mago_interner::ThreadedInterner;
11
12use crate::error::SourceError;
13
14pub mod error;
15
/// Represents the category of the source for a PHP construct.
///
/// This enum categorizes the origin of a source, based on where it is defined.
/// The categories are useful for distinguishing between user-written code, vendor-provided libraries,
/// and built-in PHP features.
///
/// The derived ordering follows declaration order: `BuiltIn < External < UserDefined`.
///
/// # Variants
///
/// - `BuiltIn`: Represents a construct that is part of PHP's core or extension libraries.
/// - `External`: Represents a construct defined in a vendor-provided or third-party library.
/// - `UserDefined`: Represents a construct written by the user or part of the current project.
///   This is the variant produced by `SourceCategory::default()`.
#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub enum SourceCategory {
    /// Represents a PHP construct that is part of the PHP core or extension libraries.
    BuiltIn,

    /// Represents a PHP construct defined in vendor-provided or third-party libraries.
    External,

    /// Represents a PHP construct written by the user or part of the current project.
    ///
    /// This is the default category.
    #[default]
    UserDefined,
}
39
/// A unique identifier for a source.
///
/// The first field is the interned name of the source and the second field is
/// the [`SourceCategory`] it was registered under. Two identifiers with the same
/// name but different categories compare as different identifiers.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[repr(C)]
pub struct SourceIdentifier(pub StringIdentifier, pub SourceCategory);
44
/// Represents a source file.
///
/// A `Source` is a snapshot: it carries the interned content together with the
/// pre-computed line table needed to translate byte offsets into line and
/// column numbers.
#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct Source {
    /// The identifier (interned name plus category) of this source.
    pub identifier: SourceIdentifier,
    /// The file path backing this source, or `None` when the source was created
    /// from in-memory content only.
    pub path: Option<PathBuf>,
    /// The interned content of the source.
    pub content: StringIdentifier,
    /// The length of the content, in bytes.
    pub size: usize,
    /// The byte offset at which each line starts, in ascending order.
    /// When built by this crate the first entry is always `0`.
    pub lines: Vec<usize>,
}
54
/// Trait for items that have a source.
///
/// Implementors report the [`SourceIdentifier`] of the source they belong to.
pub trait HasSource {
    /// Returns the identifier of the source this item is associated with.
    fn source(&self) -> SourceIdentifier;
}
59
/// Internal structure to store source information before full loading.
///
/// An entry registered by path starts with `content: None` and is filled in the
/// first time it is loaded; an entry registered from in-memory content has no
/// path and its content is present from the start.
#[derive(Debug)]
struct SourceEntry {
    /// The file path (if any).
    path: Option<PathBuf>,
    /// The content, if already loaded, plus its size and line-start positions.
    ///
    /// The tuple is `(interned content, size in bytes, line-start byte offsets)`.
    content: Option<(StringIdentifier, usize, Vec<usize>)>,
}
68
/// Internal container for our maps. We keep two maps:
///  - one from SourceIdentifier → SourceEntry
///  - an auxiliary index from interned name → SourceIdentifier
///
/// Both maps live behind the same lock in [`SourceManager`], so they are always
/// updated together.
#[derive(Debug)]
struct SourceManagerInner {
    /// Every registered source, keyed by its full identifier (name and category).
    sources: HashMap<SourceIdentifier, SourceEntry>,
    /// Index from the interned source name to the identifier most recently
    /// registered under that name.
    sources_by_name: HashMap<StringIdentifier, SourceIdentifier>,
}
77
/// A manager for sources.
///
/// This version replaces DashMap with a single inner structure protected by a
/// high-performance `RwLock` (from the parking_lot crate) and uses the `ahash`
/// `HashMap` for speed.
///
/// The inner maps are held in an [`Arc`], so cloning a `SourceManager` yields a
/// handle that shares the same registered sources with the original.
#[derive(Clone, Debug)]
pub struct SourceManager {
    /// The interner used for source names and content.
    interner: ThreadedInterner,
    /// Inner maps protected by a lock.
    inner: Arc<RwLock<SourceManagerInner>>,
}
89
90/// Methods for SourceCategory.
91impl SourceCategory {
92    #[inline(always)]
93    pub const fn is_built_in(&self) -> bool {
94        matches!(self, Self::BuiltIn)
95    }
96
97    #[inline(always)]
98    pub const fn is_external(&self) -> bool {
99        matches!(self, Self::External)
100    }
101
102    #[inline(always)]
103    pub const fn is_user_defined(&self) -> bool {
104        matches!(self, Self::UserDefined)
105    }
106}
107
108/// Methods for SourceIdentifier.
109impl SourceIdentifier {
110    #[inline(always)]
111    pub fn dummy() -> Self {
112        Self(StringIdentifier::empty(), SourceCategory::UserDefined)
113    }
114
115    /// Returns the interned string identifier.
116    #[inline(always)]
117    pub const fn value(&self) -> StringIdentifier {
118        self.0
119    }
120
121    /// Returns the source category.
122    #[inline(always)]
123    pub const fn category(&self) -> SourceCategory {
124        self.1
125    }
126}
127/// Methods for Source.
128impl Source {
129    /// Creates a [`Source`] from a single piece of `content` without needing
130    /// a full [`SourceManager`].
131    ///
132    /// This is particularly useful for quick parsing or one-off analyses
133    /// where you do not need to manage multiple sources.
134    ///
135    /// # Arguments
136    ///
137    /// * `interner` - A reference to a [`ThreadedInterner`] used to intern
138    ///   the `content` and store string identifiers.
139    /// * `name` - A logical identifier for this source, such as `"inline"`
140    ///   or `"my_script.php"`.
141    /// * `content` - The actual PHP (or other) code string.
142    #[inline(always)]
143    pub fn standalone(interner: &ThreadedInterner, name: &str, content: &str) -> Self {
144        let lines: Vec<_> = line_starts(content).collect();
145        let size = content.len();
146        let content_id = interner.intern(content);
147
148        Self {
149            identifier: SourceIdentifier(interner.intern(name), SourceCategory::UserDefined),
150            path: None,
151            content: content_id,
152            size,
153            lines,
154        }
155    }
156
157    /// Retrieve the line number for the given byte offset.
158    ///
159    /// # Parameters
160    ///
161    /// - `offset`: The byte offset to retrieve the line number for.
162    ///
163    /// # Returns
164    ///
165    /// The line number for the given byte offset (0-based index).
166    #[inline(always)]
167    pub fn line_number(&self, offset: usize) -> usize {
168        self.lines.binary_search(&offset).unwrap_or_else(|next_line| next_line - 1)
169    }
170
171    /// Retrieve the byte offset for the start of the given line.
172    ///
173    /// # Parameters
174    ///
175    /// - `line`: The line number to retrieve the start offset for.
176    ///
177    /// # Returns
178    ///
179    /// The byte offset for the start of the given line (0-based index).
180    pub fn get_line_start_offset(&self, line: usize) -> Option<usize> {
181        self.lines.get(line).copied()
182    }
183
184    /// Retrieve the byte offset for the end of the given line.
185    ///
186    /// # Parameters
187    ///
188    /// - `line`: The line number to retrieve the end offset for.
189    ///
190    /// # Returns
191    ///
192    /// The byte offset for the end of the given line (0-based index).
193    pub fn get_line_end_offset(&self, line: usize) -> Option<usize> {
194        match self.lines.get(line + 1) {
195            Some(&end) => Some(end - 1),
196            None if line == self.lines.len() - 1 => Some(self.size),
197            _ => None,
198        }
199    }
200
201    /// Retrieve the column number for the given byte offset.
202    ///
203    /// # Parameters
204    ///
205    /// - `offset`: The byte offset to retrieve the column number for.
206    ///
207    /// # Returns
208    ///
209    /// The column number for the given byte offset (0-based index).
210    #[inline(always)]
211    pub fn column_number(&self, offset: usize) -> usize {
212        let line_start = self.lines.binary_search(&offset).unwrap_or_else(|next_line| self.lines[next_line - 1]);
213
214        offset - line_start
215    }
216}
217
218impl SourceManager {
219    /// Creates a new source manager.
220    #[inline(always)]
221    pub fn new(interner: ThreadedInterner) -> Self {
222        Self {
223            interner,
224            inner: Arc::new(RwLock::new(SourceManagerInner {
225                sources: HashMap::default(),
226                sources_by_name: HashMap::default(),
227            })),
228        }
229    }
230
231    /// Inserts a source with the given name and file path.
232    #[inline(always)]
233    pub fn insert_path(&self, name: impl AsRef<str>, path: PathBuf, category: SourceCategory) -> SourceIdentifier {
234        let name_str = name.as_ref();
235        let name_id = self.interner.intern(name_str);
236        let source_id = SourceIdentifier(name_id, category);
237
238        {
239            let inner = self.inner.read();
240            if inner.sources.contains_key(&source_id) {
241                return source_id;
242            }
243        }
244
245        let mut inner = self.inner.write();
246        // Double-check to avoid duplicate insertion.
247        if inner.sources.contains_key(&source_id) {
248            return source_id;
249        }
250        inner.sources.insert(source_id, SourceEntry { path: Some(path), content: None });
251        inner.sources_by_name.insert(name_id, source_id);
252        source_id
253    }
254
255    /// Inserts a source with the given name and content.
256    #[inline(always)]
257    pub fn insert_content(
258        &self,
259        name: impl AsRef<str>,
260        content: impl AsRef<str>,
261        category: SourceCategory,
262    ) -> SourceIdentifier {
263        let name_str = name.as_ref();
264        let content_str = content.as_ref();
265        let name_id = self.interner.intern(name_str);
266
267        {
268            let inner = self.inner.read();
269            if let Some(&source_id) = inner.sources_by_name.get(&name_id) {
270                return source_id;
271            }
272        }
273
274        let lines: Vec<_> = line_starts(content_str).collect();
275        let size = content_str.len();
276        let content_id = self.interner.intern(content_str);
277        let source_id = SourceIdentifier(name_id, category);
278
279        let mut inner = self.inner.write();
280        if let Some(&existing) = inner.sources_by_name.get(&name_id) {
281            return existing;
282        }
283        inner.sources.insert(source_id, SourceEntry { path: None, content: Some((content_id, size, lines)) });
284        inner.sources_by_name.insert(name_id, source_id);
285        source_id
286    }
287
288    /// Returns whether the manager contains a source with the given identifier.
289    #[inline(always)]
290    pub fn contains(&self, source_id: &SourceIdentifier) -> bool {
291        let inner = self.inner.read();
292        inner.sources.contains_key(source_id)
293    }
294
295    /// Returns all source identifiers.
296    #[inline(always)]
297    pub fn source_ids(&self) -> Vec<SourceIdentifier> {
298        let inner = self.inner.read();
299        inner.sources.keys().cloned().collect()
300    }
301
302    /// Returns source identifiers for the given category.
303    #[inline(always)]
304    pub fn source_ids_for_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
305        let inner = self.inner.read();
306        inner.sources.keys().filter(|id| id.category() == category).cloned().collect()
307    }
308
309    /// Returns source identifiers for categories other than the given one.
310    #[inline(always)]
311    pub fn source_ids_except_category(&self, category: SourceCategory) -> Vec<SourceIdentifier> {
312        let inner = self.inner.read();
313        inner.sources.keys().filter(|id| id.category() != category).cloned().collect()
314    }
315
316    /// Loads the source for the given identifier.
317    ///
318    /// If the source content is already loaded, it is returned immediately.
319    /// Otherwise the file is read from disk, processed, and cached.
320    #[inline(always)]
321    pub fn load(&self, source_id: &SourceIdentifier) -> Result<Source, SourceError> {
322        let path = {
323            let inner = self.inner.read();
324            let entry = inner.sources.get(source_id).ok_or(SourceError::UnavailableSource(*source_id))?;
325
326            // First, try to read without locking for update.
327            if let Some((content, size, ref lines)) = entry.content {
328                return Ok(Source {
329                    identifier: *source_id,
330                    path: entry.path.clone(),
331                    content,
332                    size,
333                    lines: lines.clone(),
334                });
335            }
336
337            // Retrieve the file path (must exist if content is not loaded).
338            entry.path.clone().ok_or(SourceError::UnavailableSource(*source_id))?
339        };
340
341        // Perform file I/O outside the lock.
342        let bytes = std::fs::read(&path).map_err(SourceError::IOError)?;
343        let content_str = match String::from_utf8(bytes) {
344            Ok(s) => s,
345            Err(err) => {
346                let s = err.into_bytes();
347                let s = String::from_utf8_lossy(&s).into_owned();
348                if source_id.category().is_user_defined() {
349                    tracing::debug!(
350                        "Source '{}' contains invalid UTF-8 sequence; behavior is undefined.",
351                        path.display()
352                    );
353                } else {
354                    tracing::info!(
355                        "Source '{}' contains invalid UTF-8 sequence; behavior is undefined.",
356                        path.display()
357                    );
358                }
359
360                s
361            }
362        };
363        let lines: Vec<_> = line_starts(&content_str).collect();
364        let size = content_str.len();
365        let content_id = self.interner.intern(&content_str);
366
367        // Update the entry under a write lock.
368        {
369            let mut inner = self.inner.write();
370            if let Some(entry) = inner.sources.get_mut(source_id) {
371                // Check again in case another thread updated it meanwhile.
372                if entry.content.is_none() {
373                    entry.content = Some((content_id, size, lines.clone()));
374                }
375                Ok(Source { identifier: *source_id, path: entry.path.clone(), content: content_id, size, lines })
376            } else {
377                Err(SourceError::UnavailableSource(*source_id))
378            }
379        }
380    }
381
382    /// Writes updated content for the source with the given identifier.
383    #[inline(always)]
384    pub fn write(&self, source_id: SourceIdentifier, new_content: impl AsRef<str>) -> Result<(), SourceError> {
385        let new_content_str = new_content.as_ref();
386        let new_content_id = self.interner.intern(new_content_str);
387        let new_lines: Vec<_> = line_starts(new_content_str).collect();
388        let new_size = new_content_str.len();
389
390        let path_opt = {
391            let mut inner = self.inner.write();
392            let entry = inner.sources.get_mut(&source_id).ok_or(SourceError::UnavailableSource(source_id))?;
393            if let Some((old_content, _, _)) = entry.content
394                && old_content == new_content_id
395            {
396                return Ok(());
397            }
398            entry.content = Some((new_content_id, new_size, new_lines));
399            entry.path.clone()
400        };
401
402        // If the source has an associated file, update it on disk.
403        if let Some(ref path) = path_opt {
404            std::fs::write(path, self.interner.lookup(&new_content_id)).map_err(SourceError::IOError)?;
405        }
406
407        Ok(())
408    }
409
410    /// Returns the number of sources.
411    #[inline(always)]
412    pub fn len(&self) -> usize {
413        let inner = self.inner.read();
414        inner.sources.len()
415    }
416
417    /// Returns true if there are no sources.
418    #[inline(always)]
419    pub fn is_empty(&self) -> bool {
420        let inner = self.inner.read();
421        inner.sources.is_empty()
422    }
423}
424
425impl<T: HasSource> HasSource for Box<T> {
426    #[inline(always)]
427    fn source(&self) -> SourceIdentifier {
428        self.as_ref().source()
429    }
430}
431
432/// Returns an iterator over the starting byte offsets of each line in `source`.
433#[inline(always)]
434fn line_starts(source: &str) -> impl Iterator<Item = usize> + '_ {
435    let bytes = source.as_bytes();
436
437    std::iter::once(0)
438        .chain(memchr::memchr_iter(b'\n', bytes).map(|i| if i > 0 && bytes[i - 1] == b'\r' { i } else { i + 1 }))
439}