Skip to main content

php_lsp/
ast.rs

1/// Core AST infrastructure: arena-backed `ParsedDoc`, span utilities, and TypeHint formatting.
2use std::collections::HashMap;
3use std::mem::ManuallyDrop;
4use std::sync::{Arc, LazyLock, Mutex};
5
6use php_ast::{Program, Span, TypeHint, TypeHintKind};
7use tower_lsp::lsp_types::{Position, Range};
8
9/// Cached per-doc map of `class_name -> method_name -> return_class_name`.
10/// Exposed here (rather than in `type_map`) because it lives on `ParsedDoc`.
11pub type MethodReturnsMap = HashMap<String, HashMap<String, String>>;
12
13// ── BumpPool ──────────────────────────────────────────────────────────────────
14
15const POOL_CAP: usize = 8;
16
17struct BumpPool {
18    // Box<Bump> required: arena_box.as_ref() is transmuted to &'static Bump in
19    // ParsedDoc::parse(). The Box keeps the Bump at a stable heap address so
20    // that reference remains valid after the Box is moved into ArenaGuard.
21    #[allow(clippy::vec_box)]
22    pool: Mutex<Vec<Box<bumpalo::Bump>>>,
23}
24
25impl BumpPool {
26    fn take(&self) -> Box<bumpalo::Bump> {
27        self.pool
28            .lock()
29            .unwrap()
30            .pop()
31            .unwrap_or_else(|| Box::new(bumpalo::Bump::new()))
32    }
33
34    fn give(&self, mut arena: Box<bumpalo::Bump>) {
35        arena.reset();
36        let mut p = self.pool.lock().unwrap();
37        if p.len() < POOL_CAP {
38            p.push(arena);
39        }
40    }
41}
42
43static BUMP_POOL: LazyLock<BumpPool> = LazyLock::new(|| BumpPool {
44    pool: Mutex::new(Vec::new()),
45});
46
47// ── ArenaGuard ────────────────────────────────────────────────────────────────
48
49/// Returns the arena to the pool on drop.
50struct ArenaGuard(Option<Box<bumpalo::Bump>>);
51
52impl Drop for ArenaGuard {
53    fn drop(&mut self) {
54        if let Some(arena) = self.0.take() {
55            BUMP_POOL.give(arena);
56        }
57    }
58}
59
60// ── ParsedDoc ─────────────────────────────────────────────────────────────────
61
62/// Owns a parsed PHP document: the bumpalo arena, source snapshot, and Program.
63///
64/// SAFETY invariants:
65/// - `program` uses `ManuallyDrop` and is explicitly dropped in `Drop::drop()`
66///   before any field auto-drop runs. This guarantees arena-allocated nodes are
67///   gone before `ArenaGuard` recycles the arena — regardless of field order.
68/// - `_source: Arc<str>` keeps the source text alive at a stable heap address.
69///   The transmuted `&'static str` in the parser arena remains valid for the
70///   lifetime of `ParsedDoc` because `_source` is dropped last (after `program`
71///   is manually dropped and after `_arena` is recycled).
72/// - The `'static` lifetimes in `ManuallyDrop<Box<Program<'static, 'static>>>`
73///   are erased versions of the true lifetimes `'_arena` and `'_source`. The
74///   public `program()` accessor re-attaches them to `&self`, preventing any
75///   reference from escaping beyond the lifetime of the `ParsedDoc`.
76pub struct ParsedDoc {
77    program: ManuallyDrop<Box<Program<'static, 'static>>>,
78    pub errors: Vec<php_rs_parser::diagnostics::ParseError>,
79    _source: Arc<str>,
80    line_starts: Vec<u32>,
81    _arena: ArenaGuard,
82}
83
84impl Drop for ParsedDoc {
85    fn drop(&mut self) {
86        // Drop program explicitly before any field auto-drop runs (including
87        // _arena). ManuallyDrop prevents a second drop after this method returns.
88        // SAFETY: called exactly once here; no other code drops this field.
89        unsafe { ManuallyDrop::drop(&mut self.program) };
90    }
91}
92
93// SAFETY: Program nodes contain only data; no thread-local state.
94unsafe impl Send for ParsedDoc {}
95unsafe impl Sync for ParsedDoc {}
96
97impl ParsedDoc {
98    /// Parse PHP source text. Accepts `String`, `&str`, or `Arc<str>` — when
99    /// the caller already holds an `Arc<str>` (e.g. the salsa `parsed_doc`
100    /// query), no heap allocation occurs (refcount bump only).
101    pub fn parse(source: impl Into<Arc<str>>) -> Self {
102        let source: Arc<str> = source.into();
103        // Take a pre-warmed arena from the pool (or allocate a fresh one).
104        let arena_box = BUMP_POOL.take();
105
106        // SAFETY: `Arc<str>` data lives at a stable heap address for as long as
107        // the Arc is alive. `_source` keeps the Arc alive for `ParsedDoc`'s
108        // lifetime, so the transmuted `&'static str` remains valid. The arena
109        // box is similarly stable — moving a `Box` moves the pointer, not the
110        // heap data.
111        let src_ref: &'static str = unsafe { std::mem::transmute::<&str, &'static str>(&*source) };
112        let arena_ref: &'static bumpalo::Bump = unsafe {
113            std::mem::transmute::<&bumpalo::Bump, &'static bumpalo::Bump>(arena_box.as_ref())
114        };
115
116        let result = php_rs_parser::parse(arena_ref, src_ref);
117
118        let line_starts = build_line_starts(src_ref);
119
120        ParsedDoc {
121            program: ManuallyDrop::new(Box::new(result.program)),
122            errors: result.errors,
123            _source: source,
124            line_starts,
125            _arena: ArenaGuard(Some(arena_box)),
126        }
127    }
128
129    /// Borrow the program with lifetimes bounded by `&self`.
130    ///
131    /// SAFETY: covariance of `Program<'arena, 'src>` in both parameters lets
132    /// `&Program<'static, 'static>` shorten to `&Program<'_, '_>`.
133    #[inline]
134    pub fn program(&self) -> &Program<'_, '_> {
135        &self.program
136    }
137
138    /// Borrow the source text used when parsing.
139    #[inline]
140    pub fn source(&self) -> &str {
141        &self._source
142    }
143
144    /// Clone the `Arc<str>` backing the source text (refcount bump only).
145    ///
146    /// Callers that need to store the same source text in a salsa input
147    /// should use this to get the identical pointer — enabling cheap
148    /// `Arc::ptr_eq` validation in the `parsed_cache`.
149    #[inline]
150    pub fn source_arc(&self) -> Arc<str> {
151        self._source.clone()
152    }
153
154    /// Borrow the precomputed line-start byte offsets.
155    /// `line_starts[i]` is the byte offset of the first character on line `i`.
156    pub fn line_starts(&self) -> &[u32] {
157        &self.line_starts
158    }
159
160    /// Bundle source and line index for position lookups.
161    pub fn view(&self) -> SourceView<'_> {
162        SourceView {
163            source: self.source(),
164            line_starts: self.line_starts(),
165        }
166    }
167}
168
169impl Default for ParsedDoc {
170    fn default() -> Self {
171        ParsedDoc::parse("")
172    }
173}
174
175// ── Span / position utilities ─────────────────────────────────────────────────
176
177/// Build a table of byte offsets for the start of each line.
178/// `result[i]` is the byte offset of the first character on line `i`.
179fn build_line_starts(source: &str) -> Vec<u32> {
180    let mut starts = vec![0u32];
181    for (i, b) in source.bytes().enumerate() {
182        if b == b'\n' {
183            starts.push(i as u32 + 1);
184        }
185    }
186    starts
187}
188
189/// Bundles source text with its precomputed line-start table.
190/// `Copy` so inner functions can pass it by value without lifetime annotation churn.
191#[derive(Copy, Clone)]
192pub struct SourceView<'a> {
193    source: &'a str,
194    line_starts: &'a [u32],
195}
196
197impl<'a> SourceView<'a> {
198    #[inline]
199    pub fn source(self) -> &'a str {
200        self.source
201    }
202
203    pub fn position_of(self, offset: u32) -> Position {
204        offset_to_position(self.source, self.line_starts, offset)
205    }
206
207    #[inline]
208    pub fn line_starts(self) -> &'a [u32] {
209        self.line_starts
210    }
211
212    /// O(log lines) line number for a byte offset. Cheaper than
213    /// `position_of(..).line` because it skips the UTF-16 column scan.
214    #[inline]
215    pub fn line_of(self, offset: u32) -> u32 {
216        match self.line_starts.partition_point(|&s| s <= offset) {
217            0 => 0,
218            i => (i - 1) as u32,
219        }
220    }
221
222    /// O(log lines) UTF-16 `Position` -> byte offset. Uses the precomputed
223    /// `line_starts` table to jump to the target line and only scans that
224    /// line's characters. Much faster than a linear scan over the whole
225    /// source when the cursor is late in the file.
226    pub fn byte_of_position(self, pos: Position) -> u32 {
227        let line_idx = pos.line as usize;
228        let line_start = self.line_starts.get(line_idx).copied().unwrap_or(0) as usize;
229        let line_end = self
230            .line_starts
231            .get(line_idx + 1)
232            .map(|&s| (s as usize).saturating_sub(1))
233            .unwrap_or(self.source.len());
234        let raw = &self.source[line_start..line_end.min(self.source.len())];
235        let line = raw.strip_suffix('\r').unwrap_or(raw);
236        let mut col_utf16: u32 = 0;
237        let mut byte_in_line: usize = 0;
238        for ch in line.chars() {
239            if col_utf16 >= pos.character {
240                break;
241            }
242            col_utf16 += ch.len_utf16() as u32;
243            byte_in_line += ch.len_utf8();
244        }
245        (line_start + byte_in_line) as u32
246    }
247
248    pub fn range_of(self, span: Span) -> Range {
249        Range {
250            start: self.position_of(span.start),
251            end: self.position_of(span.end),
252        }
253    }
254
255    pub fn name_range(self, name: &str) -> Range {
256        let start = str_offset(self.source, name).unwrap_or(0);
257        Range {
258            start: self.position_of(start),
259            end: self.position_of(start + name.len() as u32),
260        }
261    }
262
263    /// Like [`name_range`], but searches for `name` *within* `span` instead
264    /// of the whole source. Needed when the same name appears in earlier
265    /// docblock comments / other declarations — a global search would
266    /// otherwise point at the first textual occurrence, not the AST node
267    /// the caller actually meant.
268    pub fn name_range_in_span(self, name: &str, span: php_ast::Span) -> Range {
269        let s = span.start as usize;
270        let e = (span.end as usize).min(self.source.len());
271        let start = self
272            .source
273            .get(s..e)
274            .and_then(|slice| slice.find(name))
275            .map(|off| span.start + off as u32)
276            .unwrap_or_else(|| str_offset(self.source, name).unwrap_or(0));
277        Range {
278            start: self.position_of(start),
279            end: self.position_of(start + name.len() as u32),
280        }
281    }
282}
283
284/// Convert a byte offset into `source` to an LSP `Position` (0-based line/char).
285///
286/// Uses a precomputed `line_starts` table for O(log n) binary search.
287/// Handles both LF-only and CRLF line endings: a trailing `\r` before `\n` is
288/// not counted as a column so that positions are consistent regardless of
289/// line-ending style.
290pub fn offset_to_position(source: &str, line_starts: &[u32], offset: u32) -> Position {
291    let offset_usize = (offset as usize).min(source.len());
292    // Binary search: find the last line_start ≤ offset.
293    let line = match line_starts.partition_point(|&s| s <= offset) {
294        0 => 0u32,
295        i => (i - 1) as u32,
296    };
297    let line_start = line_starts.get(line as usize).copied().unwrap_or(0) as usize;
298    let segment = &source[line_start..offset_usize];
299    // Strip trailing \r to handle CRLF: don't count \r as a column.
300    let segment = segment.strip_suffix('\r').unwrap_or(segment);
301    let character = segment.chars().map(|c| c.len_utf16() as u32).sum::<u32>();
302    Position { line, character }
303}
304
305/// Convert a `Span` (byte-offset pair) to an LSP `Range`.
306pub fn span_to_range(source: &str, line_starts: &[u32], span: Span) -> Range {
307    Range {
308        start: offset_to_position(source, line_starts, span.start),
309        end: offset_to_position(source, line_starts, span.end),
310    }
311}
312
313/// Return the byte offset of `substr` within `source`, or None if not found.
314///
315/// Uses pointer arithmetic when `substr` is a true sub-slice of `source`
316/// (i.e. arena-allocated names pointing into the same backing string).
317/// Falls back to a content search when the pointers differ — this handles
318/// tests and callers that pass a differently-allocated copy of the source.
319///
320/// When falling back, prefers matches that are preceded and followed by
321/// non-identifier characters (word boundaries), to avoid matching name parts
322/// (e.g., finding "B" in "Base" when searching for a class named "B").
323pub fn str_offset(source: &str, substr: &str) -> Option<u32> {
324    let src_ptr = source.as_ptr() as usize;
325    let sub_ptr = substr.as_ptr() as usize;
326    if sub_ptr >= src_ptr && sub_ptr + substr.len() <= src_ptr + source.len() {
327        return Some((sub_ptr - src_ptr) as u32);
328    }
329    // Fallback: locate by content with word-boundary preference.
330    // When pointer matching fails, prefer matches where substr is surrounded by
331    // non-identifier characters to avoid matching partial names (e.g., "B" in "Base").
332    let mut search_pos = 0;
333    while let Some(offset) = source[search_pos..].find(substr) {
334        let abs_offset = search_pos + offset;
335        let is_start_boundary = abs_offset == 0
336            || !source[..abs_offset]
337                .chars()
338                .last()
339                .map(|c| c.is_alphanumeric() || c == '_')
340                .unwrap_or(false);
341        let end_pos = abs_offset + substr.len();
342        let is_end_boundary = end_pos >= source.len()
343            || !source[end_pos..]
344                .chars()
345                .next()
346                .map(|c| c.is_alphanumeric() || c == '_')
347                .unwrap_or(false);
348
349        if is_start_boundary && is_end_boundary {
350            return Some(abs_offset as u32);
351        }
352
353        search_pos = abs_offset + 1;
354    }
355    None
356}
357
358/// Build an LSP `Range` for a name that is a sub-slice of `source`, or None if not found.
359pub fn name_range(source: &str, line_starts: &[u32], name: &str) -> Option<Range> {
360    let start = str_offset(source, name)?;
361    Some(Range {
362        start: offset_to_position(source, line_starts, start),
363        end: offset_to_position(source, line_starts, start + name.len() as u32),
364    })
365}
366
367/// Find a name within a specific byte range of the source, with word-boundary matching.
368/// Returns the absolute byte offset if found, None otherwise.
369pub fn str_offset_in_range(source: &str, span: Span, name: &str) -> Option<u32> {
370    let span_start = span.start as usize;
371    let span_end = span.end as usize;
372    if span_end > source.len() {
373        return None;
374    }
375    let span_text = &source[span_start..span_end];
376    let offset = str_offset(span_text, name)?;
377    Some(span_start as u32 + offset)
378}
379
380// ── TypeHint formatting ────────────────────────────────────────────────────────
381
382/// Format a `TypeHint` as a PHP type string, e.g. `?int`, `string|null`.
383pub fn format_type_hint(hint: &TypeHint<'_, '_>) -> String {
384    fmt_kind(&hint.kind)
385}
386
387fn fmt_kind(kind: &TypeHintKind<'_, '_>) -> String {
388    match kind {
389        TypeHintKind::Named(name) => name.to_string_repr().to_string(),
390        TypeHintKind::Keyword(builtin, _) => builtin.as_str().to_string(),
391        TypeHintKind::Nullable(inner) => format!("?{}", format_type_hint(inner)),
392        TypeHintKind::Union(types) => types
393            .iter()
394            .map(format_type_hint)
395            .collect::<Vec<_>>()
396            .join("|"),
397        TypeHintKind::Intersection(types) => types
398            .iter()
399            .map(format_type_hint)
400            .collect::<Vec<_>>()
401            .join("&"),
402    }
403}
404
405#[cfg(test)]
406mod tests {
407    use super::*;
408
409    #[test]
410    fn parses_empty_source() {
411        let doc = ParsedDoc::parse("<?php".to_string());
412        assert!(doc.errors.is_empty());
413        assert!(doc.program().stmts.is_empty());
414    }
415
416    #[test]
417    fn parses_function() {
418        let doc = ParsedDoc::parse("<?php\nfunction foo() {}".to_string());
419        assert_eq!(doc.program().stmts.len(), 1);
420    }
421
422    #[test]
423    fn offset_to_position_first_line() {
424        let src = "<?php\nfoo";
425        let doc = ParsedDoc::parse(src.to_string());
426        assert_eq!(
427            offset_to_position(src, doc.line_starts(), 0),
428            Position {
429                line: 0,
430                character: 0
431            }
432        );
433    }
434
435    #[test]
436    fn offset_to_position_second_line() {
437        // "<?php\n" — offset 6 is start of line 1
438        let src = "<?php\nfoo";
439        let doc = ParsedDoc::parse(src.to_string());
440        assert_eq!(
441            offset_to_position(src, doc.line_starts(), 6),
442            Position {
443                line: 1,
444                character: 0
445            }
446        );
447    }
448
449    #[test]
450    fn offset_to_position_multibyte_utf16() {
451        // "é" is U+00E9: 2 UTF-8 bytes, 1 UTF-16 code unit.
452        // "😀" is U+1F600: 4 UTF-8 bytes, 2 UTF-16 code units.
453        // source: "a😀b" — byte offsets: a=0, 😀=1..5, b=5
454        // UTF-16:            a=col 0, 😀=col 1..3, b=col 3
455        let src = "a\u{1F600}b";
456        let doc = ParsedDoc::parse(src.to_string());
457        assert_eq!(
458            offset_to_position(src, doc.line_starts(), 5), // byte offset of 'b'
459            Position {
460                line: 0,
461                character: 3
462            }  // UTF-16 col 3
463        );
464    }
465
466    #[test]
467    fn offset_to_position_crlf_start_of_line() {
468        // CRLF: offset pointing to first char of line 1 must give character=0.
469        // "foo\r\nbar": f=0 o=1 o=2 \r=3 \n=4 b=5 a=6 r=7
470        let src = "foo\r\nbar";
471        let doc = ParsedDoc::parse(src.to_string());
472        assert_eq!(
473            offset_to_position(src, doc.line_starts(), 5), // 'b'
474            Position {
475                line: 1,
476                character: 0
477            }
478        );
479    }
480
481    #[test]
482    fn offset_to_position_crlf_does_not_count_cr_in_column() {
483        // Offset pointing to the \r itself must not count it as a column.
484        // "foo\r\nbar": the \r is at offset 3, column must be 3 (length of "foo").
485        let src = "foo\r\nbar";
486        let doc = ParsedDoc::parse(src.to_string());
487        assert_eq!(
488            offset_to_position(src, doc.line_starts(), 3), // '\r'
489            Position {
490                line: 0,
491                character: 3
492            }
493        );
494    }
495
496    #[test]
497    fn offset_to_position_crlf_multiline() {
498        // Multiple CRLF lines: columns must not accumulate stray \r counts.
499        // "a\r\nb\r\nc": a=0 \r=1 \n=2 b=3 \r=4 \n=5 c=6
500        let src = "a\r\nb\r\nc";
501        let doc = ParsedDoc::parse(src.to_string());
502        assert_eq!(
503            offset_to_position(src, doc.line_starts(), 6), // 'c'
504            Position {
505                line: 2,
506                character: 0
507            }
508        );
509        assert_eq!(
510            offset_to_position(src, doc.line_starts(), 3), // 'b'
511            Position {
512                line: 1,
513                character: 0
514            }
515        );
516    }
517
518    #[test]
519    fn str_offset_finds_substr() {
520        let src = "<?php\nfunction foo() {}";
521        let name = &src[15..18]; // "foo"
522        assert_eq!(str_offset(src, name), Some(15));
523    }
524
525    #[test]
526    fn str_offset_content_fallback_for_different_allocation() {
527        // "foo" is a separately owned String (not a sub-slice of the source),
528        // so pointer arithmetic fails. The fallback finds it by content.
529        let owned = "foo".to_string();
530        assert_eq!(str_offset("<?php foo", &owned), Some(6));
531    }
532
533    #[test]
534    fn str_offset_unrelated_content_returns_none() {
535        let owned = "bar".to_string();
536        assert_eq!(str_offset("<?php foo", &owned), None);
537    }
538}