Skip to main content

php_lsp/
ast.rs

1/// Core AST infrastructure: arena-backed `ParsedDoc`, span utilities, and TypeHint formatting.
2use std::mem::ManuallyDrop;
3use std::sync::{Arc, LazyLock, Mutex};
4
5use php_ast::{Program, Span, TypeHint, TypeHintKind};
6use tower_lsp::lsp_types::{Position, Range};
7
8// ── BumpPool ──────────────────────────────────────────────────────────────────
9
10const POOL_CAP: usize = 8;
11
12struct BumpPool {
13    // Box<Bump> required: arena_box.as_ref() is transmuted to &'static Bump in
14    // ParsedDoc::parse(). The Box keeps the Bump at a stable heap address so
15    // that reference remains valid after the Box is moved into ArenaGuard.
16    #[allow(clippy::vec_box)]
17    pool: Mutex<Vec<Box<bumpalo::Bump>>>,
18}
19
20impl BumpPool {
21    fn take(&self) -> Box<bumpalo::Bump> {
22        self.pool
23            .lock()
24            .unwrap()
25            .pop()
26            .unwrap_or_else(|| Box::new(bumpalo::Bump::new()))
27    }
28
29    fn give(&self, mut arena: Box<bumpalo::Bump>) {
30        arena.reset();
31        let mut p = self.pool.lock().unwrap();
32        if p.len() < POOL_CAP {
33            p.push(arena);
34        }
35    }
36}
37
38static BUMP_POOL: LazyLock<BumpPool> = LazyLock::new(|| BumpPool {
39    pool: Mutex::new(Vec::new()),
40});
41
42// ── ArenaGuard ────────────────────────────────────────────────────────────────
43
44/// Returns the arena to the pool on drop.
45struct ArenaGuard(Option<Box<bumpalo::Bump>>);
46
47impl Drop for ArenaGuard {
48    fn drop(&mut self) {
49        if let Some(arena) = self.0.take() {
50            BUMP_POOL.give(arena);
51        }
52    }
53}
54
55// ── ParsedDoc ─────────────────────────────────────────────────────────────────
56
57/// Owns a parsed PHP document: the bumpalo arena, source snapshot, and Program.
58///
59/// SAFETY invariants:
60/// - `program` uses `ManuallyDrop` and is explicitly dropped in `Drop::drop()`
61///   before any field auto-drop runs. This guarantees arena-allocated nodes are
62///   gone before `ArenaGuard` recycles the arena — regardless of field order.
63/// - `_source: Arc<str>` keeps the source text alive at a stable heap address.
64///   The transmuted `&'static str` in the parser arena remains valid for the
65///   lifetime of `ParsedDoc` because `_source` is dropped last (after `program`
66///   is manually dropped and after `_arena` is recycled).
67/// - The `'static` lifetimes in `ManuallyDrop<Box<Program<'static, 'static>>>`
68///   are erased versions of the true lifetimes `'_arena` and `'_source`. The
69///   public `program()` accessor re-attaches them to `&self`, preventing any
70///   reference from escaping beyond the lifetime of the `ParsedDoc`.
71pub struct ParsedDoc {
72    program: ManuallyDrop<Box<Program<'static, 'static>>>,
73    pub errors: Vec<php_rs_parser::diagnostics::ParseError>,
74    _source: Arc<str>,
75    line_starts: Vec<u32>,
76    _arena: ArenaGuard,
77}
78
79impl Drop for ParsedDoc {
80    fn drop(&mut self) {
81        // Drop program explicitly before any field auto-drop runs (including
82        // _arena). ManuallyDrop prevents a second drop after this method returns.
83        // SAFETY: called exactly once here; no other code drops this field.
84        unsafe { ManuallyDrop::drop(&mut self.program) };
85    }
86}
87
88// SAFETY: Program nodes contain only data; no thread-local state.
89unsafe impl Send for ParsedDoc {}
90unsafe impl Sync for ParsedDoc {}
91
92impl ParsedDoc {
93    /// Parse PHP source text. Accepts `String`, `&str`, or `Arc<str>` — when
94    /// the caller already holds an `Arc<str>` (e.g. the salsa `parsed_doc`
95    /// query), no heap allocation occurs (refcount bump only).
96    pub fn parse(source: impl Into<Arc<str>>) -> Self {
97        let source: Arc<str> = source.into();
98        // Take a pre-warmed arena from the pool (or allocate a fresh one).
99        let arena_box = BUMP_POOL.take();
100
101        // SAFETY: `Arc<str>` data lives at a stable heap address for as long as
102        // the Arc is alive. `_source` keeps the Arc alive for `ParsedDoc`'s
103        // lifetime, so the transmuted `&'static str` remains valid. The arena
104        // box is similarly stable — moving a `Box` moves the pointer, not the
105        // heap data.
106        let src_ref: &'static str = unsafe { std::mem::transmute::<&str, &'static str>(&*source) };
107        let arena_ref: &'static bumpalo::Bump = unsafe {
108            std::mem::transmute::<&bumpalo::Bump, &'static bumpalo::Bump>(arena_box.as_ref())
109        };
110
111        let result = php_rs_parser::parse_arena(arena_ref, src_ref);
112
113        let line_starts = build_line_starts(src_ref);
114
115        ParsedDoc {
116            program: ManuallyDrop::new(Box::new(result.program)),
117            errors: result.errors,
118            _source: source,
119            line_starts,
120            _arena: ArenaGuard(Some(arena_box)),
121        }
122    }
123
124    /// Borrow the program with lifetimes bounded by `&self`.
125    ///
126    /// SAFETY: covariance of `Program<'arena, 'src>` in both parameters lets
127    /// `&Program<'static, 'static>` shorten to `&Program<'_, '_>`.
128    #[inline]
129    pub fn program(&self) -> &Program<'_, '_> {
130        &self.program
131    }
132
133    /// Borrow the source text used when parsing.
134    #[inline]
135    pub fn source(&self) -> &str {
136        &self._source
137    }
138
139    /// Clone the `Arc<str>` backing the source text (refcount bump only).
140    ///
141    /// Callers that need to store the same source text in a salsa input
142    /// should use this to get the identical pointer — enabling cheap
143    /// `Arc::ptr_eq` validation in the `parsed_cache`.
144    #[inline]
145    pub fn source_arc(&self) -> Arc<str> {
146        self._source.clone()
147    }
148
149    /// Borrow the precomputed line-start byte offsets.
150    /// `line_starts[i]` is the byte offset of the first character on line `i`.
151    pub fn line_starts(&self) -> &[u32] {
152        &self.line_starts
153    }
154
155    /// Bundle source and line index for position lookups.
156    pub fn view(&self) -> SourceView<'_> {
157        SourceView {
158            source: self.source(),
159            line_starts: self.line_starts(),
160        }
161    }
162}
163
164impl Default for ParsedDoc {
165    fn default() -> Self {
166        ParsedDoc::parse("")
167    }
168}
169
170// ── Span / position utilities ─────────────────────────────────────────────────
171
172/// Build a table of byte offsets for the start of each line.
173/// `result[i]` is the byte offset of the first character on line `i`.
174fn build_line_starts(source: &str) -> Vec<u32> {
175    let mut starts = vec![0u32];
176    for (i, b) in source.bytes().enumerate() {
177        if b == b'\n' {
178            starts.push(i as u32 + 1);
179        }
180    }
181    starts
182}
183
184/// Bundles source text with its precomputed line-start table.
185/// `Copy` so inner functions can pass it by value without lifetime annotation churn.
186#[derive(Copy, Clone)]
187pub struct SourceView<'a> {
188    source: &'a str,
189    line_starts: &'a [u32],
190}
191
192impl<'a> SourceView<'a> {
193    #[inline]
194    pub fn source(self) -> &'a str {
195        self.source
196    }
197
198    pub fn position_of(self, offset: u32) -> Position {
199        offset_to_position(self.source, self.line_starts, offset)
200    }
201
202    #[inline]
203    pub fn line_starts(self) -> &'a [u32] {
204        self.line_starts
205    }
206
207    /// O(log lines) line number for a byte offset. Cheaper than
208    /// `position_of(..).line` because it skips the UTF-16 column scan.
209    #[inline]
210    pub fn line_of(self, offset: u32) -> u32 {
211        match self.line_starts.partition_point(|&s| s <= offset) {
212            0 => 0,
213            i => (i - 1) as u32,
214        }
215    }
216
217    /// O(log lines) UTF-16 `Position` -> byte offset. Uses the precomputed
218    /// `line_starts` table to jump to the target line and only scans that
219    /// line's characters. Much faster than a linear scan over the whole
220    /// source when the cursor is late in the file.
221    pub fn byte_of_position(self, pos: Position) -> u32 {
222        let line_idx = pos.line as usize;
223        let line_start = self.line_starts.get(line_idx).copied().unwrap_or(0) as usize;
224        let line_end = self
225            .line_starts
226            .get(line_idx + 1)
227            .map(|&s| (s as usize).saturating_sub(1))
228            .unwrap_or(self.source.len());
229        let raw = &self.source[line_start..line_end.min(self.source.len())];
230        let line = raw.strip_suffix('\r').unwrap_or(raw);
231        let mut col_utf16: u32 = 0;
232        let mut byte_in_line: usize = 0;
233        for ch in line.chars() {
234            if col_utf16 >= pos.character {
235                break;
236            }
237            col_utf16 += ch.len_utf16() as u32;
238            byte_in_line += ch.len_utf8();
239        }
240        (line_start + byte_in_line) as u32
241    }
242
243    pub fn range_of(self, span: Span) -> Range {
244        Range {
245            start: self.position_of(span.start),
246            end: self.position_of(span.end),
247        }
248    }
249
250    pub fn name_range(self, name: &str) -> Range {
251        let start = str_offset(self.source, name).unwrap_or(0);
252        Range {
253            start: self.position_of(start),
254            end: self.position_of(start + name.len() as u32),
255        }
256    }
257
258    /// Like [`name_range`], but searches for `name` *within* `span` instead
259    /// of the whole source. Needed when the same name appears in earlier
260    /// docblock comments / other declarations — a global search would
261    /// otherwise point at the first textual occurrence, not the AST node
262    /// the caller actually meant.
263    pub fn name_range_in_span(self, name: &str, span: php_ast::Span) -> Range {
264        let s = span.start as usize;
265        let e = (span.end as usize).min(self.source.len());
266        let start = self
267            .source
268            .get(s..e)
269            .and_then(|slice| slice.find(name))
270            .map(|off| span.start + off as u32)
271            .unwrap_or_else(|| str_offset(self.source, name).unwrap_or(0));
272        Range {
273            start: self.position_of(start),
274            end: self.position_of(start + name.len() as u32),
275        }
276    }
277}
278
279/// Convert a byte offset into `source` to an LSP `Position` (0-based line/char).
280///
281/// Uses a precomputed `line_starts` table for O(log n) binary search.
282/// Handles both LF-only and CRLF line endings: a trailing `\r` before `\n` is
283/// not counted as a column so that positions are consistent regardless of
284/// line-ending style.
285pub fn offset_to_position(source: &str, line_starts: &[u32], offset: u32) -> Position {
286    let offset_usize = (offset as usize).min(source.len());
287    // Binary search: find the last line_start ≤ offset.
288    let line = match line_starts.partition_point(|&s| s <= offset) {
289        0 => 0u32,
290        i => (i - 1) as u32,
291    };
292    let line_start = line_starts.get(line as usize).copied().unwrap_or(0) as usize;
293    let segment = &source[line_start..offset_usize];
294    // Strip trailing \r to handle CRLF: don't count \r as a column.
295    let segment = segment.strip_suffix('\r').unwrap_or(segment);
296    let character = segment.chars().map(|c| c.len_utf16() as u32).sum::<u32>();
297    Position { line, character }
298}
299
300/// Convert a `Span` (byte-offset pair) to an LSP `Range`.
301pub fn span_to_range(source: &str, line_starts: &[u32], span: Span) -> Range {
302    Range {
303        start: offset_to_position(source, line_starts, span.start),
304        end: offset_to_position(source, line_starts, span.end),
305    }
306}
307
308/// Return the byte offset of `substr` within `source`, or None if not found.
309///
310/// Uses pointer arithmetic when `substr` is a true sub-slice of `source`
311/// (i.e. arena-allocated names pointing into the same backing string).
312/// Falls back to a content search when the pointers differ — this handles
313/// tests and callers that pass a differently-allocated copy of the source.
314///
315/// When falling back, prefers matches that are preceded and followed by
316/// non-identifier characters (word boundaries), to avoid matching name parts
317/// (e.g., finding "B" in "Base" when searching for a class named "B").
318pub fn str_offset(source: &str, substr: &str) -> Option<u32> {
319    let src_ptr = source.as_ptr() as usize;
320    let sub_ptr = substr.as_ptr() as usize;
321    if sub_ptr >= src_ptr && sub_ptr + substr.len() <= src_ptr + source.len() {
322        return Some((sub_ptr - src_ptr) as u32);
323    }
324    // Fallback: locate by content with word-boundary preference.
325    // When pointer matching fails, prefer matches where substr is surrounded by
326    // non-identifier characters to avoid matching partial names (e.g., "B" in "Base").
327    let mut search_pos = 0;
328    while let Some(offset) = source[search_pos..].find(substr) {
329        let abs_offset = search_pos + offset;
330        let is_start_boundary = abs_offset == 0
331            || !source[..abs_offset]
332                .chars()
333                .last()
334                .map(|c| c.is_alphanumeric() || c == '_')
335                .unwrap_or(false);
336        let end_pos = abs_offset + substr.len();
337        let is_end_boundary = end_pos >= source.len()
338            || !source[end_pos..]
339                .chars()
340                .next()
341                .map(|c| c.is_alphanumeric() || c == '_')
342                .unwrap_or(false);
343
344        if is_start_boundary && is_end_boundary {
345            return Some(abs_offset as u32);
346        }
347
348        search_pos = abs_offset + 1;
349    }
350    None
351}
352
353/// Build an LSP `Range` for a name that is a sub-slice of `source`, or None if not found.
354pub fn name_range(source: &str, line_starts: &[u32], name: &str) -> Option<Range> {
355    let start = str_offset(source, name)?;
356    Some(Range {
357        start: offset_to_position(source, line_starts, start),
358        end: offset_to_position(source, line_starts, start + name.len() as u32),
359    })
360}
361
362/// Find a name within a specific byte range of the source, with word-boundary matching.
363/// Returns the absolute byte offset if found, None otherwise.
364pub fn str_offset_in_range(source: &str, span: Span, name: &str) -> Option<u32> {
365    let span_start = span.start as usize;
366    let span_end = span.end as usize;
367    if span_end > source.len() {
368        return None;
369    }
370    let span_text = &source[span_start..span_end];
371    let offset = str_offset(span_text, name)?;
372    Some(span_start as u32 + offset)
373}
374
375// ── TypeHint formatting ────────────────────────────────────────────────────────
376
377/// Format a `TypeHint` as a PHP type string, e.g. `?int`, `string|null`.
378pub fn format_type_hint(hint: &TypeHint<'_, '_>) -> String {
379    fmt_kind(&hint.kind)
380}
381
382fn fmt_kind(kind: &TypeHintKind<'_, '_>) -> String {
383    match kind {
384        TypeHintKind::Named(name) => name.to_string_repr().to_string(),
385        TypeHintKind::Keyword(builtin, _) => builtin.as_str().to_string(),
386        TypeHintKind::Nullable(inner) => format!("?{}", format_type_hint(inner)),
387        TypeHintKind::Union(types) => types
388            .iter()
389            .map(format_type_hint)
390            .collect::<Vec<_>>()
391            .join("|"),
392        TypeHintKind::Intersection(types) => types
393            .iter()
394            .map(format_type_hint)
395            .collect::<Vec<_>>()
396            .join("&"),
397    }
398}
399
400#[cfg(test)]
401mod tests {
402    use super::*;
403
404    #[test]
405    fn parses_empty_source() {
406        let doc = ParsedDoc::parse("<?php".to_string());
407        assert!(doc.errors.is_empty());
408        assert!(doc.program().stmts.is_empty());
409    }
410
411    #[test]
412    fn parses_function() {
413        let doc = ParsedDoc::parse("<?php\nfunction foo() {}".to_string());
414        assert_eq!(doc.program().stmts.len(), 1);
415    }
416
417    #[test]
418    fn offset_to_position_first_line() {
419        let src = "<?php\nfoo";
420        let doc = ParsedDoc::parse(src.to_string());
421        assert_eq!(
422            offset_to_position(src, doc.line_starts(), 0),
423            Position {
424                line: 0,
425                character: 0
426            }
427        );
428    }
429
430    #[test]
431    fn offset_to_position_second_line() {
432        // "<?php\n" — offset 6 is start of line 1
433        let src = "<?php\nfoo";
434        let doc = ParsedDoc::parse(src.to_string());
435        assert_eq!(
436            offset_to_position(src, doc.line_starts(), 6),
437            Position {
438                line: 1,
439                character: 0
440            }
441        );
442    }
443
444    #[test]
445    fn offset_to_position_multibyte_utf16() {
446        // "é" is U+00E9: 2 UTF-8 bytes, 1 UTF-16 code unit.
447        // "😀" is U+1F600: 4 UTF-8 bytes, 2 UTF-16 code units.
448        // source: "a😀b" — byte offsets: a=0, 😀=1..5, b=5
449        // UTF-16:            a=col 0, 😀=col 1..3, b=col 3
450        let src = "a\u{1F600}b";
451        let doc = ParsedDoc::parse(src.to_string());
452        assert_eq!(
453            offset_to_position(src, doc.line_starts(), 5), // byte offset of 'b'
454            Position {
455                line: 0,
456                character: 3
457            }  // UTF-16 col 3
458        );
459    }
460
461    #[test]
462    fn offset_to_position_crlf_start_of_line() {
463        // CRLF: offset pointing to first char of line 1 must give character=0.
464        // "foo\r\nbar": f=0 o=1 o=2 \r=3 \n=4 b=5 a=6 r=7
465        let src = "foo\r\nbar";
466        let doc = ParsedDoc::parse(src.to_string());
467        assert_eq!(
468            offset_to_position(src, doc.line_starts(), 5), // 'b'
469            Position {
470                line: 1,
471                character: 0
472            }
473        );
474    }
475
476    #[test]
477    fn offset_to_position_crlf_does_not_count_cr_in_column() {
478        // Offset pointing to the \r itself must not count it as a column.
479        // "foo\r\nbar": the \r is at offset 3, column must be 3 (length of "foo").
480        let src = "foo\r\nbar";
481        let doc = ParsedDoc::parse(src.to_string());
482        assert_eq!(
483            offset_to_position(src, doc.line_starts(), 3), // '\r'
484            Position {
485                line: 0,
486                character: 3
487            }
488        );
489    }
490
491    #[test]
492    fn offset_to_position_crlf_multiline() {
493        // Multiple CRLF lines: columns must not accumulate stray \r counts.
494        // "a\r\nb\r\nc": a=0 \r=1 \n=2 b=3 \r=4 \n=5 c=6
495        let src = "a\r\nb\r\nc";
496        let doc = ParsedDoc::parse(src.to_string());
497        assert_eq!(
498            offset_to_position(src, doc.line_starts(), 6), // 'c'
499            Position {
500                line: 2,
501                character: 0
502            }
503        );
504        assert_eq!(
505            offset_to_position(src, doc.line_starts(), 3), // 'b'
506            Position {
507                line: 1,
508                character: 0
509            }
510        );
511    }
512
513    #[test]
514    fn str_offset_finds_substr() {
515        let src = "<?php\nfunction foo() {}";
516        let name = &src[15..18]; // "foo"
517        assert_eq!(str_offset(src, name), Some(15));
518    }
519
520    #[test]
521    fn str_offset_content_fallback_for_different_allocation() {
522        // "foo" is a separately owned String (not a sub-slice of the source),
523        // so pointer arithmetic fails. The fallback finds it by content.
524        let owned = "foo".to_string();
525        assert_eq!(str_offset("<?php foo", &owned), Some(6));
526    }
527
528    #[test]
529    fn str_offset_unrelated_content_returns_none() {
530        let owned = "bar".to_string();
531        assert_eq!(str_offset("<?php foo", &owned), None);
532    }
533}