Skip to main content

php_lsp/document/
ast.rs

1//! Core AST infrastructure: arena-backed `ParsedDoc`, span utilities, and TypeHint formatting.
2//!
3//! Convention: a borrowed [`SourceView`] is idiomatically bound to `sv` across
4//! the crate (offset↔position math). A `ParsedDoc` value is `doc`. See the
5//! crate-root glossary in `lib.rs`.
6use std::mem::ManuallyDrop;
7use std::sync::{Arc, LazyLock, Mutex};
8
9use php_ast::{Program, Span, TypeHint, TypeHintKind};
10use tower_lsp::lsp_types::{Position, Range};
11
12// ── BumpPool ──────────────────────────────────────────────────────────────────
13
14const POOL_CAP: usize = 8;
15
16struct BumpPool {
17    // Box<Bump> required: arena_box.as_ref() is transmuted to &'static Bump in
18    // ParsedDoc::parse(). The Box keeps the Bump at a stable heap address so
19    // that reference remains valid after the Box is moved into ArenaGuard.
20    #[allow(clippy::vec_box)]
21    pool: Mutex<Vec<Box<bumpalo::Bump>>>,
22}
23
24impl BumpPool {
25    fn take(&self) -> Box<bumpalo::Bump> {
26        self.pool
27            .lock()
28            .unwrap()
29            .pop()
30            .unwrap_or_else(|| Box::new(bumpalo::Bump::new()))
31    }
32
33    fn give(&self, mut arena: Box<bumpalo::Bump>) {
34        arena.reset();
35        let mut p = self.pool.lock().unwrap();
36        if p.len() < POOL_CAP {
37            p.push(arena);
38        }
39    }
40}
41
42static BUMP_POOL: LazyLock<BumpPool> = LazyLock::new(|| BumpPool {
43    pool: Mutex::new(Vec::new()),
44});
45
46// ── ArenaGuard ────────────────────────────────────────────────────────────────
47
48/// Returns the arena to the pool on drop.
49struct ArenaGuard(Option<Box<bumpalo::Bump>>);
50
51impl Drop for ArenaGuard {
52    fn drop(&mut self) {
53        if let Some(arena) = self.0.take() {
54            BUMP_POOL.give(arena);
55        }
56    }
57}
58
59// ── ParsedDoc ─────────────────────────────────────────────────────────────────
60
61/// Owns a parsed PHP document: the bumpalo arena, source snapshot, and Program.
62///
63/// SAFETY invariants:
64/// - `program` uses `ManuallyDrop` and is explicitly dropped in `Drop::drop()`
65///   before any field auto-drop runs. This guarantees arena-allocated nodes are
66///   gone before `ArenaGuard` recycles the arena — regardless of field order.
67/// - `_source: Arc<str>` keeps the source text alive at a stable heap address.
68///   The transmuted `&'static str` in the parser arena remains valid for the
69///   lifetime of `ParsedDoc` because `_source` is dropped last (after `program`
70///   is manually dropped and after `_arena` is recycled).
71/// - The `'static` lifetimes in `ManuallyDrop<Box<Program<'static, 'static>>>`
72///   are erased versions of the true lifetimes `'_arena` and `'_source`. The
73///   public `program()` accessor re-attaches them to `&self`, preventing any
74///   reference from escaping beyond the lifetime of the `ParsedDoc`.
75pub struct ParsedDoc {
76    program: ManuallyDrop<Box<Program<'static, 'static>>>,
77    pub errors: Vec<php_rs_parser::diagnostics::ParseError>,
78    _source: Arc<str>,
79    line_starts: Vec<u32>,
80    _arena: ArenaGuard,
81}
82
83impl Drop for ParsedDoc {
84    fn drop(&mut self) {
85        // Drop program explicitly before any field auto-drop runs (including
86        // _arena). ManuallyDrop prevents a second drop after this method returns.
87        // SAFETY: called exactly once here; no other code drops this field.
88        unsafe { ManuallyDrop::drop(&mut self.program) };
89    }
90}
91
92// SAFETY: Program nodes contain only data; no thread-local state.
93unsafe impl Send for ParsedDoc {}
94unsafe impl Sync for ParsedDoc {}
95
96impl ParsedDoc {
97    /// Parse PHP source text. Accepts `String`, `&str`, or `Arc<str>` — when
98    /// the caller already holds an `Arc<str>` (e.g. the salsa `parsed_doc`
99    /// query), no heap allocation occurs (refcount bump only).
100    pub fn parse(source: impl Into<Arc<str>>) -> Self {
101        let source: Arc<str> = source.into();
102        // Take a pre-warmed arena from the pool (or allocate a fresh one).
103        let arena_box = BUMP_POOL.take();
104
105        // SAFETY: `Arc<str>` data lives at a stable heap address for as long as
106        // the Arc is alive. `_source` keeps the Arc alive for `ParsedDoc`'s
107        // lifetime, so the transmuted `&'static str` remains valid. The arena
108        // box is similarly stable — moving a `Box` moves the pointer, not the
109        // heap data.
110        let src_ref: &'static str = unsafe { std::mem::transmute::<&str, &'static str>(&*source) };
111        let arena_ref: &'static bumpalo::Bump = unsafe {
112            std::mem::transmute::<&bumpalo::Bump, &'static bumpalo::Bump>(arena_box.as_ref())
113        };
114
115        let result = php_rs_parser::parse_arena(arena_ref, src_ref);
116
117        let line_starts = build_line_starts(src_ref);
118
119        ParsedDoc {
120            program: ManuallyDrop::new(Box::new(result.program)),
121            errors: result.errors,
122            _source: source,
123            line_starts,
124            _arena: ArenaGuard(Some(arena_box)),
125        }
126    }
127
128    /// Borrow the program with lifetimes bounded by `&self`.
129    ///
130    /// SAFETY: covariance of `Program<'arena, 'src>` in both parameters lets
131    /// `&Program<'static, 'static>` shorten to `&Program<'_, '_>`.
132    #[inline]
133    pub fn program(&self) -> &Program<'_, '_> {
134        &self.program
135    }
136
137    /// Borrow the source text used when parsing.
138    #[inline]
139    pub fn source(&self) -> &str {
140        &self._source
141    }
142
143    /// Clone the `Arc<str>` backing the source text (refcount bump only).
144    ///
145    /// Callers that need to store the same source text in a salsa input
146    /// should use this to get the identical pointer — enabling cheap
147    /// `Arc::ptr_eq` validation in the `parsed_cache`.
148    #[inline]
149    pub fn source_arc(&self) -> Arc<str> {
150        self._source.clone()
151    }
152
153    /// Borrow the precomputed line-start byte offsets.
154    /// `line_starts[i]` is the byte offset of the first character on line `i`.
155    pub fn line_starts(&self) -> &[u32] {
156        &self.line_starts
157    }
158
159    /// Bundle source and line index for position lookups.
160    pub fn view(&self) -> SourceView<'_> {
161        SourceView {
162            source: self.source(),
163            line_starts: self.line_starts(),
164        }
165    }
166}
167
168impl Default for ParsedDoc {
169    fn default() -> Self {
170        ParsedDoc::parse("")
171    }
172}
173
174// ── Span / position utilities ─────────────────────────────────────────────────
175
176/// Build a table of byte offsets for the start of each line.
177/// `result[i]` is the byte offset of the first character on line `i`.
178fn build_line_starts(source: &str) -> Vec<u32> {
179    let mut starts = vec![0u32];
180    for (i, b) in source.bytes().enumerate() {
181        if b == b'\n' {
182            starts.push(i as u32 + 1);
183        }
184    }
185    starts
186}
187
188/// Bundles source text with its precomputed line-start table.
189/// `Copy` so inner functions can pass it by value without lifetime annotation churn.
190#[derive(Copy, Clone)]
191pub struct SourceView<'a> {
192    source: &'a str,
193    line_starts: &'a [u32],
194}
195
196impl<'a> SourceView<'a> {
197    #[inline]
198    pub fn source(self) -> &'a str {
199        self.source
200    }
201
202    pub fn position_of(self, offset: u32) -> Position {
203        offset_to_position(self.source, self.line_starts, offset)
204    }
205
206    #[inline]
207    pub fn line_starts(self) -> &'a [u32] {
208        self.line_starts
209    }
210
211    /// O(log lines) line number for a byte offset. Cheaper than
212    /// `position_of(..).line` because it skips the UTF-16 column scan.
213    #[inline]
214    pub fn line_of(self, offset: u32) -> u32 {
215        match self.line_starts.partition_point(|&s| s <= offset) {
216            0 => 0,
217            i => (i - 1) as u32,
218        }
219    }
220
221    /// O(log lines) UTF-16 `Position` -> byte offset. Uses the precomputed
222    /// `line_starts` table to jump to the target line and only scans that
223    /// line's characters. Much faster than a linear scan over the whole
224    /// source when the cursor is late in the file.
225    pub fn byte_of_position(self, pos: Position) -> u32 {
226        let line_idx = pos.line as usize;
227        let line_start = self.line_starts.get(line_idx).copied().unwrap_or(0) as usize;
228        let line_end = self
229            .line_starts
230            .get(line_idx + 1)
231            .map(|&s| (s as usize).saturating_sub(1))
232            .unwrap_or(self.source.len());
233        let raw = &self.source[line_start..line_end.min(self.source.len())];
234        let line = raw.strip_suffix('\r').unwrap_or(raw);
235        let mut col_utf16: u32 = 0;
236        let mut byte_in_line: usize = 0;
237        for ch in line.chars() {
238            if col_utf16 >= pos.character {
239                break;
240            }
241            col_utf16 += ch.len_utf16() as u32;
242            byte_in_line += ch.len_utf8();
243        }
244        (line_start + byte_in_line) as u32
245    }
246
247    pub fn range_of(self, span: Span) -> Range {
248        Range {
249            start: self.position_of(span.start),
250            end: self.position_of(span.end),
251        }
252    }
253
254    pub fn name_range(self, name: &str) -> Range {
255        let start = str_offset(self.source, name).unwrap_or(0);
256        Range {
257            start: self.position_of(start),
258            end: self.position_of(start + name.len() as u32),
259        }
260    }
261
262    /// Like [`name_range`], but searches for `name` *within* `span` instead
263    /// of the whole source. Needed when the same name appears in earlier
264    /// docblock comments / other declarations — a global search would
265    /// otherwise point at the first textual occurrence, not the AST node
266    /// the caller actually meant.
267    pub fn name_range_in_span(self, name: &str, span: php_ast::Span) -> Range {
268        let s = span.start as usize;
269        let e = (span.end as usize).min(self.source.len());
270        let start = self
271            .source
272            .get(s..e)
273            .and_then(|slice| slice.find(name))
274            .map(|off| span.start + off as u32)
275            .unwrap_or_else(|| str_offset(self.source, name).unwrap_or(0));
276        Range {
277            start: self.position_of(start),
278            end: self.position_of(start + name.len() as u32),
279        }
280    }
281}
282
283/// Convert a byte offset into `source` to an LSP `Position` (0-based line/char).
284///
285/// Uses a precomputed `line_starts` table for O(log n) binary search.
286/// Handles both LF-only and CRLF line endings: a trailing `\r` before `\n` is
287/// not counted as a column so that positions are consistent regardless of
288/// line-ending style.
289pub fn offset_to_position(source: &str, line_starts: &[u32], offset: u32) -> Position {
290    let offset_usize = (offset as usize).min(source.len());
291    // Binary search: find the last line_start ≤ offset.
292    let line = match line_starts.partition_point(|&s| s <= offset) {
293        0 => 0u32,
294        i => (i - 1) as u32,
295    };
296    let line_start = line_starts.get(line as usize).copied().unwrap_or(0) as usize;
297    let segment = &source[line_start..offset_usize];
298    // Strip trailing \r to handle CRLF: don't count \r as a column.
299    let segment = segment.strip_suffix('\r').unwrap_or(segment);
300    let character = segment.chars().map(|c| c.len_utf16() as u32).sum::<u32>();
301    Position { line, character }
302}
303
304/// Convert a `Span` (byte-offset pair) to an LSP `Range`.
305pub fn span_to_range(source: &str, line_starts: &[u32], span: Span) -> Range {
306    Range {
307        start: offset_to_position(source, line_starts, span.start),
308        end: offset_to_position(source, line_starts, span.end),
309    }
310}
311
312/// Return the byte offset of `substr` within `source`, or None if not found.
313///
314/// Uses pointer arithmetic when `substr` is a true sub-slice of `source`
315/// (i.e. arena-allocated names pointing into the same backing string).
316/// Falls back to a content search when the pointers differ — this handles
317/// tests and callers that pass a differently-allocated copy of the source.
318///
319/// When falling back, prefers matches that are preceded and followed by
320/// non-identifier characters (word boundaries), to avoid matching name parts
321/// (e.g., finding "B" in "Base" when searching for a class named "B").
322pub fn str_offset(source: &str, substr: &str) -> Option<u32> {
323    let src_ptr = source.as_ptr() as usize;
324    let sub_ptr = substr.as_ptr() as usize;
325    if sub_ptr >= src_ptr && sub_ptr + substr.len() <= src_ptr + source.len() {
326        return Some((sub_ptr - src_ptr) as u32);
327    }
328    // Fallback: locate by content with word-boundary preference.
329    // When pointer matching fails, prefer matches where substr is surrounded by
330    // non-identifier characters to avoid matching partial names (e.g., "B" in "Base").
331    let mut search_pos = 0;
332    while let Some(offset) = source[search_pos..].find(substr) {
333        let abs_offset = search_pos + offset;
334        let is_start_boundary = abs_offset == 0
335            || !source[..abs_offset]
336                .chars()
337                .last()
338                .map(|c| c.is_alphanumeric() || c == '_')
339                .unwrap_or(false);
340        let end_pos = abs_offset + substr.len();
341        let is_end_boundary = end_pos >= source.len()
342            || !source[end_pos..]
343                .chars()
344                .next()
345                .map(|c| c.is_alphanumeric() || c == '_')
346                .unwrap_or(false);
347
348        if is_start_boundary && is_end_boundary {
349            return Some(abs_offset as u32);
350        }
351
352        search_pos = abs_offset + 1;
353    }
354    None
355}
356
357/// Build an LSP `Range` for a name that is a sub-slice of `source`, or None if not found.
358pub fn name_range(source: &str, line_starts: &[u32], name: &str) -> Option<Range> {
359    let start = str_offset(source, name)?;
360    Some(Range {
361        start: offset_to_position(source, line_starts, start),
362        end: offset_to_position(source, line_starts, start + name.len() as u32),
363    })
364}
365
366/// Find a name within a specific byte range of the source, with word-boundary matching.
367/// Returns the absolute byte offset if found, None otherwise.
368pub fn str_offset_in_range(source: &str, span: Span, name: &str) -> Option<u32> {
369    let span_start = span.start as usize;
370    let span_end = span.end as usize;
371    if span_end > source.len() {
372        return None;
373    }
374    let span_text = &source[span_start..span_end];
375    let offset = str_offset(span_text, name)?;
376    Some(span_start as u32 + offset)
377}
378
379// ── TypeHint formatting ────────────────────────────────────────────────────────
380
381/// Format a `TypeHint` as a PHP type string, e.g. `?int`, `string|null`.
382pub fn format_type_hint(hint: &TypeHint<'_, '_>) -> String {
383    fmt_kind(&hint.kind)
384}
385
386fn fmt_kind(kind: &TypeHintKind<'_, '_>) -> String {
387    match kind {
388        TypeHintKind::Named(name) => name.to_string_repr().to_string(),
389        TypeHintKind::Keyword(builtin, _) => builtin.as_str().to_string(),
390        TypeHintKind::Nullable(inner) => format!("?{}", format_type_hint(inner)),
391        TypeHintKind::Union(types) => types
392            .iter()
393            .map(format_type_hint)
394            .collect::<Vec<_>>()
395            .join("|"),
396        TypeHintKind::Intersection(types) => types
397            .iter()
398            .map(format_type_hint)
399            .collect::<Vec<_>>()
400            .join("&"),
401    }
402}
403
404#[cfg(test)]
405mod tests {
406    use super::*;
407
408    #[test]
409    fn parses_empty_source() {
410        let doc = ParsedDoc::parse("<?php".to_string());
411        assert!(doc.errors.is_empty());
412        assert!(doc.program().stmts.is_empty());
413    }
414
415    #[test]
416    fn parses_function() {
417        let doc = ParsedDoc::parse("<?php\nfunction foo() {}".to_string());
418        assert_eq!(doc.program().stmts.len(), 1);
419    }
420
421    #[test]
422    fn offset_to_position_first_line() {
423        let src = "<?php\nfoo";
424        let doc = ParsedDoc::parse(src.to_string());
425        assert_eq!(
426            offset_to_position(src, doc.line_starts(), 0),
427            Position {
428                line: 0,
429                character: 0
430            }
431        );
432    }
433
434    #[test]
435    fn offset_to_position_second_line() {
436        // "<?php\n" — offset 6 is start of line 1
437        let src = "<?php\nfoo";
438        let doc = ParsedDoc::parse(src.to_string());
439        assert_eq!(
440            offset_to_position(src, doc.line_starts(), 6),
441            Position {
442                line: 1,
443                character: 0
444            }
445        );
446    }
447
448    #[test]
449    fn offset_to_position_multibyte_utf16() {
450        // "é" is U+00E9: 2 UTF-8 bytes, 1 UTF-16 code unit.
451        // "😀" is U+1F600: 4 UTF-8 bytes, 2 UTF-16 code units.
452        // source: "a😀b" — byte offsets: a=0, 😀=1..5, b=5
453        // UTF-16:            a=col 0, 😀=col 1..3, b=col 3
454        let src = "a\u{1F600}b";
455        let doc = ParsedDoc::parse(src.to_string());
456        assert_eq!(
457            offset_to_position(src, doc.line_starts(), 5), // byte offset of 'b'
458            Position {
459                line: 0,
460                character: 3
461            }  // UTF-16 col 3
462        );
463    }
464
465    #[test]
466    fn offset_to_position_crlf_start_of_line() {
467        // CRLF: offset pointing to first char of line 1 must give character=0.
468        // "foo\r\nbar": f=0 o=1 o=2 \r=3 \n=4 b=5 a=6 r=7
469        let src = "foo\r\nbar";
470        let doc = ParsedDoc::parse(src.to_string());
471        assert_eq!(
472            offset_to_position(src, doc.line_starts(), 5), // 'b'
473            Position {
474                line: 1,
475                character: 0
476            }
477        );
478    }
479
480    #[test]
481    fn offset_to_position_crlf_does_not_count_cr_in_column() {
482        // Offset pointing to the \r itself must not count it as a column.
483        // "foo\r\nbar": the \r is at offset 3, column must be 3 (length of "foo").
484        let src = "foo\r\nbar";
485        let doc = ParsedDoc::parse(src.to_string());
486        assert_eq!(
487            offset_to_position(src, doc.line_starts(), 3), // '\r'
488            Position {
489                line: 0,
490                character: 3
491            }
492        );
493    }
494
495    #[test]
496    fn offset_to_position_crlf_multiline() {
497        // Multiple CRLF lines: columns must not accumulate stray \r counts.
498        // "a\r\nb\r\nc": a=0 \r=1 \n=2 b=3 \r=4 \n=5 c=6
499        let src = "a\r\nb\r\nc";
500        let doc = ParsedDoc::parse(src.to_string());
501        assert_eq!(
502            offset_to_position(src, doc.line_starts(), 6), // 'c'
503            Position {
504                line: 2,
505                character: 0
506            }
507        );
508        assert_eq!(
509            offset_to_position(src, doc.line_starts(), 3), // 'b'
510            Position {
511                line: 1,
512                character: 0
513            }
514        );
515    }
516
517    #[test]
518    fn str_offset_finds_substr() {
519        let src = "<?php\nfunction foo() {}";
520        let name = &src[15..18]; // "foo"
521        assert_eq!(str_offset(src, name), Some(15));
522    }
523
524    #[test]
525    fn str_offset_content_fallback_for_different_allocation() {
526        // "foo" is a separately owned String (not a sub-slice of the source),
527        // so pointer arithmetic fails. The fallback finds it by content.
528        let owned = "foo".to_string();
529        assert_eq!(str_offset("<?php foo", &owned), Some(6));
530    }
531
532    #[test]
533    fn str_offset_unrelated_content_returns_none() {
534        let owned = "bar".to_string();
535        assert_eq!(str_offset("<?php foo", &owned), None);
536    }
537}