Skip to main content

php_lsp/
ast.rs

1/// Core AST infrastructure: arena-backed `ParsedDoc`, span utilities, and TypeHint formatting.
2use std::collections::HashMap;
3use std::mem::ManuallyDrop;
4use std::sync::{LazyLock, Mutex};
5
6use php_ast::{Program, Span, TypeHint, TypeHintKind};
7use tower_lsp::lsp_types::{Position, Range};
8
9/// Cached per-doc map of `class_name -> method_name -> return_class_name`.
10/// Exposed here (rather than in `type_map`) because it lives on `ParsedDoc`.
11pub type MethodReturnsMap = HashMap<String, HashMap<String, String>>;
12
13// ── BumpPool ──────────────────────────────────────────────────────────────────
14
15const POOL_CAP: usize = 8;
16
17struct BumpPool {
18    // Box<Bump> required: arena_box.as_ref() is transmuted to &'static Bump in
19    // ParsedDoc::parse(). The Box keeps the Bump at a stable heap address so
20    // that reference remains valid after the Box is moved into ArenaGuard.
21    #[allow(clippy::vec_box)]
22    pool: Mutex<Vec<Box<bumpalo::Bump>>>,
23}
24
25impl BumpPool {
26    fn take(&self) -> Box<bumpalo::Bump> {
27        self.pool
28            .lock()
29            .unwrap()
30            .pop()
31            .unwrap_or_else(|| Box::new(bumpalo::Bump::new()))
32    }
33
34    fn give(&self, mut arena: Box<bumpalo::Bump>) {
35        arena.reset();
36        let mut p = self.pool.lock().unwrap();
37        if p.len() < POOL_CAP {
38            p.push(arena);
39        }
40    }
41}
42
43static BUMP_POOL: LazyLock<BumpPool> = LazyLock::new(|| BumpPool {
44    pool: Mutex::new(Vec::new()),
45});
46
47// ── ArenaGuard ────────────────────────────────────────────────────────────────
48
49/// Returns the arena to the pool on drop.
50struct ArenaGuard(Option<Box<bumpalo::Bump>>);
51
52impl Drop for ArenaGuard {
53    fn drop(&mut self) {
54        if let Some(arena) = self.0.take() {
55            BUMP_POOL.give(arena);
56        }
57    }
58}
59
60// ── ParsedDoc ─────────────────────────────────────────────────────────────────
61
62/// Owns a parsed PHP document: the bumpalo arena, source snapshot, and Program.
63///
64/// SAFETY invariants:
65/// - `program` uses `ManuallyDrop` and is explicitly dropped in `Drop::drop()`
66///   before any field auto-drop runs. This guarantees arena-allocated nodes are
67///   gone before `ArenaGuard` recycles the arena — regardless of field order.
68/// - Both `_arena` and `_source` are `Box`-allocated; their heap addresses are
69///   stable and never move.
70/// - The `'static` lifetimes in `ManuallyDrop<Box<Program<'static, 'static>>>`
71///   are erased versions of the true lifetimes `'_arena` and `'_source`. The
72///   public `program()` accessor re-attaches them to `&self`, preventing any
73///   reference from escaping beyond the lifetime of the `ParsedDoc`.
74pub struct ParsedDoc {
75    program: ManuallyDrop<Box<Program<'static, 'static>>>,
76    pub errors: Vec<php_rs_parser::diagnostics::ParseError>,
77    #[allow(clippy::box_collection)]
78    _source: Box<String>,
79    line_starts: Vec<u32>,
80    _arena: ArenaGuard,
81}
82
83impl Drop for ParsedDoc {
84    fn drop(&mut self) {
85        // Drop program explicitly before any field auto-drop runs (including
86        // _arena). ManuallyDrop prevents a second drop after this method returns.
87        // SAFETY: called exactly once here; no other code drops this field.
88        unsafe { ManuallyDrop::drop(&mut self.program) };
89    }
90}
91
92// SAFETY: Program nodes contain only data; no thread-local state.
93unsafe impl Send for ParsedDoc {}
94unsafe impl Sync for ParsedDoc {}
95
96impl ParsedDoc {
97    pub fn parse(source: String) -> Self {
98        let source_box = Box::new(source);
99        // Take a pre-warmed arena from the pool (or allocate a fresh one).
100        let arena_box = BUMP_POOL.take();
101
102        // SAFETY: Both boxes are on the heap; moving a Box<T> moves the pointer,
103        // not the heap data. These references therefore remain valid for as long
104        // as the boxes (and hence `self`) are alive.
105        let src_ref: &'static str =
106            unsafe { std::mem::transmute::<&str, &'static str>(source_box.as_str()) };
107        let arena_ref: &'static bumpalo::Bump = unsafe {
108            std::mem::transmute::<&bumpalo::Bump, &'static bumpalo::Bump>(arena_box.as_ref())
109        };
110
111        let result = php_rs_parser::parse(arena_ref, src_ref);
112
113        let line_starts = build_line_starts(src_ref);
114
115        ParsedDoc {
116            program: ManuallyDrop::new(Box::new(result.program)),
117            errors: result.errors,
118            _source: source_box,
119            line_starts,
120            _arena: ArenaGuard(Some(arena_box)),
121        }
122    }
123
124    /// Borrow the program with lifetimes bounded by `&self`.
125    ///
126    /// SAFETY: covariance of `Program<'arena, 'src>` in both parameters lets
127    /// `&Program<'static, 'static>` shorten to `&Program<'_, '_>`.
128    #[inline]
129    pub fn program(&self) -> &Program<'_, '_> {
130        &self.program
131    }
132
133    /// Borrow the source text used when parsing.
134    #[inline]
135    pub fn source(&self) -> &str {
136        &self._source
137    }
138
139    /// Borrow the precomputed line-start byte offsets.
140    /// `line_starts[i]` is the byte offset of the first character on line `i`.
141    pub fn line_starts(&self) -> &[u32] {
142        &self.line_starts
143    }
144
145    /// Bundle source and line index for position lookups.
146    pub fn view(&self) -> SourceView<'_> {
147        SourceView {
148            source: self.source(),
149            line_starts: self.line_starts(),
150        }
151    }
152}
153
154impl Default for ParsedDoc {
155    fn default() -> Self {
156        ParsedDoc::parse(String::new())
157    }
158}
159
160// ── Span / position utilities ─────────────────────────────────────────────────
161
162/// Build a table of byte offsets for the start of each line.
163/// `result[i]` is the byte offset of the first character on line `i`.
164fn build_line_starts(source: &str) -> Vec<u32> {
165    let mut starts = vec![0u32];
166    for (i, b) in source.bytes().enumerate() {
167        if b == b'\n' {
168            starts.push(i as u32 + 1);
169        }
170    }
171    starts
172}
173
174/// Bundles source text with its precomputed line-start table.
175/// `Copy` so inner functions can pass it by value without lifetime annotation churn.
176#[derive(Copy, Clone)]
177pub struct SourceView<'a> {
178    source: &'a str,
179    line_starts: &'a [u32],
180}
181
182impl<'a> SourceView<'a> {
183    #[inline]
184    pub fn source(self) -> &'a str {
185        self.source
186    }
187
188    pub fn position_of(self, offset: u32) -> Position {
189        offset_to_position(self.source, self.line_starts, offset)
190    }
191
192    #[inline]
193    pub fn line_starts(self) -> &'a [u32] {
194        self.line_starts
195    }
196
197    /// O(log lines) line number for a byte offset. Cheaper than
198    /// `position_of(..).line` because it skips the UTF-16 column scan.
199    #[inline]
200    pub fn line_of(self, offset: u32) -> u32 {
201        match self.line_starts.partition_point(|&s| s <= offset) {
202            0 => 0,
203            i => (i - 1) as u32,
204        }
205    }
206
207    /// O(log lines) UTF-16 `Position` -> byte offset. Uses the precomputed
208    /// `line_starts` table to jump to the target line and only scans that
209    /// line's characters. Much faster than a linear scan over the whole
210    /// source when the cursor is late in the file.
211    pub fn byte_of_position(self, pos: Position) -> u32 {
212        let line_idx = pos.line as usize;
213        let line_start = self.line_starts.get(line_idx).copied().unwrap_or(0) as usize;
214        let line_end = self
215            .line_starts
216            .get(line_idx + 1)
217            .map(|&s| (s as usize).saturating_sub(1))
218            .unwrap_or(self.source.len());
219        let raw = &self.source[line_start..line_end.min(self.source.len())];
220        let line = raw.strip_suffix('\r').unwrap_or(raw);
221        let mut col_utf16: u32 = 0;
222        let mut byte_in_line: usize = 0;
223        for ch in line.chars() {
224            if col_utf16 >= pos.character {
225                break;
226            }
227            col_utf16 += ch.len_utf16() as u32;
228            byte_in_line += ch.len_utf8();
229        }
230        (line_start + byte_in_line) as u32
231    }
232
233    pub fn range_of(self, span: Span) -> Range {
234        Range {
235            start: self.position_of(span.start),
236            end: self.position_of(span.end),
237        }
238    }
239
240    pub fn name_range(self, name: &str) -> Range {
241        let start = str_offset(self.source, name);
242        Range {
243            start: self.position_of(start),
244            end: self.position_of(start + name.len() as u32),
245        }
246    }
247}
248
249/// Convert a byte offset into `source` to an LSP `Position` (0-based line/char).
250///
251/// Uses a precomputed `line_starts` table for O(log n) binary search.
252/// Handles both LF-only and CRLF line endings: a trailing `\r` before `\n` is
253/// not counted as a column so that positions are consistent regardless of
254/// line-ending style.
255pub fn offset_to_position(source: &str, line_starts: &[u32], offset: u32) -> Position {
256    let offset_usize = (offset as usize).min(source.len());
257    // Binary search: find the last line_start ≤ offset.
258    let line = match line_starts.partition_point(|&s| s <= offset) {
259        0 => 0u32,
260        i => (i - 1) as u32,
261    };
262    let line_start = line_starts.get(line as usize).copied().unwrap_or(0) as usize;
263    let segment = &source[line_start..offset_usize];
264    // Strip trailing \r to handle CRLF: don't count \r as a column.
265    let segment = segment.strip_suffix('\r').unwrap_or(segment);
266    let character = segment.chars().map(|c| c.len_utf16() as u32).sum::<u32>();
267    Position { line, character }
268}
269
270/// Convert a `Span` (byte-offset pair) to an LSP `Range`.
271pub fn span_to_range(source: &str, line_starts: &[u32], span: Span) -> Range {
272    Range {
273        start: offset_to_position(source, line_starts, span.start),
274        end: offset_to_position(source, line_starts, span.end),
275    }
276}
277
278/// Return the byte offset of `substr` within `source`.
279///
280/// Uses pointer arithmetic when `substr` is a true sub-slice of `source`
281/// (i.e. arena-allocated names pointing into the same backing string).
282/// Falls back to a content search when the pointers differ — this handles
283/// tests and callers that pass a differently-allocated copy of the source.
284pub fn str_offset(source: &str, substr: &str) -> u32 {
285    let src_ptr = source.as_ptr() as usize;
286    let sub_ptr = substr.as_ptr() as usize;
287    if sub_ptr >= src_ptr && sub_ptr + substr.len() <= src_ptr + source.len() {
288        return (sub_ptr - src_ptr) as u32;
289    }
290    // Fallback: locate by content (same text, different allocation).
291    source.find(substr).unwrap_or(0) as u32
292}
293
294/// Build an LSP `Range` for a name that is a sub-slice of `source`.
295pub fn name_range(source: &str, line_starts: &[u32], name: &str) -> Range {
296    let start = str_offset(source, name);
297    Range {
298        start: offset_to_position(source, line_starts, start),
299        end: offset_to_position(source, line_starts, start + name.len() as u32),
300    }
301}
302
303// ── TypeHint formatting ────────────────────────────────────────────────────────
304
305/// Format a `TypeHint` as a PHP type string, e.g. `?int`, `string|null`.
306pub fn format_type_hint(hint: &TypeHint<'_, '_>) -> String {
307    fmt_kind(&hint.kind)
308}
309
310fn fmt_kind(kind: &TypeHintKind<'_, '_>) -> String {
311    match kind {
312        TypeHintKind::Named(name) => name.to_string_repr().to_string(),
313        TypeHintKind::Keyword(builtin, _) => builtin.as_str().to_string(),
314        TypeHintKind::Nullable(inner) => format!("?{}", format_type_hint(inner)),
315        TypeHintKind::Union(types) => types
316            .iter()
317            .map(format_type_hint)
318            .collect::<Vec<_>>()
319            .join("|"),
320        TypeHintKind::Intersection(types) => types
321            .iter()
322            .map(format_type_hint)
323            .collect::<Vec<_>>()
324            .join("&"),
325    }
326}
327
328#[cfg(test)]
329mod tests {
330    use super::*;
331
332    #[test]
333    fn parses_empty_source() {
334        let doc = ParsedDoc::parse("<?php".to_string());
335        assert!(doc.errors.is_empty());
336        assert!(doc.program().stmts.is_empty());
337    }
338
339    #[test]
340    fn parses_function() {
341        let doc = ParsedDoc::parse("<?php\nfunction foo() {}".to_string());
342        assert_eq!(doc.program().stmts.len(), 1);
343    }
344
345    #[test]
346    fn offset_to_position_first_line() {
347        let src = "<?php\nfoo";
348        let doc = ParsedDoc::parse(src.to_string());
349        assert_eq!(
350            offset_to_position(src, doc.line_starts(), 0),
351            Position {
352                line: 0,
353                character: 0
354            }
355        );
356    }
357
358    #[test]
359    fn offset_to_position_second_line() {
360        // "<?php\n" — offset 6 is start of line 1
361        let src = "<?php\nfoo";
362        let doc = ParsedDoc::parse(src.to_string());
363        assert_eq!(
364            offset_to_position(src, doc.line_starts(), 6),
365            Position {
366                line: 1,
367                character: 0
368            }
369        );
370    }
371
372    #[test]
373    fn offset_to_position_multibyte_utf16() {
374        // "é" is U+00E9: 2 UTF-8 bytes, 1 UTF-16 code unit.
375        // "😀" is U+1F600: 4 UTF-8 bytes, 2 UTF-16 code units.
376        // source: "a😀b" — byte offsets: a=0, 😀=1..5, b=5
377        // UTF-16:            a=col 0, 😀=col 1..3, b=col 3
378        let src = "a\u{1F600}b";
379        let doc = ParsedDoc::parse(src.to_string());
380        assert_eq!(
381            offset_to_position(src, doc.line_starts(), 5), // byte offset of 'b'
382            Position {
383                line: 0,
384                character: 3
385            }  // UTF-16 col 3
386        );
387    }
388
389    #[test]
390    fn offset_to_position_crlf_start_of_line() {
391        // CRLF: offset pointing to first char of line 1 must give character=0.
392        // "foo\r\nbar": f=0 o=1 o=2 \r=3 \n=4 b=5 a=6 r=7
393        let src = "foo\r\nbar";
394        let doc = ParsedDoc::parse(src.to_string());
395        assert_eq!(
396            offset_to_position(src, doc.line_starts(), 5), // 'b'
397            Position {
398                line: 1,
399                character: 0
400            }
401        );
402    }
403
404    #[test]
405    fn offset_to_position_crlf_does_not_count_cr_in_column() {
406        // Offset pointing to the \r itself must not count it as a column.
407        // "foo\r\nbar": the \r is at offset 3, column must be 3 (length of "foo").
408        let src = "foo\r\nbar";
409        let doc = ParsedDoc::parse(src.to_string());
410        assert_eq!(
411            offset_to_position(src, doc.line_starts(), 3), // '\r'
412            Position {
413                line: 0,
414                character: 3
415            }
416        );
417    }
418
419    #[test]
420    fn offset_to_position_crlf_multiline() {
421        // Multiple CRLF lines: columns must not accumulate stray \r counts.
422        // "a\r\nb\r\nc": a=0 \r=1 \n=2 b=3 \r=4 \n=5 c=6
423        let src = "a\r\nb\r\nc";
424        let doc = ParsedDoc::parse(src.to_string());
425        assert_eq!(
426            offset_to_position(src, doc.line_starts(), 6), // 'c'
427            Position {
428                line: 2,
429                character: 0
430            }
431        );
432        assert_eq!(
433            offset_to_position(src, doc.line_starts(), 3), // 'b'
434            Position {
435                line: 1,
436                character: 0
437            }
438        );
439    }
440
441    #[test]
442    fn str_offset_finds_substr() {
443        let src = "<?php\nfunction foo() {}";
444        let name = &src[15..18]; // "foo"
445        assert_eq!(str_offset(src, name), 15);
446    }
447
448    #[test]
449    fn str_offset_content_fallback_for_different_allocation() {
450        // "foo" is a separately owned String (not a sub-slice of the source),
451        // so pointer arithmetic fails. The fallback finds it by content.
452        let owned = "foo".to_string();
453        assert_eq!(str_offset("<?php foo", &owned), 6);
454    }
455
456    #[test]
457    fn str_offset_unrelated_content_returns_zero() {
458        let owned = "bar".to_string();
459        assert_eq!(str_offset("<?php foo", &owned), 0);
460    }
461}