// php_lsp/ast.rs

//! Core AST infrastructure: arena-backed `ParsedDoc`, span utilities, and TypeHint formatting.
use php_ast::{Program, Span, TypeHint, TypeHintKind};
use tower_lsp::lsp_types::{Position, Range};

// ── ParsedDoc ─────────────────────────────────────────────────────────────────

7/// Owns a parsed PHP document: the bumpalo arena, source snapshot, and Program.
8///
9/// SAFETY invariants:
10/// - `program` is dropped before `_arena` and `_source` (field declaration order).
11/// - Both `_arena` and `_source` are `Box`-allocated; their heap addresses are
12///   stable and never move.
13/// - The `'static` lifetimes in `Box<Program<'static, 'static>>` are erased
14///   versions of the true lifetimes `'_arena` and `'_source`. The public
15///   `program()` accessor re-attaches them to `&self`, preventing any reference
16///   from escaping beyond the lifetime of the `ParsedDoc`.
17pub struct ParsedDoc {
18    // Drop order is declaration order in Rust — program drops first.
19    program: Box<Program<'static, 'static>>,
20    pub errors: Vec<php_rs_parser::diagnostics::ParseError>,
21    _arena: Box<bumpalo::Bump>,
22    #[allow(clippy::box_collection)]
23    _source: Box<String>,
24}
25
26// SAFETY: Program nodes contain only data; no thread-local state.
27unsafe impl Send for ParsedDoc {}
28unsafe impl Sync for ParsedDoc {}
30impl ParsedDoc {
31    pub fn parse(source: String) -> Self {
32        let source_box = Box::new(source);
33        let arena_box = Box::new(bumpalo::Bump::new());
34
35        // SAFETY: Both boxes are on the heap; moving a Box<T> moves the pointer,
36        // not the heap data. These references therefore remain valid for as long
37        // as the boxes (and hence `self`) are alive.
38        let src_ref: &'static str =
39            unsafe { std::mem::transmute::<&str, &'static str>(source_box.as_str()) };
40        let arena_ref: &'static bumpalo::Bump = unsafe {
41            std::mem::transmute::<&bumpalo::Bump, &'static bumpalo::Bump>(arena_box.as_ref())
42        };
43
44        let result = php_rs_parser::parse(arena_ref, src_ref);
45
46        ParsedDoc {
47            program: Box::new(result.program),
48            errors: result.errors,
49            _arena: arena_box,
50            _source: source_box,
51        }
52    }
53
54    /// Borrow the program with lifetimes bounded by `&self`.
55    ///
56    /// SAFETY: covariance of `Program<'arena, 'src>` in both parameters lets
57    /// `&Program<'static, 'static>` shorten to `&Program<'_, '_>`.
58    #[inline]
59    pub fn program(&self) -> &Program<'_, '_> {
60        &self.program
61    }
62
63    /// Borrow the source text used when parsing.
64    #[inline]
65    pub fn source(&self) -> &str {
66        &self._source
67    }
68}
70impl Default for ParsedDoc {
71    fn default() -> Self {
72        ParsedDoc::parse(String::new())
73    }
74}

// ── Span / position utilities ─────────────────────────────────────────────────

78/// Convert a byte offset into `source` to an LSP `Position` (0-based line/char).
79///
80/// Handles both LF-only and CRLF line endings. When the offset lands on or
81/// after a `\r` that immediately precedes `\n`, the `\r` is not counted as a
82/// column so that positions are consistent regardless of line-ending style.
83pub fn offset_to_position(source: &str, offset: u32) -> Position {
84    let offset = (offset as usize).min(source.len());
85    let prefix = &source[..offset];
86    let line = prefix.bytes().filter(|&b| b == b'\n').count() as u32;
87    let last_nl = prefix.rfind('\n').map(|i| i + 1).unwrap_or(0);
88    // Strip a trailing \r so CRLF line endings don't inflate the column count.
89    let line_segment = prefix[last_nl..]
90        .strip_suffix('\r')
91        .unwrap_or(&prefix[last_nl..]);
92    let character = line_segment
93        .chars()
94        .map(|c| c.len_utf16() as u32)
95        .sum::<u32>();
96    Position { line, character }
97}
99/// Convert a `Span` (byte-offset pair) to an LSP `Range`.
100pub fn span_to_range(source: &str, span: Span) -> Range {
101    Range {
102        start: offset_to_position(source, span.start),
103        end: offset_to_position(source, span.end),
104    }
105}
/// Return the byte offset of `substr` within `source`.
///
/// Uses pointer arithmetic when `substr` is a true sub-slice of `source`
/// (i.e. arena-allocated names pointing into the same backing string), which
/// identifies the exact occurrence even when the same text appears earlier.
/// Falls back to a content search when the pointers differ — this handles
/// tests and callers that pass a differently-allocated copy of the source.
///
/// Returns `0` when `substr` is neither a sub-slice of `source` nor found in
/// it by content, so a result of `0` does not by itself prove a match.
pub fn str_offset(source: &str, substr: &str) -> u32 {
    let src_start = source.as_ptr() as usize;
    let sub_start = substr.as_ptr() as usize;

    let offset = match sub_start.checked_sub(src_start) {
        // `substr` starts at or after `source` and ends within it: the
        // pointer distance is the offset. Written as a subtraction so the
        // bounds check itself cannot overflow.
        Some(delta) if delta <= source.len() && substr.len() <= source.len() - delta => delta,
        // Fallback: locate by content (same text, different allocation).
        _ => source.find(substr).unwrap_or(0),
    };
    offset as u32
}
123/// Build an LSP `Range` for a name that is a sub-slice of `source`.
124pub fn name_range(source: &str, name: &str) -> Range {
125    let start = str_offset(source, name);
126    Range {
127        start: offset_to_position(source, start),
128        end: offset_to_position(source, start + name.len() as u32),
129    }
130}

// ── TypeHint formatting ────────────────────────────────────────────────────────

134/// Format a `TypeHint` as a PHP type string, e.g. `?int`, `string|null`.
135pub fn format_type_hint(hint: &TypeHint<'_, '_>) -> String {
136    fmt_kind(&hint.kind)
137}
138
139fn fmt_kind(kind: &TypeHintKind<'_, '_>) -> String {
140    match kind {
141        TypeHintKind::Named(name) => name.to_string_repr().to_string(),
142        TypeHintKind::Keyword(builtin, _) => builtin.as_str().to_string(),
143        TypeHintKind::Nullable(inner) => format!("?{}", format_type_hint(inner)),
144        TypeHintKind::Union(types) => types
145            .iter()
146            .map(format_type_hint)
147            .collect::<Vec<_>>()
148            .join("|"),
149        TypeHintKind::Intersection(types) => types
150            .iter()
151            .map(format_type_hint)
152            .collect::<Vec<_>>()
153            .join("&"),
154    }
155}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_empty_source() {
        let doc = ParsedDoc::parse("<?php".to_string());
        assert!(doc.errors.is_empty());
        assert!(doc.program().stmts.is_empty());
    }

    #[test]
    fn parses_function() {
        let doc = ParsedDoc::parse("<?php\nfunction foo() {}".to_string());
        assert_eq!(doc.program().stmts.len(), 1);
    }

    #[test]
    fn offset_to_position_first_line() {
        assert_eq!(
            offset_to_position("<?php\nfoo", 0),
            Position {
                line: 0,
                character: 0
            }
        );
    }

    #[test]
    fn offset_to_position_second_line() {
        // "<?php\n" — offset 6 is start of line 1
        assert_eq!(
            offset_to_position("<?php\nfoo", 6),
            Position {
                line: 1,
                character: 0
            }
        );
    }

    #[test]
    fn offset_to_position_multibyte_utf16() {
        // "😀" is U+1F600: 4 UTF-8 bytes, 2 UTF-16 code units.
        // source: "a😀b" — byte offsets: a=0, 😀=1..5, b=5
        // UTF-16 columns:        a=0, 😀=1..3, b=3
        let src = "a\u{1F600}b";
        assert_eq!(
            offset_to_position(src, 5), // byte offset of 'b'
            Position {
                line: 0,
                character: 3 // UTF-16 col 3
            }
        );
    }

    #[test]
    fn offset_to_position_crlf_start_of_line() {
        // CRLF: offset pointing to first char of line 1 must give character=0.
        // "foo\r\nbar": f=0 o=1 o=2 \r=3 \n=4 b=5 a=6 r=7
        let src = "foo\r\nbar";
        assert_eq!(
            offset_to_position(src, 5), // 'b'
            Position {
                line: 1,
                character: 0
            }
        );
    }

    #[test]
    fn offset_to_position_crlf_does_not_count_cr_in_column() {
        // Offset pointing to the \r itself must not count it as a column.
        // "foo\r\nbar": the \r is at offset 3, column must be 3 (length of "foo").
        let src = "foo\r\nbar";
        assert_eq!(
            offset_to_position(src, 3), // '\r'
            Position {
                line: 0,
                character: 3
            }
        );
    }

    #[test]
    fn offset_to_position_crlf_multiline() {
        // Multiple CRLF lines: columns must not accumulate stray \r counts.
        // "a\r\nb\r\nc": a=0 \r=1 \n=2 b=3 \r=4 \n=5 c=6
        let src = "a\r\nb\r\nc";
        assert_eq!(
            offset_to_position(src, 6), // 'c'
            Position {
                line: 2,
                character: 0
            }
        );
        assert_eq!(
            offset_to_position(src, 3), // 'b'
            Position {
                line: 1,
                character: 0
            }
        );
    }

    #[test]
    fn str_offset_finds_substr() {
        let src = "<?php\nfunction foo() {}";
        let name = &src[15..18]; // "foo"
        assert_eq!(str_offset(src, name), 15);
    }

    #[test]
    fn str_offset_content_fallback_for_different_allocation() {
        // "foo" is a separately owned String (not a sub-slice of the source),
        // so pointer arithmetic fails. The fallback finds it by content.
        let owned = "foo".to_string();
        assert_eq!(str_offset("<?php foo", &owned), 6);
    }

    #[test]
    fn str_offset_unrelated_content_returns_zero() {
        let owned = "bar".to_string();
        assert_eq!(str_offset("<?php foo", &owned), 0);
    }
}