phpdoc_parser/
parser.rs

1//! Top-level PHPDoc comment parser.
2//!
3//! Parses `/** ... */` doc-block comments into a [`PhpDoc`] with summary,
4//! description, and generic tags. All spans are comment-relative
5//! (0 = start of `/**`).
6//!
7//! Tag bodies are exposed as raw [`PhpDocText`] — callers apply their own
8//! type parsers and validation rules to the body text.
9
10use crate::ast::{InlineTag, PhpDoc, PhpDocTag, PhpDocText, TextSegment};
11use crate::Span;
12
13// =============================================================================
14// Public entry point
15// =============================================================================
16
17/// Parse a raw doc-comment string into a [`PhpDoc`].
18///
19/// The input may include `/**` and `*/` delimiters or be already stripped.
20pub fn parse(text: &str) -> PhpDoc {
21    let span = Span::new(0, text.len() as u32);
22    let (inner, content_start) = strip_delimiters(text);
23    let lines = clean_lines(inner, content_start);
24    let (summary, description, tag_start) = extract_prose(&lines);
25    let tags = if tag_start < lines.len() {
26        parse_tags(&lines[tag_start..])
27    } else {
28        Vec::new()
29    };
30    PhpDoc {
31        summary,
32        description,
33        tags,
34        span,
35    }
36}
37
38// =============================================================================
39// Internal types
40// =============================================================================
41
/// One line of the comment body after indentation and the leading `*`
/// decoration have been removed.
struct CleanLine {
    /// Remaining text of the line (owned copy; may still start with spaces).
    text: String,
    /// Byte offset, within the original comment string, at which `text` begins.
    base_offset: u32,
}
47
48// =============================================================================
49// Delimiter stripping
50// =============================================================================
51
/// Remove the comment opener (`/**` or `/*`) and closer (`*/`) from `text`.
///
/// Returns the inner text together with the byte offset at which that inner
/// text starts inside `text` (3 after `/**`, 2 after `/*`, 0 when there is no
/// opener). A missing opener or closer is tolerated: whichever delimiter is
/// absent is simply not stripped.
fn strip_delimiters(text: &str) -> (&str, u32) {
    // In `/**/` the middle `*` belongs to the closer, not to a `/**` opener.
    // Treating it as an opener would leave a stray `/` as the comment content,
    // so handle the empty comment explicitly.
    if text == "/**/" {
        return (&text[2..2], 2);
    }

    let (rest, start) = if let Some(rest) = text.strip_prefix("/**") {
        (rest, 3u32)
    } else if let Some(rest) = text.strip_prefix("/*") {
        (rest, 2u32)
    } else {
        (text, 0u32)
    };

    (rest.strip_suffix("*/").unwrap_or(rest), start)
}
63
64// =============================================================================
65// Line cleaning
66// =============================================================================
67
68fn clean_lines(inner: &str, content_start: u32) -> Vec<CleanLine> {
69    let mut lines = Vec::new();
70    let mut offset_in_inner: u32 = 0;
71
72    for raw_line in inner.split('\n') {
73        let line_abs_start = content_start + offset_in_inner;
74
75        // Strip trailing \r so CRLF input doesn't leak into text content.
76        // Use raw_line.len() (including the \r) for offset tracking so that
77        // byte positions in the original string remain accurate.
78        let line = raw_line.strip_suffix('\r').unwrap_or(raw_line);
79        let bytes = line.as_bytes();
80
81        let mut stripped_bytes: u32 = 0;
82
83        let ws_count = bytes
84            .iter()
85            .take_while(|&&b| b == b' ' || b == b'\t')
86            .count();
87        stripped_bytes += ws_count as u32;
88        let after_ws = &line[ws_count..];
89
90        let (cleaned, extra_stripped) = if let Some(rest) = after_ws.strip_prefix("* ") {
91            (rest, 2u32)
92        } else if let Some(rest) = after_ws.strip_prefix('*') {
93            (rest, 1u32)
94        } else {
95            (after_ws, 0u32)
96        };
97        stripped_bytes += extra_stripped;
98
99        lines.push(CleanLine {
100            text: cleaned.to_owned(),
101            base_offset: line_abs_start + stripped_bytes,
102        });
103
104        offset_in_inner += raw_line.len() as u32 + 1;
105    }
106
107    lines
108}
109
110// =============================================================================
111// Prose extraction
112// =============================================================================
113
114fn extract_prose(lines: &[CleanLine]) -> (Option<PhpDocText>, Option<PhpDocText>, usize) {
115    let tag_start = lines
116        .iter()
117        .position(|l| l.text.trim_start().starts_with('@'))
118        .unwrap_or(lines.len());
119
120    let prose_lines = &lines[..tag_start];
121
122    let Some(start) = prose_lines.iter().position(|l| !l.text.trim().is_empty()) else {
123        return (None, None, tag_start);
124    };
125
126    let summary = {
127        let line = &prose_lines[start];
128        let trimmed = line.text.trim();
129        if trimmed.is_empty() {
130            None
131        } else {
132            let leading = (line.text.len() - line.text.trim_start().len()) as u32;
133            Some(text_from_str(trimmed, line.base_offset + leading))
134        }
135    };
136
137    let blank_after_summary = prose_lines[start..]
138        .iter()
139        .position(|l| l.text.trim().is_empty())
140        .map(|i| i + start);
141
142    let description = if let Some(blank) = blank_after_summary {
143        let desc_start = prose_lines[blank..]
144            .iter()
145            .position(|l| !l.text.trim().is_empty())
146            .map(|i| i + blank);
147
148        if let Some(ds) = desc_start {
149            let desc_end = prose_lines
150                .iter()
151                .rposition(|l| !l.text.trim().is_empty())
152                .map(|i| i + 1)
153                .unwrap_or(ds);
154
155            let slice: Vec<&CleanLine> = prose_lines[ds..desc_end].iter().collect();
156            description_to_text(&slice)
157        } else {
158            None
159        }
160    } else {
161        None
162    };
163
164    (summary, description, tag_start)
165}
166
167// =============================================================================
168// Tag parsing
169// =============================================================================
170
171fn parse_tags(lines: &[CleanLine]) -> Vec<PhpDocTag> {
172    let mut tags = Vec::new();
173    let mut i = 0;
174
175    while i < lines.len() {
176        let line_text = lines[i].text.trim_start();
177        if !line_text.starts_with('@') {
178            i += 1;
179            continue;
180        }
181
182        let tag_start_offset = lines[i].base_offset;
183
184        let mut tag_lines: Vec<&CleanLine> = vec![&lines[i]];
185        i += 1;
186        while i < lines.len() && !lines[i].text.trim_start().starts_with('@') {
187            tag_lines.push(&lines[i]);
188            i += 1;
189        }
190
191        let last = tag_lines.last().unwrap();
192        let tag_end_offset = last.base_offset + last.text.len() as u32;
193        let tag_span = Span::new(tag_start_offset, tag_end_offset);
194
195        let first = tag_lines[0]
196            .text
197            .trim_start()
198            .strip_prefix('@')
199            .unwrap_or("");
200
201        let (tag_name, body_on_first) = match first.find(|c: char| c.is_whitespace()) {
202            Some(pos) => {
203                let body = first[pos..].trim();
204                (
205                    &first[..pos],
206                    if body.is_empty() { None } else { Some(body) },
207                )
208            }
209            None => (first, None),
210        };
211
212        let body_base_offset = {
213            let after_at = &tag_lines[0].text.trim_start()[1 + tag_name.len()..];
214            let ws = (after_at.len() - after_at.trim_start().len()) as u32;
215            tag_lines[0].base_offset + 1 + tag_name.len() as u32 + ws
216        };
217
218        let first_piece = body_on_first.map(|t| (t, body_base_offset));
219        let body = tag_body_to_text(first_piece, &tag_lines[1..]);
220
221        tags.push(PhpDocTag {
222            name: tag_name.to_owned(),
223            body,
224            span: tag_span,
225        });
226    }
227
228    tags
229}
230
231// =============================================================================
232// Text builders
233// =============================================================================
234
/// How a [`PhpDocTextBuilder`] treats source lines that are empty after
/// trimming.
#[derive(Clone, Copy)]
enum BlankLinePolicy {
    /// Blank lines are skipped and leave no trace in the output.
    /// Chosen for tag bodies, which form one logical value without paragraphs.
    Insignificant,
    /// Blank lines are kept as empty lines: each one contributes its own `\n`,
    /// so a single blank line between two text lines yields `\n\n`.
    /// Chosen for descriptions.
    ParagraphBreak,
}
245
246/// Accumulates trimmed source lines into a single [`PhpDocText`].
247///
248/// Each non-blank line is trimmed, scanned for `{@inline}` tags at its own true
249/// source offset (so spans stay accurate across lines), and appended after a `\n`
250/// separator. The running [`Span`] covers the first through last non-blank line.
251/// Blank-line handling is governed by [`BlankLinePolicy`].
252struct PhpDocTextBuilder {
253    segments: Vec<TextSegment>,
254    span_start: Option<u32>,
255    span_end: u32,
256    blank_policy: BlankLinePolicy,
257    /// Whether any line has been pushed yet — controls the leading separator so
258    /// the first line never gets a `\n` in front of it.
259    started: bool,
260}
261
262impl PhpDocTextBuilder {
263    fn new(blank_policy: BlankLinePolicy) -> Self {
264        Self {
265            segments: Vec::new(),
266            span_start: None,
267            span_end: 0,
268            blank_policy,
269            started: false,
270        }
271    }
272
273    /// Push one source line. `text` is the raw (untrimmed) line and `base` is the
274    /// source offset of its first byte.
275    fn push_line(&mut self, text: &str, base: u32) {
276        let trimmed = text.trim();
277
278        if trimmed.is_empty() {
279            if let BlankLinePolicy::ParagraphBreak = self.blank_policy {
280                if self.started {
281                    push_text(&mut self.segments, "\n");
282                }
283                self.started = true;
284            }
285            return;
286        }
287
288        if self.started {
289            push_text(&mut self.segments, "\n");
290        }
291        self.started = true;
292
293        let leading = (text.len() - text.trim_start().len()) as u32;
294        let content_offset = base + leading;
295        if self.span_start.is_none() {
296            self.span_start = Some(content_offset);
297        }
298        self.span_end = content_offset + trimmed.len() as u32;
299        merge_into(
300            &mut self.segments,
301            text_from_str(trimmed, content_offset).segments,
302        );
303    }
304
305    fn build(self) -> Option<PhpDocText> {
306        self.span_start.map(|start| PhpDocText {
307            segments: self.segments,
308            span: Span::new(start, self.span_end),
309        })
310    }
311}
312
313/// Build a [`PhpDocText`] for a `@tag` body.
314///
315/// `first_piece` is `Some((text, base_offset))` for the text on the `@tag` line
316/// itself (after the tag name); continuation lines follow. Lines are joined with
317/// a newline, preserving line boundaries so callers can separate a type
318/// expression from its description (e.g. `@var T\nThe description`).
319fn tag_body_to_text(
320    first_piece: Option<(&str, u32)>,
321    continuation: &[&CleanLine],
322) -> Option<PhpDocText> {
323    let mut builder = PhpDocTextBuilder::new(BlankLinePolicy::Insignificant);
324    if let Some((text, base)) = first_piece {
325        builder.push_line(text, base);
326    }
327    for line in continuation {
328        builder.push_line(&line.text, line.base_offset);
329    }
330    builder.build()
331}
332
333/// Build a [`PhpDocText`] for a description (multi-line prose after the summary).
334///
335/// Lines are joined with `\n`; blank lines produce `\n\n` paragraph breaks.
336fn description_to_text(lines: &[&CleanLine]) -> Option<PhpDocText> {
337    let mut builder = PhpDocTextBuilder::new(BlankLinePolicy::ParagraphBreak);
338    for line in lines {
339        builder.push_line(&line.text, line.base_offset);
340    }
341    builder.build()
342}
343
344/// Append `text` to the last `Text` segment, or push a new one.
345fn push_text(segments: &mut Vec<TextSegment>, text: &str) {
346    if text.is_empty() {
347        return;
348    }
349    if let Some(TextSegment::Text(last)) = segments.last_mut() {
350        last.push_str(text);
351    } else {
352        segments.push(TextSegment::Text(text.to_owned()));
353    }
354}
355
356/// Extend `dest` with `src`, merging adjacent `Text` segments at the boundary.
357fn merge_into(dest: &mut Vec<TextSegment>, src: Vec<TextSegment>) {
358    for seg in src {
359        match seg {
360            TextSegment::Text(t) => push_text(dest, &t),
361            other => dest.push(other),
362        }
363    }
364}
365
366// =============================================================================
367// Inline-tag scanning
368// =============================================================================
369
370/// Build a [`PhpDocText`] from a string, scanning for `{@tagname body}` inline tags.
371fn text_from_str(s: &str, base_offset: u32) -> PhpDocText {
372    let mut segments = Vec::new();
373    let bytes = s.as_bytes();
374    let mut i = 0;
375    let mut text_start = 0;
376
377    while i < bytes.len() {
378        if bytes[i] == b'{' && bytes.get(i + 1) == Some(&b'@') {
379            if i > text_start {
380                segments.push(TextSegment::Text(s[text_start..i].to_owned()));
381            }
382
383            let tag_abs_start = i;
384            i += 2; // skip `{@`
385
386            let name_start = i;
387            while i < bytes.len() && !bytes[i].is_ascii_whitespace() && bytes[i] != b'}' {
388                i += 1;
389            }
390            let name = s[name_start..i].to_owned();
391
392            while i < bytes.len() && bytes[i].is_ascii_whitespace() {
393                i += 1;
394            }
395
396            let body_start = i;
397            let mut depth = 1i32;
398            while i < bytes.len() {
399                match bytes[i] {
400                    b'{' => {
401                        depth += 1;
402                        i += 1;
403                    }
404                    b'}' if depth == 1 => break,
405                    b'}' => {
406                        depth -= 1;
407                        i += 1;
408                    }
409                    _ => {
410                        i += 1;
411                    }
412                }
413            }
414
415            let body_raw = s[body_start..i].trim();
416            let body = if body_raw.is_empty() {
417                None
418            } else {
419                Some(body_raw.to_owned())
420            };
421
422            if i < bytes.len() {
423                i += 1; // consume `}`
424            }
425
426            segments.push(TextSegment::InlineTag(InlineTag {
427                name,
428                body,
429                span: Span::new(base_offset + tag_abs_start as u32, base_offset + i as u32),
430            }));
431
432            text_start = i;
433        } else {
434            i += 1;
435        }
436    }
437
438    if text_start < s.len() {
439        segments.push(TextSegment::Text(s[text_start..].to_owned()));
440    }
441
442    PhpDocText {
443        segments,
444        span: Span::new(base_offset, base_offset + s.len() as u32),
445    }
446}
phpdoc_parser/parser.rs

phpdoc_parser/
parser.rs