phpdoc_parser/
parser.rs

1//! Top-level PHPDoc comment parser.
2//!
3//! Parses `/** ... */` doc-block comments into a [`PhpDoc`] with summary,
4//! description, and generic tags. All spans are comment-relative
5//! (0 = start of `/**`).
6//!
7//! Tag bodies are exposed as raw [`PhpDocText`] — callers apply their own
8//! type parsers and validation rules to the body text.
9
10use crate::ast::{InlineTag, PhpDoc, PhpDocTag, PhpDocText, TextSegment};
11use crate::Span;
12
13// =============================================================================
14// Public entry point
15// =============================================================================
16
17/// Parse a raw doc-comment string into a [`PhpDoc`].
18///
19/// The input may include `/**` and `*/` delimiters or be already stripped.
20pub fn parse(text: &str) -> PhpDoc {
21    let span = Span::new(0, text.len() as u32);
22    let (inner, content_start) = strip_delimiters(text);
23    let lines = clean_lines(inner, content_start);
24    let (summary, description, tag_start) = extract_prose(&lines);
25    let tags = if tag_start < lines.len() {
26        parse_tags(&lines[tag_start..])
27    } else {
28        Vec::new()
29    };
30    PhpDoc {
31        summary,
32        description,
33        tags,
34        span,
35    }
36}
37
38// =============================================================================
39// Internal types
40// =============================================================================
41
/// One line of the comment body after its leading indentation and optional
/// `*` gutter have been removed.
///
/// The derives exist so intermediate parser state can be inspected with
/// `dbg!`/`{:?}` and compared in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
struct CleanLine {
    /// Line content without the stripped prefix (and without a trailing `\r`).
    text: String,
    /// Byte offset of `text[0]` within the original comment string.
    base_offset: u32,
}
47
48// =============================================================================
49// Delimiter stripping
50// =============================================================================
51
/// Remove the comment delimiters from `text`, if present.
///
/// Returns the text between the delimiters together with the byte offset in
/// `text` at which that inner text begins: 3 after `/**`, 2 after `/*`, and 0
/// when there is no opening delimiter. A trailing `*/` is dropped when present.
///
/// The degenerate comment `/**/` is treated as an empty `/* */` block: its
/// second `*` belongs to the closing delimiter, so the result is `("", 2)`
/// rather than a stray `"/"` at offset 3.
fn strip_delimiters(text: &str) -> (&str, u32) {
    let (rest, start) = match text.strip_prefix("/**") {
        // `/**/`: the opener and closer overlap on the middle `*`.
        Some("/") => return ("", 2),
        Some(rest) => (rest, 3u32),
        None => match text.strip_prefix("/*") {
            Some(rest) => (rest, 2u32),
            None => (text, 0u32),
        },
    };

    (rest.strip_suffix("*/").unwrap_or(rest), start)
}
63
64// =============================================================================
65// Line cleaning
66// =============================================================================
67
68fn clean_lines(inner: &str, content_start: u32) -> Vec<CleanLine> {
69    let mut lines = Vec::new();
70    let mut offset_in_inner: u32 = 0;
71
72    for raw_line in inner.split('\n') {
73        let line_abs_start = content_start + offset_in_inner;
74
75        // Strip trailing \r so CRLF input doesn't leak into text content.
76        // Use raw_line.len() (including the \r) for offset tracking so that
77        // byte positions in the original string remain accurate.
78        let line = raw_line.strip_suffix('\r').unwrap_or(raw_line);
79        let bytes = line.as_bytes();
80
81        let mut stripped_bytes: u32 = 0;
82
83        let ws_count = bytes
84            .iter()
85            .take_while(|&&b| b == b' ' || b == b'\t')
86            .count();
87        stripped_bytes += ws_count as u32;
88        let after_ws = &line[ws_count..];
89
90        let (cleaned, extra_stripped) = if let Some(rest) = after_ws.strip_prefix("* ") {
91            (rest, 2u32)
92        } else if let Some(rest) = after_ws.strip_prefix('*') {
93            (rest, 1u32)
94        } else {
95            (after_ws, 0u32)
96        };
97        stripped_bytes += extra_stripped;
98
99        lines.push(CleanLine {
100            text: cleaned.to_owned(),
101            base_offset: line_abs_start + stripped_bytes,
102        });
103
104        offset_in_inner += raw_line.len() as u32 + 1;
105    }
106
107    lines
108}
109
110// =============================================================================
111// Prose extraction
112// =============================================================================
113
114fn extract_prose(lines: &[CleanLine]) -> (Option<PhpDocText>, Option<PhpDocText>, usize) {
115    let tag_start = lines
116        .iter()
117        .position(|l| l.text.trim_start().starts_with('@'))
118        .unwrap_or(lines.len());
119
120    let prose_lines = &lines[..tag_start];
121
122    let Some(start) = prose_lines.iter().position(|l| !l.text.trim().is_empty()) else {
123        return (None, None, tag_start);
124    };
125
126    let summary = {
127        let line = &prose_lines[start];
128        let trimmed = line.text.trim();
129        if trimmed.is_empty() {
130            None
131        } else {
132            let leading = (line.text.len() - line.text.trim_start().len()) as u32;
133            Some(text_from_str(trimmed, line.base_offset + leading))
134        }
135    };
136
137    let blank_after_summary = prose_lines[start..]
138        .iter()
139        .position(|l| l.text.trim().is_empty())
140        .map(|i| i + start);
141
142    let description = if let Some(blank) = blank_after_summary {
143        let desc_start = prose_lines[blank..]
144            .iter()
145            .position(|l| !l.text.trim().is_empty())
146            .map(|i| i + blank);
147
148        if let Some(ds) = desc_start {
149            let desc_end = prose_lines
150                .iter()
151                .rposition(|l| !l.text.trim().is_empty())
152                .map(|i| i + 1)
153                .unwrap_or(ds);
154
155            let slice: Vec<&CleanLine> = prose_lines[ds..desc_end].iter().collect();
156            description_to_text(&slice)
157        } else {
158            None
159        }
160    } else {
161        None
162    };
163
164    (summary, description, tag_start)
165}
166
167// =============================================================================
168// Tag parsing
169// =============================================================================
170
171fn parse_tags(lines: &[CleanLine]) -> Vec<PhpDocTag> {
172    let mut tags = Vec::new();
173    let mut i = 0;
174
175    while i < lines.len() {
176        let line_text = lines[i].text.trim_start();
177        if !line_text.starts_with('@') {
178            i += 1;
179            continue;
180        }
181
182        let tag_start_offset = lines[i].base_offset;
183
184        let mut tag_lines: Vec<&CleanLine> = vec![&lines[i]];
185        i += 1;
186        while i < lines.len() && !lines[i].text.trim_start().starts_with('@') {
187            tag_lines.push(&lines[i]);
188            i += 1;
189        }
190
191        let last = tag_lines.last().unwrap();
192        let tag_end_offset = last.base_offset + last.text.len() as u32;
193        let tag_span = Span::new(tag_start_offset, tag_end_offset);
194
195        let first = tag_lines[0]
196            .text
197            .trim_start()
198            .strip_prefix('@')
199            .unwrap_or("");
200
201        let (tag_name, body_on_first) = match first.find(|c: char| c.is_whitespace()) {
202            Some(pos) => {
203                let body = first[pos..].trim();
204                (
205                    &first[..pos],
206                    if body.is_empty() { None } else { Some(body) },
207                )
208            }
209            None => (first, None),
210        };
211
212        let body_base_offset = {
213            let after_at = &tag_lines[0].text.trim_start()[1 + tag_name.len()..];
214            let ws = (after_at.len() - after_at.trim_start().len()) as u32;
215            tag_lines[0].base_offset + 1 + tag_name.len() as u32 + ws
216        };
217
218        let first_piece = body_on_first.map(|t| (t, body_base_offset));
219        let body = tag_body_to_text(first_piece, &tag_lines[1..]);
220
221        tags.push(PhpDocTag {
222            name: tag_name.to_owned(),
223            body,
224            span: tag_span,
225        });
226    }
227
228    tags
229}
230
231// =============================================================================
232// Text builders
233// =============================================================================
234
235/// Build a [`PhpDocText`] for a `@tag` body.
236///
237/// `first_piece` is `Some((text, base_offset))` for the text on the `@tag` line
238/// itself (after the tag name). Continuation lines follow. Pieces are joined with
239/// a single space; each piece is scanned for inline tags at its own true offset,
240/// so spans are accurate even on multi-line tag bodies.
241fn tag_body_to_text(
242    first_piece: Option<(&str, u32)>,
243    continuation: &[&CleanLine],
244) -> Option<PhpDocText> {
245    let mut segments: Vec<TextSegment> = Vec::new();
246    let mut span_start: Option<u32> = None;
247    let mut span_end: u32 = 0;
248
249    if let Some((text, base)) = first_piece {
250        let trimmed = text.trim();
251        if !trimmed.is_empty() {
252            let leading = (text.len() - text.trim_start().len()) as u32;
253            let real_base = base + leading;
254            span_start = Some(real_base);
255            span_end = real_base + trimmed.len() as u32;
256            merge_into(&mut segments, text_from_str(trimmed, real_base).segments);
257        }
258    }
259
260    for line in continuation {
261        let trimmed = line.text.trim();
262        if trimmed.is_empty() {
263            continue;
264        }
265        let leading = (line.text.len() - line.text.trim_start().len()) as u32;
266        let real_base = line.base_offset + leading;
267
268        if span_start.is_none() {
269            span_start = Some(real_base);
270        }
271        span_end = real_base + trimmed.len() as u32;
272
273        if !segments.is_empty() {
274            push_text(&mut segments, " ");
275        }
276        merge_into(&mut segments, text_from_str(trimmed, real_base).segments);
277    }
278
279    span_start.map(|start| PhpDocText {
280        segments,
281        span: Span::new(start, span_end),
282    })
283}
284
285/// Build a [`PhpDocText`] for a description (multi-line prose after the summary).
286///
287/// Lines are joined with `\n`; blank lines produce `\n\n` paragraph breaks.
288/// Each non-blank line is scanned for inline tags at its own true offset.
289fn description_to_text(lines: &[&CleanLine]) -> Option<PhpDocText> {
290    let mut segments: Vec<TextSegment> = Vec::new();
291    let mut span_start: Option<u32> = None;
292    let mut span_end: u32 = 0;
293
294    for (i, line) in lines.iter().enumerate() {
295        let trimmed = line.text.trim();
296
297        if i > 0 {
298            push_text(&mut segments, "\n");
299        }
300
301        if trimmed.is_empty() {
302            continue;
303        }
304
305        let leading = (line.text.len() - line.text.trim_start().len()) as u32;
306        let real_base = line.base_offset + leading;
307
308        if span_start.is_none() {
309            span_start = Some(real_base);
310        }
311        span_end = real_base + trimmed.len() as u32;
312
313        merge_into(&mut segments, text_from_str(trimmed, real_base).segments);
314    }
315
316    span_start.map(|start| PhpDocText {
317        segments,
318        span: Span::new(start, span_end),
319    })
320}
321
322/// Append `text` to the last `Text` segment, or push a new one.
323fn push_text(segments: &mut Vec<TextSegment>, text: &str) {
324    if text.is_empty() {
325        return;
326    }
327    if let Some(TextSegment::Text(last)) = segments.last_mut() {
328        last.push_str(text);
329    } else {
330        segments.push(TextSegment::Text(text.to_owned()));
331    }
332}
333
334/// Extend `dest` with `src`, merging adjacent `Text` segments at the boundary.
335fn merge_into(dest: &mut Vec<TextSegment>, src: Vec<TextSegment>) {
336    for seg in src {
337        match seg {
338            TextSegment::Text(t) => push_text(dest, &t),
339            other => dest.push(other),
340        }
341    }
342}
343
344// =============================================================================
345// Inline-tag scanning
346// =============================================================================
347
348/// Build a [`PhpDocText`] from a string, scanning for `{@tagname body}` inline tags.
349fn text_from_str(s: &str, base_offset: u32) -> PhpDocText {
350    let mut segments = Vec::new();
351    let bytes = s.as_bytes();
352    let mut i = 0;
353    let mut text_start = 0;
354
355    while i < bytes.len() {
356        if bytes[i] == b'{' && bytes.get(i + 1) == Some(&b'@') {
357            if i > text_start {
358                segments.push(TextSegment::Text(s[text_start..i].to_owned()));
359            }
360
361            let tag_abs_start = i;
362            i += 2; // skip `{@`
363
364            let name_start = i;
365            while i < bytes.len() && !bytes[i].is_ascii_whitespace() && bytes[i] != b'}' {
366                i += 1;
367            }
368            let name = s[name_start..i].to_owned();
369
370            while i < bytes.len() && bytes[i].is_ascii_whitespace() {
371                i += 1;
372            }
373
374            let body_start = i;
375            let mut depth = 1i32;
376            while i < bytes.len() {
377                match bytes[i] {
378                    b'{' => {
379                        depth += 1;
380                        i += 1;
381                    }
382                    b'}' if depth == 1 => break,
383                    b'}' => {
384                        depth -= 1;
385                        i += 1;
386                    }
387                    _ => {
388                        i += 1;
389                    }
390                }
391            }
392
393            let body_raw = s[body_start..i].trim();
394            let body = if body_raw.is_empty() {
395                None
396            } else {
397                Some(body_raw.to_owned())
398            };
399
400            if i < bytes.len() {
401                i += 1; // consume `}`
402            }
403
404            segments.push(TextSegment::InlineTag(InlineTag {
405                name,
406                body,
407                span: Span::new(base_offset + tag_abs_start as u32, base_offset + i as u32),
408            }));
409
410            text_start = i;
411        } else {
412            i += 1;
413        }
414    }
415
416    if text_start < s.len() {
417        segments.push(TextSegment::Text(s[text_start..].to_owned()));
418    }
419
420    PhpDocText {
421        segments,
422        span: Span::new(base_offset, base_offset + s.len() as u32),
423    }
424}
phpdoc_parser/parser.rs

phpdoc_parser/
parser.rs