Skip to main content

perl_parser_core/syntax/
heredoc.rs

1//! Heredoc collector and processor for Perl.
2//!
3//! This module handles the logic of collecting heredoc content from source code,
4//! dealing with indentation stripping (`<<~`), and line termination.
5
6use perl_position_tracking::ByteSpan;
7use std::collections::VecDeque;
8use std::sync::Arc;
9
10pub use perl_position_tracking::ByteSpan as Span;
11
/// Quoting style used in a heredoc declaration.
///
/// The style is fixed by how the terminator label is written after `<<` and
/// tells consumers how the collected body should be treated. The type is a
/// plain fieldless enum, so it is cheap to copy, compare, and use as a map key.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum QuoteKind {
    /// Bare identifier (e.g., `<<EOF`), interpolates like double-quoted.
    Unquoted,
    /// Single-quoted (e.g., `<<'EOF'`), no interpolation.
    Single,
    /// Double-quoted (e.g., `<<"EOF"`), interpolates variables and escapes.
    Double,
    /// Backtick-quoted (e.g., `` <<`EOF` ``), command execution.
    Backtick,
}
24
25/// Declaration info captured at parse time.
26#[derive(Debug, Clone)]
27pub struct PendingHeredoc {
28    /// Exact terminator token that ends this heredoc.
29    pub label: Arc<str>,
30    /// True for indented heredocs (`<<~`), allows leading whitespace before terminator.
31    pub allow_indent: bool,
32    /// Quoting style determining interpolation behavior.
33    pub quote: QuoteKind,
34    /// Source span of the heredoc declaration (e.g., `<<EOF`).
35    pub decl_span: ByteSpan,
36    // Optional: add your node id here if convenient for AST attachment.
37    // pub node_id: NodeId,
38}
39
40/// Collected content. Each segment is a line after indent stripping (no CR/LF).
41#[derive(Debug)]
42pub struct HeredocContent {
43    /// Individual line spans after indent stripping, excluding line terminators.
44    pub segments: Vec<ByteSpan>,
45    /// Span from start of first segment to end of last segment (empty span if no content).
46    pub full_span: ByteSpan,
47    /// Whether the heredoc was correctly terminated by its label.
48    pub terminated: bool,
49}
50
51/// Result of collecting one or more heredocs from source.
52#[derive(Debug)]
53pub struct CollectionResult {
54    /// Collected heredoc contents in FIFO order, aligned to pending declarations.
55    pub contents: Vec<HeredocContent>,
56    /// Whether each heredoc terminator was found (aligned to `contents`).
57    pub terminators_found: Vec<bool>,
58    /// Byte offset immediately after the final terminator newline.
59    pub next_offset: usize,
60}
61
62/// Collects all pending heredocs from source starting at the given offset.
63///
64/// Processes heredocs in FIFO order, returning their contents and the byte offset
65/// after the final terminator.
66pub fn collect_all(
67    src: &[u8],
68    mut offset: usize,
69    mut pending: VecDeque<PendingHeredoc>,
70) -> CollectionResult {
71    let mut results = Vec::with_capacity(pending.len());
72    let mut terminators_found = Vec::with_capacity(pending.len());
73    while let Some(hd) = pending.pop_front() {
74        let (content, off2, found) = collect_one(src, offset, &hd);
75        results.push(content);
76        terminators_found.push(found);
77        offset = off2;
78    }
79    CollectionResult { contents: results, terminators_found, next_offset: offset }
80}
81
82/// Reads content lines until `label` matches after optional leading whitespace.
83/// For `<<~`, capture the terminator's leading whitespace as the indent baseline
84/// and strip the longest common BYTE prefix on each content line.
85/// CRLF is normalized **only** for terminator comparison; content spans exclude
86/// CR and LF bytes by construction.
87fn collect_one(src: &[u8], mut off: usize, hd: &PendingHeredoc) -> (HeredocContent, usize, bool) {
88    #[derive(Debug)]
89    struct Line {
90        start: usize,
91        end_no_eol: usize,
92    } // [start, end_no_eol)
93
94    let mut raw_lines: Vec<Line> = Vec::new();
95    let mut baseline_indent: Vec<u8> = Vec::new();
96    let mut after_terminator_off = off;
97    let mut found = false;
98
99    // Note: Use < not <= to avoid infinite loop at EOF (next_line_bounds returns same offset at EOF)
100    while off < src.len() {
101        let (ls, le, next) = next_line_bounds(src, off);
102        let line = &src[ls..le];
103
104        // For terminator: ignore leading spaces/tabs; ignore trailing CR.
105        let (lead_ws, rest) = split_leading_ws(line);
106        let rest_no_cr = strip_trailing_cr(rest);
107
108        if rest_no_cr == hd.label.as_bytes() {
109            if hd.allow_indent {
110                baseline_indent.clear();
111                baseline_indent.extend_from_slice(&line[..lead_ws]);
112            } else {
113                baseline_indent.clear();
114            }
115            after_terminator_off = next;
116            found = true;
117            break;
118        }
119
120        raw_lines.push(Line { start: ls, end_no_eol: le });
121        off = next;
122    }
123
124    let segments: Vec<ByteSpan> = raw_lines
125        .iter()
126        .map(|ln| {
127            if baseline_indent.is_empty() {
128                ByteSpan { start: ln.start, end: ln.end_no_eol }
129            } else {
130                let bytes = &src[ln.start..ln.end_no_eol];
131                let strip = common_prefix_len(bytes, &baseline_indent);
132                ByteSpan { start: ln.start + strip, end: ln.end_no_eol }
133            }
134        })
135        .collect();
136
137    let full_span = match (segments.first(), segments.last()) {
138        (Some(f), Some(l)) => ByteSpan { start: f.start, end: l.end },
139        _ => ByteSpan { start: off, end: off }, // empty heredoc
140    };
141
142    if !found {
143        // Unterminated; return what we have (upstream should report a syntax error)
144        return (HeredocContent { segments, full_span, terminated: false }, off, false);
145    }
146
147    (HeredocContent { segments, full_span, terminated: true }, after_terminator_off, true)
148}
149
/// Locates the line that begins at `off`.
///
/// Returns `(line_start, line_end_excluding_newline, next_offset_after_newline)`.
/// A line ends at the first `\n` or `\r`; `"\r\n"` is consumed as a single
/// line ending, and a lone `\r` or `\n` is consumed on its own. A final line
/// without a line ending is fine: all three values then stop at the end of
/// `src`. When `off` is at or past the end of `src`, `(off, off, off)` is
/// returned.
fn next_line_bounds(src: &[u8], off: usize) -> (usize, usize, usize) {
    let line_start = off;

    // Bytes from `off` onward; empty when `off` is out of range.
    let tail = src.get(line_start..).unwrap_or(&[]);
    let line_len = tail
        .iter()
        .position(|&byte| matches!(byte, b'\n' | b'\r'))
        .unwrap_or(tail.len());
    let line_end = line_start + line_len;

    // Width of the line ending that sits at `line_end`, if any.
    let eol_len = match src.get(line_end..) {
        Some([b'\r', b'\n', ..]) => 2,
        Some([b'\r' | b'\n', ..]) => 1,
        _ => 0,
    };

    (line_start, line_end, line_end + eol_len)
}
170
/// Measures the run of spaces and tabs at the front of `s`.
///
/// Returns the length of that run together with the rest of the slice that
/// follows it. Only `b' '` and `b'\t'` count as whitespace here.
fn split_leading_ws(s: &[u8]) -> (usize, &[u8]) {
    let ws_len = s.iter().take_while(|&&byte| matches!(byte, b' ' | b'\t')).count();
    (ws_len, &s[ws_len..])
}
179
/// Drops a single trailing `'\r'` from `s`, if there is one.
///
/// Intended for label comparison only (CRLF normalization); any other input
/// is returned unchanged.
fn strip_trailing_cr(s: &[u8]) -> &[u8] {
    match s {
        [head @ .., b'\r'] => head,
        _ => s,
    }
}
184
/// Counts how many leading bytes `a` and `b` have in common.
///
/// Comparison stops at the first differing byte or at the end of the shorter
/// slice, whichever comes first.
fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
    a.iter().zip(b).take_while(|(x, y)| x == y).count()
}
194
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::VecDeque;
    use std::sync::Arc;

    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Builds an unquoted declaration for `label` with a dummy declaration span.
    fn decl(label: &str, allow_indent: bool) -> PendingHeredoc {
        PendingHeredoc {
            label: Arc::from(label),
            allow_indent,
            quote: QuoteKind::Unquoted,
            decl_span: ByteSpan { start: 0, end: 0 },
        }
    }

    /// Runs `collect_all` from offset 0 for the given `(label, allow_indent)`
    /// declarations, queued in the order listed.
    fn collect(src: &[u8], decls: &[(&str, bool)]) -> CollectionResult {
        let queue: VecDeque<PendingHeredoc> =
            decls.iter().map(|&(label, allow_indent)| decl(label, allow_indent)).collect();
        collect_all(src, 0, queue)
    }

    /// Decodes the bytes covered by `span` as UTF-8 text.
    fn text(src: &[u8], span: ByteSpan) -> Result<&str, std::str::Utf8Error> {
        std::str::from_utf8(&src[span.start..span.end])
    }

    #[test]
    fn collect_all_consumes_heredocs_in_fifo_order() -> TestResult {
        let src = b"one\nEOF\ntwo\nBAR\nrest";

        let outcome = collect(src, &[("EOF", false), ("BAR", false)]);

        assert_eq!(outcome.terminators_found, vec![true, true]);
        assert_eq!(outcome.contents.len(), 2);
        assert_eq!(text(src, outcome.contents[0].segments[0])?, "one");
        assert_eq!(text(src, outcome.contents[1].segments[0])?, "two");
        assert_eq!(outcome.next_offset, 16);

        Ok(())
    }

    #[test]
    fn collect_all_strips_indented_heredoc_baseline_from_content_segments() -> TestResult {
        let src = b"    first\n  second\n  EOF\nafter";

        let outcome = collect(src, &[("EOF", true)]);
        let body = &outcome.contents[0];

        assert_eq!(outcome.terminators_found, vec![true]);
        assert!(body.terminated);
        assert_eq!(text(src, body.segments[0])?, "  first");
        assert_eq!(text(src, body.segments[1])?, "second");
        assert_eq!(body.full_span, ByteSpan { start: 2, end: 18 });
        assert_eq!(outcome.next_offset, 25);

        Ok(())
    }

    #[test]
    fn collect_all_matches_crlf_terminators_without_including_line_endings() -> TestResult {
        let src = b"alpha\r\nEOF\r\nafter";

        let outcome = collect(src, &[("EOF", false)]);
        let body = &outcome.contents[0];

        assert_eq!(outcome.terminators_found, vec![true]);
        assert_eq!(text(src, body.segments[0])?, "alpha");
        assert_eq!(body.full_span, ByteSpan { start: 0, end: 5 });
        assert_eq!(outcome.next_offset, 12);

        Ok(())
    }

    #[test]
    fn collect_all_reports_unterminated_content_and_stops_at_eof() -> TestResult {
        let src = b"alpha\nbeta";

        let outcome = collect(src, &[("EOF", false)]);
        let body = &outcome.contents[0];

        assert_eq!(outcome.terminators_found, vec![false]);
        assert!(!body.terminated);
        assert_eq!(body.segments.len(), 2);
        assert_eq!(text(src, body.segments[0])?, "alpha");
        assert_eq!(text(src, body.segments[1])?, "beta");
        assert_eq!(outcome.next_offset, src.len());

        Ok(())
    }

    #[test]
    fn collect_all_preserves_spaces_when_indent_is_not_allowed() -> TestResult {
        let src = b"  content\nEOF\n";

        let outcome = collect(src, &[("EOF", false)]);

        assert_eq!(text(src, outcome.contents[0].segments[0])?, "  content");
        assert_eq!(outcome.contents[0].full_span, ByteSpan { start: 0, end: 9 });

        Ok(())
    }
}