Skip to main content

panache_parser/parser/utils/
hashpipe_normalizer.rs

1//! Shared hashpipe header detection and normalization utilities.
2//!
3//! This module detects the contiguous hashpipe YAML preamble at the start of a
4//! code block content string, strips line prefixes into normalized YAML text,
5//! and records deterministic host↔normalized range mappings.
6
7use std::ops::Range;
8
/// Prefix markers explicitly supported by hashpipe normalization.
///
/// [`normalize_hashpipe_header`] returns `None` for any prefix that is not
/// exactly one of these strings.
pub const SUPPORTED_HASHPIPE_PREFIXES: [&str; 3] = ["#|", "//|", "--|"];
11
/// Per-line mapping between host (original content) and normalized YAML text.
///
/// "Host" offsets index into the `content` string passed to
/// [`normalize_hashpipe_header`]; "normalized" offsets index into the
/// resulting `normalized_yaml` string. All ranges are byte ranges.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HashpipeLineMapping {
    /// Byte range of the full host line (including newline, if present).
    pub host_line_range: Range<usize>,
    /// Byte range of stripped host line content (without trailing newline bytes).
    ///
    /// Starts after the line's leading spaces/tabs, the prefix, and at most one
    /// space or tab following the prefix.
    pub host_stripped_range: Range<usize>,
    /// Byte range of normalized line content (without normalized newline byte).
    pub normalized_content_range: Range<usize>,
    /// Byte range of normalized line including normalized newline, if present.
    ///
    /// A normalized `\n` is only present when the host line ended with a newline.
    pub normalized_line_range: Range<usize>,
    /// Host newline byte length for this line (0, 1 for LF, 2 for CRLF).
    pub host_newline_len: usize,
}
26
/// Result of hashpipe header detection and stripping.
///
/// Produced by [`normalize_hashpipe_header`] when the content starts with a
/// hashpipe option line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HashpipeHeaderNormalization {
    /// Prefix that was used for detection and stripping.
    pub prefix: String,
    /// Number of contiguous hashpipe header lines consumed from the start.
    pub header_line_count: usize,
    /// Byte span of the detected header in host content.
    ///
    /// Always starts at 0 and ends after the last header line, including that
    /// line's newline bytes when it has any.
    pub header_byte_span: Range<usize>,
    /// Prefix-stripped YAML text with deterministic `\n` newlines.
    ///
    /// A line is terminated with `\n` only when the corresponding host line
    /// ended with a newline.
    pub normalized_yaml: String,
    /// Per-line host↔normalized mapping metadata.
    ///
    /// Holds one entry per header line, in host order.
    pub line_mappings: Vec<HashpipeLineMapping>,
}
41
/// One line of the host content together with its byte offsets.
#[derive(Debug, Clone, Copy)]
struct LineSlice<'a> {
    /// Line text with the trailing `\n` or `\r\n` removed.
    line_without_newline: &'a str,
    /// Byte offset of the first byte of the line in the host content.
    start: usize,
    /// Byte offset one past the end of the line, newline bytes included.
    end: usize,
    /// Number of newline bytes at the end of the line (0, 1 for LF, 2 for CRLF).
    newline_len: usize,
}
49
50/// Normalize a contiguous leading hashpipe header into YAML text.
51///
52/// Returns `None` when the input does not start with a valid hashpipe option
53/// line for the provided prefix.
54pub fn normalize_hashpipe_header(
55    content: &str,
56    prefix: &str,
57) -> Option<HashpipeHeaderNormalization> {
58    if !SUPPORTED_HASHPIPE_PREFIXES.contains(&prefix) {
59        return None;
60    }
61
62    let lines = split_lines_with_offsets(content);
63    if lines.is_empty() {
64        return None;
65    }
66
67    let mut consumed = 0usize;
68    let mut saw_prefix = false;
69    let mut open_quoted: Option<String> = None;
70    let mut open_block_scalar = false;
71    let mut open_flow_collection = false;
72    let mut open_indented_yaml_value = false;
73
74    while consumed < lines.len() {
75        let line = lines[consumed];
76        let trimmed = line.line_without_newline.trim_start_matches([' ', '\t']);
77
78        if let Some(mut value) = open_quoted.take()
79            && let Some(fragment) = continuation_value(trimmed, prefix)
80        {
81            if !value.ends_with(' ') {
82                value.push(' ');
83            }
84            value.push_str(&fragment);
85            consumed += 1;
86            if is_unclosed_double_quoted(&value) {
87                open_quoted = Some(value);
88            }
89            continue;
90        }
91
92        if open_block_scalar {
93            if let Some(after_prefix) = trimmed.strip_prefix(prefix)
94                && is_block_scalar_continuation_line(after_prefix)
95            {
96                consumed += 1;
97                continue;
98            }
99            open_block_scalar = false;
100        }
101
102        if open_flow_collection {
103            if let Some(after_prefix) = trimmed.strip_prefix(prefix)
104                && is_flow_collection_continuation_line(after_prefix)
105            {
106                consumed += 1;
107                if let Some(value) = option_value(trimmed, prefix)
108                    && !is_unclosed_flow_collection(&value)
109                {
110                    open_flow_collection = false;
111                }
112                continue;
113            }
114            open_flow_collection = false;
115        }
116
117        if open_indented_yaml_value {
118            if let Some(after_prefix) = trimmed.strip_prefix(prefix)
119                && is_block_scalar_continuation_line(after_prefix)
120            {
121                consumed += 1;
122                continue;
123            }
124            open_indented_yaml_value = false;
125        }
126
127        if is_hashpipe_option_line(trimmed, prefix) {
128            saw_prefix = true;
129            if let Some(value) = option_value(trimmed, prefix) {
130                if is_unclosed_double_quoted(&value) {
131                    open_quoted = Some(value);
132                } else if is_yaml_block_scalar_indicator(&value) {
133                    open_block_scalar = true;
134                } else if is_unclosed_flow_collection(&value) {
135                    open_flow_collection = true;
136                } else if value.is_empty() {
137                    open_indented_yaml_value = true;
138                }
139            }
140            consumed += 1;
141            continue;
142        }
143
144        break;
145    }
146
147    if !saw_prefix || consumed == 0 {
148        return None;
149    }
150
151    let header_end = lines[consumed - 1].end;
152    let mut normalized_yaml = String::new();
153    let mut line_mappings = Vec::with_capacity(consumed);
154    let mut normalized_pos = 0usize;
155
156    for line in &lines[..consumed] {
157        let stripped = strip_hashpipe_prefix_once(line.line_without_newline, prefix)?;
158
159        let trimmed_start = line.line_without_newline.trim_start_matches([' ', '\t']);
160        let leading_ws_len = line.line_without_newline.len() - trimmed_start.len();
161        let after_prefix = &trimmed_start[prefix.len()..];
162        let removed_space_len = usize::from(after_prefix.starts_with([' ', '\t']));
163        let host_stripped_start = line.start + leading_ws_len + prefix.len() + removed_space_len;
164        let host_stripped_end = line.start + line.line_without_newline.len();
165
166        let normalized_content_start = normalized_pos;
167        normalized_yaml.push_str(stripped);
168        normalized_pos += stripped.len();
169        if line.newline_len > 0 {
170            normalized_yaml.push('\n');
171            normalized_pos += 1;
172        }
173
174        line_mappings.push(HashpipeLineMapping {
175            host_line_range: line.start..line.end,
176            host_stripped_range: host_stripped_start..host_stripped_end,
177            normalized_content_range: normalized_content_start
178                ..(normalized_content_start + stripped.len()),
179            normalized_line_range: normalized_content_start..normalized_pos,
180            host_newline_len: line.newline_len,
181        });
182    }
183
184    Some(HashpipeHeaderNormalization {
185        prefix: prefix.to_string(),
186        header_line_count: consumed,
187        header_byte_span: 0..header_end,
188        normalized_yaml,
189        line_mappings,
190    })
191}
192
193fn split_lines_with_offsets(content: &str) -> Vec<LineSlice<'_>> {
194    let mut lines = Vec::new();
195    let mut idx = 0usize;
196    let bytes = content.as_bytes();
197
198    while idx < content.len() {
199        let mut end = idx;
200        while end < content.len() && bytes[end] != b'\n' {
201            end += 1;
202        }
203        if end < content.len() {
204            end += 1; // include '\n'
205        }
206
207        let full = &content[idx..end];
208        let newline_len = if full.ends_with("\r\n") {
209            2
210        } else if full.ends_with('\n') {
211            1
212        } else {
213            0
214        };
215        let line_without_newline = &full[..full.len().saturating_sub(newline_len)];
216
217        lines.push(LineSlice {
218            line_without_newline,
219            start: idx,
220            end,
221            newline_len,
222        });
223
224        idx = end;
225    }
226
227    lines
228}
229
/// Remove leading spaces/tabs, the hashpipe `prefix`, and at most one space
/// or tab after the prefix.
///
/// Returns `None` when the line does not start with `prefix` once leading
/// spaces and tabs are skipped.
fn strip_hashpipe_prefix_once<'a>(line_without_newline: &'a str, prefix: &str) -> Option<&'a str> {
    let body = line_without_newline
        .trim_start_matches([' ', '\t'])
        .strip_prefix(prefix)?;
    // Only a single separator character is dropped so deeper indentation
    // stays visible in the returned text.
    Some(body.strip_prefix([' ', '\t']).unwrap_or(body))
}
241
/// Report whether a line has the shape `<prefix> key: ...`.
///
/// Leading spaces/tabs before the prefix and between the prefix and the key
/// are ignored. The key is everything before the first `:` and must be
/// non-empty once trailing spaces and tabs are removed.
fn is_hashpipe_option_line(line_without_newline: &str, prefix: &str) -> bool {
    line_without_newline
        .trim_start_matches([' ', '\t'])
        .strip_prefix(prefix)
        .and_then(|after_prefix| after_prefix.trim_start_matches([' ', '\t']).split_once(':'))
        .is_some_and(|(key, _value)| !key.trim_end_matches([' ', '\t']).is_empty())
}
255
256fn option_value(line_without_newline: &str, prefix: &str) -> Option<String> {
257    if !is_hashpipe_option_line(line_without_newline, prefix) {
258        return None;
259    }
260    let trimmed_start = line_without_newline.trim_start_matches([' ', '\t']);
261    let after_prefix = &trimmed_start[prefix.len()..];
262    let rest = after_prefix.trim_start_matches([' ', '\t']);
263    let colon_idx = rest.find(':')?;
264    let value = rest[colon_idx + 1..]
265        .trim_start_matches([' ', '\t'])
266        .trim_end_matches([' ', '\t']);
267    Some(value.to_string())
268}
269
/// Extract the text of a continuation line for a multi-line quoted value.
///
/// A continuation line starts (after optional spaces/tabs) with `prefix`
/// followed by at least one space or tab. Returns the remaining text with
/// surrounding spaces and tabs removed, or `None` when the line has a
/// different shape or carries no text.
fn continuation_value(line_without_newline: &str, prefix: &str) -> Option<String> {
    let after_prefix = line_without_newline
        .trim_start_matches([' ', '\t'])
        .strip_prefix(prefix)?;

    // The prefix must be separated from the text by whitespace.
    if !after_prefix.starts_with([' ', '\t']) {
        return None;
    }

    let fragment = after_prefix
        .trim_start_matches([' ', '\t'])
        .trim_end_matches([' ', '\t']);
    (!fragment.is_empty()).then(|| fragment.to_owned())
}
289
/// Report whether `value` consists solely of a YAML block scalar header.
///
/// After trimming surrounding whitespace the value must start with `|` or
/// `>`, optionally followed only by `+`, `-`, or ASCII digits.
fn is_yaml_block_scalar_indicator(value: &str) -> bool {
    match value.trim().strip_prefix(['|', '>']) {
        Some(modifiers) => modifiers
            .chars()
            .all(|c| matches!(c, '+' | '-') || c.is_ascii_digit()),
        None => false,
    }
}
304
/// Count the spaces and tabs at the start of `text`.
fn leading_ws_count(text: &str) -> usize {
    // Space and tab are single-byte in UTF-8, so scanning bytes counts the
    // same characters as scanning `char`s.
    text.bytes()
        .take_while(|byte| matches!(byte, b' ' | b'\t'))
        .count()
}
308
309fn is_block_scalar_continuation_line(after_prefix: &str) -> bool {
310    let text = after_prefix.trim_end_matches(['\n', '\r']);
311    if text.trim().is_empty() {
312        return true;
313    }
314    leading_ws_count(text) >= 2
315}
316
317fn is_flow_collection_continuation_line(after_prefix: &str) -> bool {
318    if is_block_scalar_continuation_line(after_prefix) {
319        return true;
320    }
321    let trimmed = after_prefix
322        .trim_end_matches(['\n', '\r'])
323        .trim_start_matches([' ', '\t']);
324    trimmed.starts_with(']') || trimmed.starts_with('}')
325}
326
/// Report whether `value` starts a double-quoted scalar that is still open.
///
/// Only values beginning with `"` qualify. A backslash escapes the character
/// that follows it; the scalar is open when the number of unescaped `"`
/// characters is odd.
fn is_unclosed_double_quoted(value: &str) -> bool {
    if !value.starts_with('"') {
        return false;
    }

    let mut inside_quotes = false;
    let mut chars = value.chars();
    while let Some(ch) = chars.next() {
        match ch {
            // Skip whatever the backslash escapes (nothing at end of input).
            '\\' => {
                chars.next();
            }
            '"' => inside_quotes = !inside_quotes,
            _ => {}
        }
    }
    inside_quotes
}
348
349fn is_unclosed_flow_collection(value: &str) -> bool {
350    let trimmed = value.trim_start();
351    if !trimmed.starts_with('[') && !trimmed.starts_with('{') {
352        return false;
353    }
354
355    let mut stack: Vec<char> = Vec::new();
356    let mut in_single = false;
357    let mut in_double = false;
358    let mut escaped = false;
359
360    for ch in value.chars() {
361        if escaped {
362            escaped = false;
363            continue;
364        }
365        match ch {
366            '\\' if in_double => escaped = true,
367            '\'' if !in_double => in_single = !in_single,
368            '"' if !in_single => in_double = !in_double,
369            '[' | '{' if !in_single && !in_double => stack.push(ch),
370            ']' if !in_single && !in_double => {
371                if stack.pop() != Some('[') {
372                    return false;
373                }
374            }
375            '}' if !in_single && !in_double => {
376                if stack.pop() != Some('{') {
377                    return false;
378                }
379            }
380            _ => {}
381        }
382    }
383
384    !stack.is_empty() || in_single || in_double
385}
386
#[cfg(test)]
mod tests {
    use super::normalize_hashpipe_header;

    /// Every supported prefix produces the same normalized YAML and spans.
    #[test]
    fn normalizes_supported_prefixes() {
        for prefix in ["#|", "//|", "--|"] {
            let header = format!("{prefix} echo: true\n{prefix} warning: false\n");
            let input = format!("{header}x <- 1\n");

            let result = normalize_hashpipe_header(&input, prefix).expect("expected header");

            assert_eq!(result.header_line_count, 2);
            assert_eq!(result.header_byte_span, 0..header.len());
            assert_eq!(result.normalized_yaml, "echo: true\nwarning: false\n");
            assert_eq!(result.line_mappings.len(), 2);
        }
    }

    /// A double-quoted value may spill onto following hashpipe lines.
    #[test]
    fn handles_multiline_quoted_value() {
        let result = normalize_hashpipe_header(
            "#| title: \"hello\n#|   world\"\n#| echo: true\nbody\n",
            "#|",
        )
        .expect("expected header");

        assert_eq!(result.header_line_count, 3);
        assert_eq!(
            result.normalized_yaml,
            "title: \"hello\n  world\"\necho: true\n"
        );
    }

    /// Flow collections, block scalars and indented nested values all pull
    /// their continuation lines into the header.
    #[test]
    fn handles_flow_collection_and_block_scalar_and_indented_value() {
        let cases = [
            (
                "#| tags: [a,\n#|   b,\n#|   c]\ncode\n",
                "expected flow header",
                3,
                "tags: [a,\n  b,\n  c]\n",
            ),
            (
                "#| fig-cap: |\n#|   one\n#|   two\n#| echo: true\n",
                "expected scalar header",
                4,
                "fig-cap: |\n  one\n  two\necho: true\n",
            ),
            (
                "#| fig-cap:\n#|   - A\n#|   - B\nplot()\n",
                "expected indented header",
                3,
                "fig-cap:\n  - A\n  - B\n",
            ),
        ];

        for (input, missing_header_msg, expected_lines, expected_yaml) in cases {
            let result = normalize_hashpipe_header(input, "#|").expect(missing_header_msg);
            assert_eq!(result.header_line_count, expected_lines);
            assert_eq!(result.normalized_yaml, expected_yaml);
        }
    }

    /// Only a header at the very start counts; later hashpipe lines are ignored.
    #[test]
    fn handles_no_header_and_partial_header() {
        let code_first = "plot(1:3)\n#| echo: true\n";
        assert!(normalize_hashpipe_header(code_first, "#|").is_none());

        let header_then_code = "#| echo: true\nplot(1:3)\n#| warning: false\n";
        let result =
            normalize_hashpipe_header(header_then_code, "#|").expect("expected leading header");
        assert_eq!(result.header_line_count, 1);
        assert_eq!(result.normalized_yaml, "echo: true\n");
        assert_eq!(result.header_byte_span.end, "#| echo: true\n".len());
    }

    /// CRLF host newlines are recorded per line but normalized to `\n`.
    #[test]
    fn handles_crlf_deterministically() {
        let header = "#| echo: true\r\n#|  warning: false\r\n";
        let input = format!("{header}body\r\n");

        let result = normalize_hashpipe_header(&input, "#|").expect("expected header");

        assert_eq!(result.header_line_count, 2);
        assert_eq!(result.normalized_yaml, "echo: true\n warning: false\n");
        assert_eq!(result.line_mappings[0].host_newline_len, 2);
        assert_eq!(result.line_mappings[1].host_newline_len, 2);
        assert_eq!(result.header_byte_span.end, header.len());
    }
}
460            normalized.header_byte_span.end,
461            "#| echo: true\r\n#|  warning: false\r\n".len()
462        );
463    }
464}