Skip to main content

nils_common/
markdown.rs

1use std::error::Error;
2use std::fmt;
3
/// Two-character literal sequences (a backslash followed by `n`, `r`, or `t`)
/// that the payload guard searches for. These are raw strings, so they match the
/// escaped *text*, not the real newline / carriage-return / tab characters.
const LITERAL_ESCAPED_CONTROLS: [&str; 3] = [r"\n", r"\r", r"\t"];
5
/// One kind of literal escaped-control sequence found in a markdown payload,
/// together with how often it occurred.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownPayloadViolation {
    /// The literal sequence that was found (for example the two characters `\n`).
    pub sequence: &'static str,
    /// Number of occurrences of `sequence` in the scanned text.
    pub count: usize,
}
11
/// Error produced by `validate_markdown_payload` when a payload contains at
/// least one literal escaped-control sequence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownPayloadError {
    // Kept private; read access goes through `violations()`.
    violations: Vec<MarkdownPayloadViolation>,
}
16
17impl MarkdownPayloadError {
18    pub fn violations(&self) -> &[MarkdownPayloadViolation] {
19        &self.violations
20    }
21}
22
23impl fmt::Display for MarkdownPayloadError {
24    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
25        let details = self
26            .violations
27            .iter()
28            .map(|entry| format!("{} ({})", entry.sequence, entry.count))
29            .collect::<Vec<_>>()
30            .join(", ");
31        write!(
32            f,
33            "markdown payload contains literal escaped-control artifacts: {details}"
34        )
35    }
36}
37
// Marker impl: relies on the `Debug` derive and the `Display` impl; the
// `std::error::Error` default methods are used as-is (no source error).
impl Error for MarkdownPayloadError {}
39
40pub fn markdown_payload_violations(markdown: &str) -> Vec<MarkdownPayloadViolation> {
41    // Literal escaped controls (`\n`, `\r`, `\t`) are legitimate inside code —
42    // a shell example like `printf 'a\nb'` is not corruption — so only flag them
43    // when they appear in prose / structural markdown. Scan with fenced code
44    // blocks and inline code spans removed.
45    let scannable = strip_code_segments(markdown);
46    let mut violations = Vec::new();
47
48    for sequence in LITERAL_ESCAPED_CONTROLS {
49        let count = scannable.match_indices(sequence).count();
50        if count > 0 {
51            violations.push(MarkdownPayloadViolation { sequence, count });
52        }
53    }
54
55    violations
56}
57
58/// Return `markdown` with fenced code blocks and inline code spans removed so the
59/// escaped-control guard only inspects prose / structure. Removed spans are
60/// replaced with a space (and skipped fenced lines with a newline) so that
61/// neighbouring characters cannot glue into a literal `\n`-style sequence that
62/// was not present in the source.
63fn strip_code_segments(markdown: &str) -> String {
64    let mut out = String::with_capacity(markdown.len());
65    let mut open_fence: Option<(char, usize)> = None;
66
67    for line in markdown.split_inclusive('\n') {
68        if let Some((fence_char, fence_len)) = fence_marker(line) {
69            match open_fence {
70                None => {
71                    // Opening fence — start a code block; drop the fence line.
72                    open_fence = Some((fence_char, fence_len));
73                    out.push('\n');
74                    continue;
75                }
76                Some((open_char, open_len)) if fence_char == open_char && fence_len >= open_len => {
77                    // Closing fence — end the block; drop the fence line.
78                    open_fence = None;
79                    out.push('\n');
80                    continue;
81                }
82                // A fence-looking line of a different kind is block content.
83                Some(_) => {
84                    out.push('\n');
85                    continue;
86                }
87            }
88        }
89        if open_fence.is_some() {
90            // Inside a fenced block — skip the content line.
91            out.push('\n');
92            continue;
93        }
94        out.push_str(&strip_inline_code(line));
95    }
96
97    out
98}
99
/// If `line` is a fenced-code delimiter, return the fence character and the
/// length of the fence run; otherwise `None`.
///
/// A delimiter is a line whose first non-whitespace content is a run of three
/// or more backticks or three or more tildes. Following CommonMark, a backtick
/// run is *not* a fence when another backtick appears later on the same line:
/// the info string of a backtick fence may not contain backticks, so a line
/// such as "```code``` and prose" is an inline code span, not the start of a
/// block. Tilde fences carry no such restriction.
fn fence_marker(line: &str) -> Option<(char, usize)> {
    let trimmed = line.trim_start();
    let fence_char = trimmed.chars().next().filter(|c| matches!(c, '`' | '~'))?;

    let run = trimmed.chars().take_while(|&c| c == fence_char).count();
    if run < 3 {
        return None;
    }

    // Both fence characters are ASCII, so the run length is also the byte
    // offset of whatever follows the run.
    let info = &trimmed[run..];
    if fence_char == '`' && info.contains('`') {
        // Treating this as an opening fence would make the caller swallow the
        // rest of the document as code.
        return None;
    }

    Some((fence_char, run))
}
116
/// Strip backtick-delimited inline code spans from a single `line`.
///
/// A span starts at a run of backticks and ends at the next run of exactly the
/// same length. The whole span, delimiters included, is replaced by one space so
/// the text on either side cannot fuse together. A backtick run with no matching
/// closer is copied through unchanged and scanning resumes right after it.
fn strip_inline_code(line: &str) -> String {
    // Number of consecutive backticks at the start of `text`. Backticks are
    // ASCII, so this is both a character count and a byte length.
    fn leading_backticks(text: &str) -> usize {
        text.bytes().take_while(|&byte| byte == b'`').count()
    }

    // Byte offset just past the first backtick run in `text` whose length is
    // exactly `wanted`, or `None` when no such run exists.
    fn end_of_closing_run(text: &str, wanted: usize) -> Option<usize> {
        let mut cursor = 0;
        while let Some(found) = text[cursor..].find('`') {
            let run_start = cursor + found;
            let run_end = run_start + leading_backticks(&text[run_start..]);
            if run_end - run_start == wanted {
                return Some(run_end);
            }
            // Wrong length: skip the whole run and keep looking.
            cursor = run_end;
        }
        None
    }

    let mut kept = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(start) = rest.find('`') {
        let (plain, from_run) = rest.split_at(start);
        kept.push_str(plain);

        let (opening, after_opening) = from_run.split_at(leading_backticks(from_run));
        match end_of_closing_run(after_opening, opening.len()) {
            Some(span_end) => {
                // Opening run through closing run is a code span: drop it.
                kept.push(' ');
                rest = &after_opening[span_end..];
            }
            None => {
                // Unterminated: the backticks are literal text.
                kept.push_str(opening);
                rest = after_opening;
            }
        }
    }

    kept.push_str(rest);
    kept
}
171
172pub fn validate_markdown_payload(markdown: &str) -> Result<(), MarkdownPayloadError> {
173    let violations = markdown_payload_violations(markdown);
174    if violations.is_empty() {
175        Ok(())
176    } else {
177        Err(MarkdownPayloadError { violations })
178    }
179}
180
/// Make `value` safe to place inside a single markdown table cell.
///
/// Each run of consecutive `\n` / `\r` characters collapses to one space, and
/// every `|` becomes `/` so it cannot be read as a column separator. All other
/// characters are copied unchanged.
pub fn canonicalize_table_cell(value: &str) -> String {
    let mut cell = String::with_capacity(value.len());
    let mut previous_was_break = false;

    for ch in value.chars() {
        let is_break = matches!(ch, '\n' | '\r');
        if is_break {
            // Only the first character of a line-break run emits the space.
            if !previous_was_break {
                cell.push(' ');
            }
        } else {
            cell.push(if ch == '|' { '/' } else { ch });
        }
        previous_was_break = is_break;
    }

    cell
}
206
207fn sort_json(value: &serde_json::Value) -> serde_json::Value {
208    match value {
209        serde_json::Value::Object(map) => {
210            let mut keys: Vec<&String> = map.keys().collect();
211            keys.sort();
212            let mut out = serde_json::Map::new();
213            for k in keys {
214                let v = map.get(k).expect("key exists");
215                out.insert(k.clone(), sort_json(v));
216            }
217            serde_json::Value::Object(out)
218        }
219        serde_json::Value::Array(values) => {
220            serde_json::Value::Array(values.iter().map(sort_json).collect())
221        }
222        other => other.clone(),
223    }
224}
225
226/// Format JSON similar to `jq -S .` (stable key order, pretty printed).
227pub fn format_json_pretty_sorted(value: &serde_json::Value) -> Result<String, serde_json::Error> {
228    let sorted = sort_json(value);
229    serde_json::to_string_pretty(&sorted)
230}
231
/// Build an ATX-style markdown heading line terminated by a newline.
///
/// `level` is clamped to the range 1..=6 and `text` is trimmed of surrounding
/// whitespace before being placed after the `#` markers.
pub fn heading(level: u8, text: &str) -> String {
    let depth = usize::from(level.clamp(1, 6));

    let mut line = "#".repeat(depth);
    line.push(' ');
    line.push_str(text.trim());
    line.push('\n');
    line
}
236
/// Wrap `body` in a fenced markdown code block tagged with `lang`.
///
/// `lang` is trimmed and placed directly after the opening fence. A newline is
/// appended to `body` when it does not already end with one, so the closing
/// fence always sits on its own line, and the result ends with a newline.
///
/// The fence is normally three backticks. When `body` itself contains a run of
/// three or more backticks, the fence is made one backtick longer than the
/// longest such run so that the body cannot terminate the block early.
pub fn code_block(lang: &str, body: &str) -> String {
    // Splitting on every non-backtick character leaves only the backtick runs
    // (plus empty pieces); the longest piece is the longest run.
    let longest_backtick_run = body
        .split(|ch: char| ch != '`')
        .map(str::len)
        .max()
        .unwrap_or(0);
    let fence = "`".repeat(longest_backtick_run.max(2) + 1);
    let lang = lang.trim();

    let mut out = String::with_capacity(2 * fence.len() + lang.len() + body.len() + 3);
    out.push_str(&fence);
    out.push_str(lang);
    out.push('\n');
    out.push_str(body);
    if !body.ends_with('\n') {
        out.push('\n');
    }
    out.push_str(&fence);
    out.push('\n');
    out
}
249
// Unit tests for the payload guard (escaped-control detection in prose vs.
// code) and for the small markdown / JSON formatting helpers.
#[cfg(test)]
mod tests {
    use super::{
        canonicalize_table_cell, code_block, format_json_pretty_sorted, heading,
        markdown_payload_violations, validate_markdown_payload,
    };

    // Real control characters (actual newline / tab / CR) are fine; only their
    // escaped textual forms are violations.
    #[test]
    fn markdown_payload_validator_accepts_real_control_chars() {
        let payload = "line one\nline two\tvalue\r\n";
        let result = validate_markdown_payload(payload);
        assert!(
            result.is_ok(),
            "unexpected markdown payload error: {result:?}"
        );
    }

    // The raw string holds backslash-letter pairs, so all three sequences are
    // reported and each appears in the rendered error message.
    #[test]
    fn markdown_payload_validator_rejects_literal_escaped_controls() {
        let payload = r"line one\nline two\rline three\tvalue";
        let err = validate_markdown_payload(payload).expect_err("expected markdown payload error");

        assert_eq!(err.violations().len(), 3);
        assert!(
            err.to_string().contains(r"\n"),
            "expected escaped-newline mention in {:?}",
            err
        );
        assert!(
            err.to_string().contains(r"\r"),
            "expected escaped-return mention in {:?}",
            err
        );
        assert!(
            err.to_string().contains(r"\t"),
            "expected escaped-tab mention in {:?}",
            err
        );
    }

    // Sequences with zero occurrences are omitted; the rest keep the order of
    // the sequence list (`\n` before `\t`).
    #[test]
    fn markdown_payload_violations_reports_counts_per_sequence() {
        let payload = r"one\n two\n three\t";
        let violations = markdown_payload_violations(payload);

        assert_eq!(violations.len(), 2);
        assert_eq!(violations[0].sequence, r"\n");
        assert_eq!(violations[0].count, 2);
        assert_eq!(violations[1].sequence, r"\t");
        assert_eq!(violations[1].count, 1);
    }

    // `\\n` in this normal string literal is a literal backslash + `n` inside
    // the fenced block.
    #[test]
    fn markdown_payload_validator_ignores_escaped_controls_in_fenced_code() {
        let payload = "Prose before.\n\n```sh\nprintf 'a\\nb'\n```\n\nProse after.\n";
        assert!(
            validate_markdown_payload(payload).is_ok(),
            "escaped controls inside a fenced code block must not be flagged"
        );
    }

    #[test]
    fn markdown_payload_validator_ignores_escaped_controls_in_inline_code() {
        let payload = r"Run `printf 'a\nb'` to print two lines.";
        assert!(
            validate_markdown_payload(payload).is_ok(),
            "escaped controls inside an inline code span must not be flagged"
        );
    }

    #[test]
    fn markdown_payload_validator_still_flags_escaped_controls_in_prose() {
        // A real escaped newline in prose (outside code) is still corruption.
        let violations = markdown_payload_violations(r"Status: done.\nNext: ship it.");
        assert_eq!(violations.len(), 1);
        assert_eq!(violations[0].sequence, r"\n");
        assert_eq!(violations[0].count, 1);
    }

    #[test]
    fn markdown_payload_validator_flags_prose_but_not_code_in_mixed_payload() {
        // The prose `\n` is flagged once; the occurrences inside the fenced block
        // and the inline span are ignored.
        let payload = "Bad prose: a\\nb\n\n```\nprintf 'x\\ny'\n```\n\nUse `echo 'p\\nq'` here.\n";
        let violations = markdown_payload_violations(payload);
        assert_eq!(
            violations.len(),
            1,
            "only the prose occurrence counts: {violations:?}"
        );
        assert_eq!(violations[0].sequence, r"\n");
        assert_eq!(violations[0].count, 1);
    }

    // `|` becomes `/`, and each run of CR/LF characters becomes one space.
    #[test]
    fn canonicalize_table_cell_normalizes_markdown_unsafe_chars() {
        let value = "A|B\r\nC\nD\rE";
        assert_eq!(canonicalize_table_cell(value), "A/B C D E");
    }

    // Applying the canonicalization twice must give the same result as once.
    #[test]
    fn canonicalize_table_cell_is_idempotent() {
        let first = canonicalize_table_cell("x|y\r\nz");
        let second = canonicalize_table_cell(&first);
        assert_eq!(first, second);
    }

    // Output is the same whether or not the body already ends with a newline.
    #[test]
    fn markdown_code_block_is_newline_stable() {
        assert_eq!(code_block("json", "{ }"), "```json\n{ }\n```\n");
        assert_eq!(code_block("json", "{ }\n"), "```json\n{ }\n```\n");
    }

    // Surrounding whitespace is trimmed and an out-of-range level is clamped
    // to 6.
    #[test]
    fn markdown_heading_trims_and_clamps_level() {
        assert_eq!(heading(1, " Title "), "# Title\n");
        assert_eq!(heading(9, "Title"), "###### Title\n");
    }

    // Keys are sorted at the top level and inside the nested object.
    #[test]
    fn json_format_sorts_keys_recursively() {
        let v = serde_json::json!({"b": 1, "a": {"d": 4, "c": 3}});
        let s = format_json_pretty_sorted(&v).expect("sorted json");
        assert_eq!(
            s,
            "{\n  \"a\": {\n    \"c\": 3,\n    \"d\": 4\n  },\n  \"b\": 1\n}"
        );
    }
}