Skip to main content

tess/
prettify.rs

//! Content-type detection and pretty-printing for structured data.
//!
//! Used by `--prettify` and `--content-type` to lay out JSON, YAML, TOML,
//! XML, HTML, and CSV inputs in a readable form. The transformation runs once
//! at startup (or on toggle) and produces a fresh byte buffer that the line
//! index treats as the new source content. No syntax highlighting / color —
//! layout only — so search and filter stay byte-clean.
8
9use std::path::Path;
10
11use quick_xml::events::Event;
12use quick_xml::reader::Reader;
13use quick_xml::writer::Writer;
14use std::io::Cursor;
15
/// The structured-data layout, if any, that is applied to the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PrettifyMode {
    /// No layout is applied.
    Off,
    /// JSON layout.
    Json,
    /// YAML layout.
    Yaml,
    /// TOML layout.
    Toml,
    /// XML layout.
    Xml,
    /// HTML layout.
    Html,
    /// CSV layout.
    Csv,
}

impl PrettifyMode {
    /// Short lowercase name for the status line (for example `"json"`).
    ///
    /// [`PrettifyMode::Off`] yields the empty string.
    pub fn label(self) -> &'static str {
        match self {
            PrettifyMode::Off => "",
            PrettifyMode::Json => "json",
            PrettifyMode::Yaml => "yaml",
            PrettifyMode::Toml => "toml",
            PrettifyMode::Xml => "xml",
            PrettifyMode::Html => "html",
            PrettifyMode::Csv => "csv",
        }
    }

    /// Returns `true` for every mode other than [`PrettifyMode::Off`].
    pub fn is_active(self) -> bool {
        self != PrettifyMode::Off
    }
}
45
46/// Result of resolving the user's content-type intent against the available
47/// signals (explicit flag → extension → byte sniff → raw fallback).
48#[derive(Debug, Clone, PartialEq, Eq)]
49pub enum ResolvedType {
50    Mode(PrettifyMode),
51    /// Auto-detect was requested but nothing matched. Caller should warn and
52    /// fall through to `Off`.
53    Undetected,
54}
55
56/// Parse a `--content-type=NAME` value. Case-insensitive. `auto` returns
57/// `None` (caller should run detection); `raw` maps to `Off`.
58pub fn parse_content_type(name: &str) -> Result<Option<PrettifyMode>, String> {
59    let lc = name.trim().to_ascii_lowercase();
60    let mode = match lc.as_str() {
61        "auto" => return Ok(None),
62        "raw" | "off" | "none" => PrettifyMode::Off,
63        "json" => PrettifyMode::Json,
64        "yaml" | "yml" => PrettifyMode::Yaml,
65        "toml" => PrettifyMode::Toml,
66        "xml" => PrettifyMode::Xml,
67        "html" | "htm" => PrettifyMode::Html,
68        "csv" => PrettifyMode::Csv,
69        other => {
70            return Err(format!(
71                "unknown content type `{other}` (try one of: \
72auto, raw, json, yaml, toml, xml, html, csv)"
73            ));
74        }
75    };
76    Ok(Some(mode))
77}
78
79/// Detect from filename extension. Returns `None` if nothing matches.
80pub fn detect_from_path(path: &Path) -> Option<PrettifyMode> {
81    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
82    Some(match ext.as_str() {
83        "json" => PrettifyMode::Json,
84        "yaml" | "yml" => PrettifyMode::Yaml,
85        "toml" => PrettifyMode::Toml,
86        "xml" => PrettifyMode::Xml,
87        "html" | "htm" => PrettifyMode::Html,
88        "csv" => PrettifyMode::Csv,
89        _ => return None,
90    })
91}
92
93/// Detect from leading bytes. Returns `None` if nothing matches. Cheap;
94/// inspects up to ~512 bytes.
95pub fn detect_from_bytes(bytes: &[u8]) -> Option<PrettifyMode> {
96    let head_len = bytes.len().min(512);
97    let head = &bytes[..head_len];
98    // Skip leading whitespace.
99    let trimmed_start = head.iter().position(|b| !b.is_ascii_whitespace())?;
100    let trimmed = &head[trimmed_start..];
101    if trimmed.is_empty() {
102        return None;
103    }
104    // XML declaration.
105    if trimmed.starts_with(b"<?xml") {
106        return Some(PrettifyMode::Xml);
107    }
108    // HTML doctype or root element. Lowercase comparison on the first ~200 bytes.
109    let head_lc: Vec<u8> = trimmed.iter().take(200).map(|b| b.to_ascii_lowercase()).collect();
110    if head_lc.starts_with(b"<!doctype html") || head_lc.starts_with(b"<html") {
111        return Some(PrettifyMode::Html);
112    }
113    // Generic XML element start.
114    if trimmed[0] == b'<' {
115        return Some(PrettifyMode::Xml);
116    }
117    // JSON object or array.
118    if trimmed[0] == b'{' || trimmed[0] == b'[' {
119        return Some(PrettifyMode::Json);
120    }
121    // YAML document marker on its own line (after optional whitespace).
122    if trimmed.starts_with(b"---") {
123        let rest = &trimmed[3..];
124        if rest.is_empty() || rest[0] == b'\n' || rest[0] == b'\r' {
125            return Some(PrettifyMode::Yaml);
126        }
127    }
128    None
129}
130
131/// Combined resolver: explicit override (already parsed) → path extension
132/// → byte sniff → undetected.
133pub fn resolve(
134    explicit: Option<PrettifyMode>,
135    path: Option<&Path>,
136    bytes: &[u8],
137) -> ResolvedType {
138    if let Some(m) = explicit {
139        return ResolvedType::Mode(m);
140    }
141    if let Some(p) = path {
142        if let Some(m) = detect_from_path(p) {
143            return ResolvedType::Mode(m);
144        }
145    }
146    if let Some(m) = detect_from_bytes(bytes) {
147        return ResolvedType::Mode(m);
148    }
149    ResolvedType::Undetected
150}
151
152/// Run the transform for `mode` over `input`. `Off` returns the input verbatim
153/// (still allocates — callers can short-circuit if they care). On parse
154/// failure, returns the error string for the status line.
155pub fn prettify(mode: PrettifyMode, input: &[u8]) -> Result<Vec<u8>, String> {
156    match mode {
157        PrettifyMode::Off => Ok(input.to_vec()),
158        PrettifyMode::Json => prettify_json(input),
159        PrettifyMode::Yaml => prettify_yaml(input),
160        PrettifyMode::Toml => prettify_toml(input),
161        PrettifyMode::Xml => prettify_xml(input, false),
162        PrettifyMode::Html => prettify_xml(input, true),
163        PrettifyMode::Csv => prettify_csv(input),
164    }
165}
166
167fn prettify_json(input: &[u8]) -> Result<Vec<u8>, String> {
168    let value: serde_json::Value =
169        serde_json::from_slice(input).map_err(|e| format!("json parse: {e}"))?;
170    let mut out = serde_json::to_vec_pretty(&value).map_err(|e| e.to_string())?;
171    if !out.ends_with(b"\n") {
172        out.push(b'\n');
173    }
174    Ok(out)
175}
176
177fn prettify_yaml(input: &[u8]) -> Result<Vec<u8>, String> {
178    let s = std::str::from_utf8(input).map_err(|e| format!("yaml: utf-8: {e}"))?;
179    let value: serde_yml::Value =
180        serde_yml::from_str(s).map_err(|e| format!("yaml parse: {e}"))?;
181    serde_yml::to_string(&value)
182        .map(|s| s.into_bytes())
183        .map_err(|e| format!("yaml emit: {e}"))
184}
185
186fn prettify_toml(input: &[u8]) -> Result<Vec<u8>, String> {
187    let s = std::str::from_utf8(input).map_err(|e| format!("toml: utf-8: {e}"))?;
188    let value: toml::Value = s.parse().map_err(|e: toml::de::Error| format!("toml parse: {e}"))?;
189    toml::to_string_pretty(&value)
190        .map(|s| s.into_bytes())
191        .map_err(|e| format!("toml emit: {e}"))
192}
193
194/// Pretty-print XML/HTML by streaming through quick-xml events and re-emitting
195/// with two-space indentation. `lenient = true` for HTML — turns off the strict
196/// closing-tag-name check so unclosed void elements (`<br>`, `<img>`) and
197/// case-insensitive close tags don't abort the parse.
198fn prettify_xml(input: &[u8], lenient: bool) -> Result<Vec<u8>, String> {
199    let mut reader = Reader::from_reader(input);
200    let cfg = reader.config_mut();
201    cfg.trim_text(true);
202    if lenient {
203        cfg.check_end_names = false;
204    }
205    let mut writer = Writer::new_with_indent(Cursor::new(Vec::new()), b' ', 2);
206    let mut buf = Vec::new();
207    loop {
208        match reader.read_event_into(&mut buf) {
209            Ok(Event::Eof) => break,
210            Ok(e) => writer
211                .write_event(e)
212                .map_err(|e| format!("xml emit: {e}"))?,
213            Err(e) => return Err(format!("xml parse: {e}")),
214        }
215        buf.clear();
216    }
217    let mut out = writer.into_inner().into_inner();
218    if !out.ends_with(b"\n") {
219        out.push(b'\n');
220    }
221    Ok(out)
222}
223
224/// Render CSV as a fixed-width aligned table with `|` separators.
225/// Wide cells are truncated at 60 characters with an ellipsis so a single
226/// runaway free-text column doesn't blow up the layout.
227fn prettify_csv(input: &[u8]) -> Result<Vec<u8>, String> {
228    const COL_CAP: usize = 60;
229    let mut rdr = csv::ReaderBuilder::new()
230        .has_headers(false)
231        .flexible(true)
232        .from_reader(input);
233    let records: Vec<csv::StringRecord> = rdr
234        .records()
235        .collect::<Result<_, _>>()
236        .map_err(|e| format!("csv parse: {e}"))?;
237    if records.is_empty() {
238        return Ok(Vec::new());
239    }
240    let cols = records.iter().map(|r| r.len()).max().unwrap_or(0);
241    let mut widths = vec![0usize; cols];
242    for r in &records {
243        for (i, cell) in r.iter().enumerate() {
244            let w = cell.chars().count().min(COL_CAP);
245            if w > widths[i] {
246                widths[i] = w;
247            }
248        }
249    }
250    let mut out = String::new();
251    for r in &records {
252        let mut parts: Vec<String> = Vec::with_capacity(cols);
253        for (i, width) in widths.iter().enumerate().take(cols) {
254            let cell = r.get(i).unwrap_or("");
255            let truncated: String = if cell.chars().count() > COL_CAP {
256                let mut s: String = cell.chars().take(COL_CAP - 1).collect();
257                s.push('…');
258                s
259            } else {
260                cell.to_string()
261            };
262            let pad = width.saturating_sub(truncated.chars().count());
263            parts.push(format!("{truncated}{}", " ".repeat(pad)));
264        }
265        out.push_str(&parts.join(" | "));
266        out.push('\n');
267    }
268    Ok(out.into_bytes())
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `prettify` and returns its output as text, panicking if the
    /// transform fails or the output is not UTF-8.
    fn pretty(mode: PrettifyMode, input: &[u8]) -> String {
        let bytes = prettify(mode, input).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    #[test]
    fn parse_content_type_recognizes_aliases() {
        let accepted = [
            ("auto", None),
            ("raw", Some(PrettifyMode::Off)),
            ("JSON", Some(PrettifyMode::Json)),
            (" yml ", Some(PrettifyMode::Yaml)),
            ("htm", Some(PrettifyMode::Html)),
        ];
        for &(name, expected) in &accepted {
            assert_eq!(parse_content_type(name).unwrap(), expected, "name: {name:?}");
        }
        assert!(parse_content_type("nonsense").is_err());
    }

    #[test]
    fn detect_from_path_recognizes_known_extensions() {
        let cases = [
            ("a.json", Some(PrettifyMode::Json)),
            ("a.YAML", Some(PrettifyMode::Yaml)),
            ("a.yml", Some(PrettifyMode::Yaml)),
            ("a.toml", Some(PrettifyMode::Toml)),
            ("page.HTML", Some(PrettifyMode::Html)),
            ("data.csv", Some(PrettifyMode::Csv)),
            ("README", None),
            ("a.txt", None),
        ];
        for &(file, expected) in &cases {
            assert_eq!(detect_from_path(Path::new(file)), expected, "file: {file:?}");
        }
    }

    #[test]
    fn detect_from_bytes_sniffs_json() {
        let samples: [&[u8]; 2] = [b"{\"a\":1}", b"   [1,2,3]"];
        for sample in samples.iter() {
            assert_eq!(detect_from_bytes(sample), Some(PrettifyMode::Json));
        }
    }

    #[test]
    fn detect_from_bytes_sniffs_xml_declaration() {
        let detected = detect_from_bytes(b"<?xml version=\"1.0\"?>");
        assert_eq!(detected, Some(PrettifyMode::Xml));
    }

    #[test]
    fn detect_from_bytes_sniffs_html_doctype_case_insensitive() {
        let samples: [&[u8]; 2] = [b"<!DOCTYPE html>", b"<html><body>"];
        for sample in samples.iter() {
            assert_eq!(detect_from_bytes(sample), Some(PrettifyMode::Html));
        }
    }

    #[test]
    fn detect_from_bytes_sniffs_yaml_doc_marker() {
        assert_eq!(detect_from_bytes(b"---\nkey: value\n"), Some(PrettifyMode::Yaml));
        // A triple dash that runs straight into other text is not a marker.
        assert_eq!(detect_from_bytes(b"---changelog"), None);
    }

    #[test]
    fn detect_from_bytes_falls_back_to_none() {
        let samples: [&[u8]; 3] = [b"plain text", b"", b"   \n\t  "];
        for sample in samples.iter() {
            assert_eq!(detect_from_bytes(sample), None);
        }
    }

    #[test]
    fn prettify_json_indents_compact_input() {
        let s = pretty(PrettifyMode::Json, br#"{"a":1,"b":[2,3]}"#);
        assert!(s.contains(r#""a": 1"#));
        assert!(s.contains(r#""b":"#));
        // The pretty form spreads over several lines.
        let newlines = s.chars().filter(|&c| c == '\n').count();
        assert!(newlines >= 4);
    }

    #[test]
    fn prettify_json_returns_error_on_bad_input() {
        let result = prettify(PrettifyMode::Json, b"{not json");
        assert!(result.is_err());
    }

    #[test]
    fn prettify_yaml_round_trips() {
        let s = pretty(PrettifyMode::Yaml, b"a: 1\nb:\n  - 2\n  - 3\n");
        for key in ["a:", "b:"].iter() {
            assert!(s.contains(key));
        }
    }

    #[test]
    fn prettify_toml_indents_compact_input() {
        let s = pretty(PrettifyMode::Toml, b"a=1\nb=2\n[s]\nc=3\n");
        for fragment in ["a = 1", "[s]"].iter() {
            assert!(s.contains(fragment));
        }
    }

    #[test]
    fn prettify_xml_indents_with_text_preservation() {
        let s = pretty(PrettifyMode::Xml, b"<root><a>x</a><b/></root>");
        assert!(s.contains("<root>"));
        assert!(s.contains("<a>x</a>"));
        // A child element should sit on its own indented line.
        assert!(s.contains("\n  "), "expected indented child, got: {s}");
    }

    #[test]
    fn prettify_html_handles_unclosed_void_tags() {
        // `<br>` and `<img>` are void in HTML and are not self-closed here.
        // Strict XML mode would reject this; the lenient HTML mode accepts it.
        let s = pretty(
            PrettifyMode::Html,
            b"<html><body><br><img src=\"x\"></body></html>",
        );
        assert!(s.contains("<html>"));
        assert!(s.contains("<br"));
    }

    #[test]
    fn prettify_csv_aligns_columns() {
        let table = pretty(PrettifyMode::Csv, b"name,age\nalice,30\nbob,4\n");
        let lines: Vec<&str> = table.lines().collect();
        assert_eq!(lines.len(), 3);
        // Padding the first column puts the first " | " separator at the same
        // byte offset on every line.
        let offsets: Vec<usize> = lines
            .iter()
            .map(|line| line.find(" | ").unwrap())
            .collect();
        assert!(
            offsets.iter().all(|&offset| offset == offsets[0]),
            "expected aligned columns, got: {lines:?}"
        );
    }

    #[test]
    fn prettify_csv_truncates_long_cells() {
        let input = format!("a,{}\n1,2\n", "x".repeat(200));
        let s = pretty(PrettifyMode::Csv, input.as_bytes());
        assert!(s.contains('…'), "expected ellipsis truncation, got: {s}");
    }

    #[test]
    fn prettify_off_passes_through() {
        let raw: &[u8] = b"arbitrary bytes\nwith newlines\n";
        let out = prettify(PrettifyMode::Off, raw).unwrap();
        assert_eq!(out.as_slice(), raw);
    }

    #[test]
    fn resolve_prefers_explicit_then_path_then_sniff() {
        let json_path = Path::new("a.json");

        // An explicit mode beats both the path and the content.
        let explicit = resolve(Some(PrettifyMode::Yaml), Some(json_path), b"{\"x\":1}");
        assert_eq!(explicit, ResolvedType::Mode(PrettifyMode::Yaml));

        // Without an explicit mode the extension decides.
        let by_path = resolve(None, Some(json_path), b"plain text");
        assert_eq!(by_path, ResolvedType::Mode(PrettifyMode::Json));

        // Without a path the leading bytes decide.
        let by_sniff = resolve(None, None, b"<?xml version=\"1.0\"?><r/>");
        assert_eq!(by_sniff, ResolvedType::Mode(PrettifyMode::Xml));

        // No signal at all.
        assert_eq!(resolve(None, None, b"plain text"), ResolvedType::Undetected);
    }
}