Skip to main content

carta_readers/
ipynb.rs

1//! Notebook reader: parses a Jupyter notebook (`.ipynb`, nbformat v4) into the document model.
2//!
3//! A notebook is a JSON document with a `cells` array, a `metadata` object, and the `nbformat`
4//! version pair. Each cell becomes a `Div` carrying the classes `cell` and the cell kind, the
5//! cell's `id` as its identifier, and the cell's `metadata` (plus a code cell's `execution_count`)
6//! as ordered attributes:
7//!
8//! - A **markdown** cell's source is parsed as Markdown and its blocks become the `Div` body. The
9//!   embedded Markdown honors the reader's full extension set, so tables, fenced code, task lists,
10//!   and the rest are recognized exactly as configured. An image whose URL begins with
11//!   `attachment:` refers to the cell's inline attachments; the prefix is stripped to leave the
12//!   bare reference.
13//! - A **code** cell yields a `CodeBlock` of its source (tagged with the kernel language) followed
14//!   by one `Div` per execution output: a `stream` (stdout/stderr text), an `execute_result` or
15//!   `display_data` (the richest renderable bundle in the output's `data`), or an `error`
16//!   (its traceback).
17//! - A **raw** cell yields a single `RawBlock` whose target format is read from the cell's
18//!   `raw_mimetype` metadata, falling back to its `format` metadata.
19//!
20//! Notebook-level metadata is exposed under a single `jupyter` metadata key, with the `nbformat`
21//! and `nbformat_minor` versions folded in. An output's image payload is referenced by a
22//! content-addressed file name (the SHA-1 of the decoded bytes) and a markdown cell's attachment by
23//! a cell-scoped name; in both cases the bytes are lifted out of the tree into the media bag, which
24//! [`IpynbReader::read_media`] returns alongside the document.
25
26use std::collections::BTreeMap;
27
28use carta_ast::{
29    ApiVersion, Attr, Block, Document, Format, Inline, MetaValue, Target, ToCompactString,
30};
31use carta_core::media::{base64_decode, content_addressed_name};
32use carta_core::{Error, MediaBag, Reader, ReaderOptions, Result};
33use serde_json::Value;
34
35use crate::commonmark::CommonmarkReader;
36
/// Parses a notebook document into the document model.
///
/// A stateless unit type: all behavior lives in its [`Reader`] implementation, whose
/// `read_media` also returns the media bag holding the bytes lifted out of the notebook.
#[derive(Debug, Default, Clone, Copy)]
pub struct IpynbReader;
40
41impl Reader for IpynbReader {
42    fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
43        self.read_media(input, options)
44            .map(|(document, _)| document)
45    }
46
47    fn read_media(&self, input: &str, options: &ReaderOptions) -> Result<(Document, MediaBag)> {
48        let notebook: Value = serde_json::from_str(input)?;
49        let nbformat = notebook
50            .get("nbformat")
51            .and_then(Value::as_i64)
52            .unwrap_or(4);
53        if nbformat < 4 {
54            return Err(Error::UnsupportedFormat(format!(
55                "notebook format version {nbformat} (only nbformat 4 and later are read)"
56            )));
57        }
58        let nbformat_minor = notebook
59            .get("nbformat_minor")
60            .and_then(Value::as_i64)
61            .unwrap_or(0);
62        let language = notebook_language(&notebook);
63        let meta = build_meta(&notebook, nbformat, nbformat_minor);
64
65        let mut media = MediaBag::new();
66        let mut blocks = Vec::new();
67        if let Some(Value::Array(cells)) = notebook.get("cells") {
68            for cell in cells {
69                if let Some(block) = cell_to_block(cell, &language, options, &mut media)? {
70                    blocks.push(block);
71                }
72            }
73        }
74        let document = Document {
75            api_version: ApiVersion::default(),
76            meta: meta.into_iter().map(|(k, v)| (k.into(), v)).collect(),
77            blocks,
78        };
79        Ok((document, media))
80    }
81}
82
83/// The kernel language, taken from `metadata.kernelspec.language`. Code without a declared language
84/// is tagged `python`.
85fn notebook_language(notebook: &Value) -> String {
86    notebook
87        .get("metadata")
88        .and_then(|metadata| metadata.get("kernelspec"))
89        .and_then(|kernelspec| kernelspec.get("language"))
90        .and_then(Value::as_str)
91        .unwrap_or("python")
92        .to_owned()
93}
94
95/// The document metadata: every notebook-level metadata entry, with the `nbformat`/`nbformat_minor`
96/// versions added, all wrapped under a single `jupyter` key.
97fn build_meta(notebook: &Value, nbformat: i64, nbformat_minor: i64) -> BTreeMap<String, MetaValue> {
98    let mut jupyter: BTreeMap<String, MetaValue> = BTreeMap::new();
99    if let Some(Value::Object(metadata)) = notebook.get("metadata") {
100        for (key, value) in metadata {
101            jupyter.insert(key.clone(), meta_value(value));
102        }
103    }
104    jupyter.insert(
105        "nbformat".to_owned(),
106        MetaValue::MetaString(nbformat.to_compact_string()),
107    );
108    jupyter.insert(
109        "nbformat_minor".to_owned(),
110        MetaValue::MetaString(nbformat_minor.to_compact_string()),
111    );
112    let mut meta = BTreeMap::new();
113    meta.insert(
114        "jupyter".to_owned(),
115        MetaValue::MetaMap(jupyter.into_iter().map(|(k, v)| (k.into(), v)).collect()),
116    );
117    meta
118}
119
120/// Convert a JSON value to a metadata value. Scalars become strings (a null becomes the empty
121/// string, a boolean a `MetaBool`); arrays and objects recurse. A number that is integer-valued —
122/// whether written as an integer or as a float like `3.0` — reads as a plain integer; a fractional
123/// number keeps the general decimal form, falling to scientific notation for very small or very
124/// large magnitudes.
125fn meta_value(value: &Value) -> MetaValue {
126    match value {
127        Value::Null => MetaValue::MetaString(carta_ast::Text::default()),
128        Value::Bool(flag) => MetaValue::MetaBool(*flag),
129        Value::Number(number) => MetaValue::MetaString(meta_number(number).into()),
130        Value::String(text) => MetaValue::MetaString(text.clone().into()),
131        Value::Array(items) => MetaValue::MetaList(items.iter().map(meta_value).collect()),
132        Value::Object(map) => MetaValue::MetaMap(
133            map.iter()
134                .map(|(key, value)| (key.clone().into(), meta_value(value)))
135                .collect(),
136        ),
137    }
138}
139
140/// Render a number for notebook-level metadata: an integer-valued number reads as a plain integer,
141/// while a fractional value is rendered in the general decimal form (scientific notation outside the
142/// magnitude range `[0.1, 10^7)`).
143fn meta_number(number: &serde_json::Number) -> String {
144    if let Some(integer) = number.as_i64() {
145        return integer.to_string();
146    }
147    if let Some(integer) = number.as_u64() {
148        return integer.to_string();
149    }
150    match number.as_f64() {
151        Some(value) if value.is_finite() && value.fract() == 0.0 => integer_string(value),
152        Some(value) => general_decimal(value),
153        None => number.to_string(),
154    }
155}
156
157/// Render a number as a JSON scalar would appear in a serialized bundle: an integer keeps its exact
158/// digits, while a fractional number takes the general decimal form. Unlike [`meta_number`], a value
159/// written with a fractional part such as `3.0` keeps that part (it is not folded to an integer).
160fn json_number(number: &serde_json::Number) -> String {
161    if let Some(integer) = number.as_i64() {
162        return integer.to_string();
163    }
164    if let Some(integer) = number.as_u64() {
165        return integer.to_string();
166    }
167    match number.as_f64() {
168        Some(value) => general_decimal(value),
169        None => number.to_string(),
170    }
171}
172
/// Format a float that holds a whole number as plain integer digits, with no fractional part and
/// no exponent. Both zeros, negative zero included, come out as `0`.
fn integer_string(value: f64) -> String {
    // `-0.0 == 0.0` holds, so this single comparison also strips the sign from negative zero.
    let is_zero = value == 0.0;
    if is_zero {
        String::from("0")
    } else {
        value.to_string()
    }
}
181
182/// Render a floating-point number in the general decimal form: fixed-point notation when the
183/// magnitude lies in `[0.1, 10^7)` and scientific notation otherwise, always carrying at least one
184/// fractional digit (`1.0`, never `1`). Zero renders as `0.0`.
185fn general_decimal(value: f64) -> String {
186    if value == 0.0 {
187        return "0.0".to_owned();
188    }
189    let (digits, exponent) = shortest_digits(value.abs());
190    let body = if (-1..=6).contains(&exponent) {
191        fixed_notation(&digits, exponent)
192    } else {
193        scientific_notation(&digits, exponent)
194    };
195    if value.is_sign_negative() {
196        format!("-{body}")
197    } else {
198        body
199    }
200}
201
/// Split a positive, finite magnitude into its shortest decimal digit run and the power of ten of
/// the leading digit, so that the value equals `d.ddd… × 10^exponent`.
///
/// `0.05` gives (`"5"`, `-2`) and `1234.5` gives (`"12345"`, `3`). The digits are read off the
/// float's lower-exponent (`{:e}`) formatting; a missing or unparsable exponent counts as `0`.
fn shortest_digits(magnitude: f64) -> (String, i32) {
    let rendered = format!("{magnitude:e}");
    let (mantissa, exponent) = rendered
        .split_once('e')
        .map_or((rendered.as_str(), 0), |(head, tail)| {
            (head, tail.parse::<i32>().unwrap_or(0))
        });

    // Keep only the digits: the decimal point (and anything else) is dropped.
    let mut digits = String::with_capacity(mantissa.len());
    for ch in mantissa.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }
    (digits, exponent)
}
214
/// Lay out a digit run in fixed-point notation, given the power of ten of its leading digit.
///
/// A negative exponent yields `0.` followed by `-exponent - 1` zeros and then the digits. A
/// non-negative exponent puts `exponent + 1` digits before the point, padding with zeros and
/// appending `.0` when the run is too short to reach past the point. Callers only pass an
/// exponent in `-1..=6`.
fn fixed_notation(digits: &str, exponent: i32) -> String {
    let Ok(power) = usize::try_from(exponent) else {
        // Negative exponent: the value is below one.
        let padding = usize::try_from(-exponent - 1).unwrap_or(0);
        return format!("0.{}{digits}", "0".repeat(padding));
    };

    let integer_len = power + 1;
    match integer_len.checked_sub(digits.len()) {
        // The digits stop at or before the point: pad out the integer part.
        Some(padding) => format!("{digits}{}.0", "0".repeat(padding)),
        // The digits run past the point: split them around it.
        None => {
            let (whole, fraction) = digits.split_at(integer_len);
            format!("{whole}.{fraction}")
        }
    }
}
232
/// Lay out a digit run in scientific notation: the first digit, a point, the remaining digits
/// (`0` when none remain), then `e` and the exponent. A non-negative exponent carries no `+`.
fn scientific_notation(digits: &str, exponent: i32) -> String {
    let cut = digits.len().min(1);
    let lead = &digits[..cut];
    let tail = match &digits[cut..] {
        "" => "0",
        rest => rest,
    };
    format!("{lead}.{tail}e{exponent}")
}
244
245/// Serialize a JSON value to compact text, rendering numbers as [`json_number`] does. Object keys are
246/// emitted in sorted order (the parser stores them sorted), so the output is deterministic.
247fn json_render(value: &Value) -> String {
248    let mut out = String::new();
249    json_write(value, &mut out);
250    out
251}
252
253fn json_write(value: &Value, out: &mut String) {
254    match value {
255        Value::Number(number) => out.push_str(&json_number(number)),
256        Value::Array(items) => {
257            out.push('[');
258            for (index, item) in items.iter().enumerate() {
259                if index != 0 {
260                    out.push(',');
261                }
262                json_write(item, out);
263            }
264            out.push(']');
265        }
266        Value::Object(map) => {
267            out.push('{');
268            for (index, (key, item)) in map.iter().enumerate() {
269                if index != 0 {
270                    out.push(',');
271                }
272                out.push_str(&Value::String(key.clone()).to_string());
273                out.push(':');
274                json_write(item, out);
275            }
276            out.push('}');
277        }
278        other => out.push_str(&other.to_string()),
279    }
280}
281
282/// Convert one cell into its `Div`, or `None` for an unrecognized cell kind. Any image bytes the
283/// cell carries — a code cell's image outputs, a markdown cell's attachments — are lifted into
284/// `media`.
285fn cell_to_block(
286    cell: &Value,
287    language: &str,
288    options: &ReaderOptions,
289    media: &mut MediaBag,
290) -> Result<Option<Block>> {
291    let Some(kind) = cell.get("cell_type").and_then(Value::as_str) else {
292        return Ok(None);
293    };
294    let attr = cell_attr(cell, kind);
295    let block = match kind {
296        "markdown" => Block::Div(Box::new(attr), markdown_cell_blocks(cell, options, media)?),
297        "code" => Block::Div(Box::new(attr), code_cell_blocks(cell, language, media)),
298        "raw" => Block::Div(Box::new(attr), vec![raw_cell_block(cell)]),
299        _ => return Ok(None),
300    };
301    Ok(Some(block))
302}
303
304/// The cell's attributes: its `id`, the classes `cell` and the cell kind, then a code cell's
305/// `execution_count` followed by the cell's own metadata entries in key order.
306fn cell_attr(cell: &Value, kind: &str) -> Attr {
307    let id = cell
308        .get("id")
309        .and_then(Value::as_str)
310        .unwrap_or_default()
311        .to_owned();
312    let classes = vec!["cell".to_owned(), kind.to_owned()];
313    let mut attributes = Vec::new();
314    if kind == "code"
315        && let Some(count) = cell.get("execution_count").and_then(Value::as_i64)
316    {
317        attributes.push(("execution_count".to_owned(), count.to_string()));
318    }
319    if let Some(Value::Object(metadata)) = cell.get("metadata") {
320        for (key, value) in metadata {
321            attributes.push((key.clone(), attribute_value(value)));
322        }
323    }
324    Attr {
325        id: id.into(),
326        classes: classes.into_iter().map(Into::into).collect(),
327        attributes: attributes
328            .into_iter()
329            .map(|(k, v)| (k.into(), v.into()))
330            .collect(),
331    }
332}
333
334/// Render a JSON value as an attribute string. A non-string takes its compact JSON form, with numbers
335/// rendered as [`json_number`] does. A string keeps its own text, except that one which would
336/// otherwise read back as a number, a boolean, or the empty attribute — an all-digit run such as
337/// `007`, the literal `true`/`false`, or `""` — is wrapped in double quotes so the distinction
338/// between the string and the scalar survives the round trip.
339fn attribute_value(value: &Value) -> String {
340    match value {
341        Value::String(text)
342            if text.is_empty() || is_integer_literal(text) || text == "true" || text == "false" =>
343        {
344            format!("\"{text}\"")
345        }
346        Value::String(text) => text.clone(),
347        other => json_render(other),
348    }
349}
350
/// Whether `text` consists of one or more ASCII digits and nothing else (`^[0-9]+$`).
fn is_integer_literal(text: &str) -> bool {
    match text.as_bytes() {
        [] => false,
        bytes => bytes.iter().all(u8::is_ascii_digit),
    }
}
355
356/// A markdown cell's blocks: its source parsed as Markdown with the reader's extensions, then with
357/// `attachment:` image references rewritten to the cell-scoped name `<cell id>-<reference>`. A cell
358/// that carries no `id` field leaves the bare reference in place.
359fn markdown_cell_blocks(
360    cell: &Value,
361    options: &ReaderOptions,
362    media: &mut MediaBag,
363) -> Result<Vec<Block>> {
364    let source = multiline_text(cell.get("source"));
365    let mut markdown_options = ReaderOptions::default();
366    markdown_options.extensions = options.extensions;
367    // A notebook's markdown cells are written in the broad Markdown dialect (greedy paragraphs),
368    // not strict CommonMark: nested emphasis nests strong outside emph, a bare URI or email becomes
369    // a classed autolink, an ordered list's marker style and start are normalized unless the
370    // fancy-list and start-number extensions ask otherwise, and a raw HTML block carries no trailing
371    // newline.
372    markdown_options.greedy_paragraphs = true;
373    let mut blocks = CommonmarkReader.read(&source, &markdown_options)?.blocks;
374    let prefix = cell
375        .get("id")
376        .map(|id| format!("{}-", id.as_str().unwrap_or_default()));
377    capture_attachments(cell, prefix.as_deref(), media);
378    let prefix = prefix.as_deref().unwrap_or_default();
379    carta_core::walk::for_each_image_target(&mut blocks, &mut |target| {
380        if let Some(bare) = target.url.strip_prefix("attachment:") {
381            target.url = format!("{prefix}{bare}").into();
382        }
383    });
384    Ok(blocks)
385}
386
387/// Lift a markdown cell's inline attachments into the media bag. Each entry in the cell's
388/// `attachments` object maps a reference name to a MIME→payload bundle; its bytes are stored under
389/// the cell-scoped name (`<prefix><reference>`) the `attachment:` references resolve to, so a later
390/// extract or re-embed step finds them. The bundle's image representation is preferred; failing that,
391/// its first entry in key order is taken.
392fn capture_attachments(cell: &Value, prefix: Option<&str>, media: &mut MediaBag) {
393    let Some(Value::Object(attachments)) = cell.get("attachments") else {
394        return;
395    };
396    for (reference, bundle) in attachments {
397        let Value::Object(by_mime) = bundle else {
398            continue;
399        };
400        let chosen = by_mime
401            .iter()
402            .find(|(mime, _)| is_image_like(mime))
403            .or_else(|| by_mime.iter().next());
404        let Some((mime, payload)) = chosen else {
405            continue;
406        };
407        let name = match prefix {
408            Some(prefix) => format!("{prefix}{reference}"),
409            None => reference.clone(),
410        };
411        media.insert(name, Some(mime.clone()), decode_payload(mime, payload));
412    }
413}
414
415/// A code cell's blocks: a `CodeBlock` of its source tagged with the kernel language, then one
416/// block per renderable output.
417fn code_cell_blocks(cell: &Value, language: &str, media: &mut MediaBag) -> Vec<Block> {
418    let source = multiline_text(cell.get("source"));
419    let source_attr = Attr {
420        id: carta_ast::Text::default(),
421        classes: vec![language.into()],
422        attributes: Vec::new(),
423    };
424    let mut blocks = vec![Block::CodeBlock(Box::new(source_attr), source.into())];
425    if let Some(Value::Array(outputs)) = cell.get("outputs") {
426        for output in outputs {
427            if let Some(block) = output_to_block(output, media) {
428                blocks.push(block);
429            }
430        }
431    }
432    blocks
433}
434
435/// A raw cell's block: a `RawBlock` whose format is read from the cell's media type, or `ipynb`
436/// when none is declared. The media type is taken from `raw_mimetype`, falling back to `format` when
437/// the former is absent.
438fn raw_cell_block(cell: &Value) -> Block {
439    let source = multiline_text(cell.get("source"));
440    let metadata = cell.get("metadata");
441    let mime = metadata
442        .and_then(|metadata| metadata.get("raw_mimetype"))
443        .or_else(|| metadata.and_then(|metadata| metadata.get("format")))
444        .and_then(Value::as_str);
445    let format = mime.map_or_else(|| "ipynb".to_owned(), format_from_mime);
446    Block::RawBlock(Format(format.into()), source.into())
447}
448
449/// Convert one execution output into its `Div`, or `None` for an unrecognized output kind. An image
450/// output's bytes are lifted into `media`.
451fn output_to_block(output: &Value, media: &mut MediaBag) -> Option<Block> {
452    match output.get("output_type").and_then(Value::as_str)? {
453        "stream" => Some(stream_output(output)),
454        "execute_result" => Some(result_output(output, true, media)),
455        "display_data" => Some(result_output(output, false, media)),
456        "error" => Some(error_output(output)),
457        _ => None,
458    }
459}
460
461/// A `stream` output: a plain `CodeBlock` of the stream text inside a `Div` classed by the stream
462/// name (stdout or stderr). Terminal control sequences in the text are removed.
463fn stream_output(output: &Value) -> Block {
464    let name = output
465        .get("name")
466        .and_then(Value::as_str)
467        .unwrap_or("stdout");
468    let text = strip_ansi(&multiline_text(output.get("text")));
469    let attr = Attr {
470        id: carta_ast::Text::default(),
471        classes: vec!["output".into(), "stream".into(), name.into()],
472        attributes: Vec::new(),
473    };
474    Block::Div(
475        Box::new(attr),
476        vec![Block::CodeBlock(Box::default(), text.into())],
477    )
478}
479
480/// An `execute_result` or `display_data` output: the richest renderable bundle from its `data`,
481/// inside a `Div`. A result carries its `execution_count` as an attribute.
482fn result_output(output: &Value, is_result: bool, media: &mut MediaBag) -> Block {
483    let kind = if is_result {
484        "execute_result"
485    } else {
486        "display_data"
487    };
488    let mut attributes = Vec::new();
489    if is_result && let Some(count) = output.get("execution_count").and_then(Value::as_i64) {
490        attributes.push(("execution_count".to_owned(), count.to_string()));
491    }
492    let attr = Attr {
493        id: carta_ast::Text::default(),
494        classes: vec!["output".into(), kind.into()],
495        attributes: attributes
496            .into_iter()
497            .map(|(k, v)| (k.into(), v.into()))
498            .collect(),
499    };
500    Block::Div(
501        Box::new(attr),
502        data_to_blocks(output.get("data"), output.get("metadata"), media),
503    )
504}
505
506/// An `error` output: its traceback as a plain `CodeBlock` inside a `Div` carrying the exception
507/// name and value. Terminal control sequences in the traceback are removed.
508fn error_output(output: &Value) -> Block {
509    let ename = output
510        .get("ename")
511        .and_then(Value::as_str)
512        .unwrap_or_default()
513        .to_owned();
514    let evalue = output
515        .get("evalue")
516        .and_then(Value::as_str)
517        .unwrap_or_default()
518        .to_owned();
519    let traceback = match output.get("traceback") {
520        Some(Value::Array(lines)) => {
521            let joined = lines
522                .iter()
523                .filter_map(Value::as_str)
524                .collect::<Vec<_>>()
525                .join("\n");
526            format!("{joined}\n")
527        }
528        Some(Value::String(text)) => text.clone(),
529        _ => String::new(),
530    };
531    let attr = Attr {
532        id: carta_ast::Text::default(),
533        classes: vec!["output".into(), "error".into()],
534        attributes: vec![
535            ("ename".into(), ename.into()),
536            ("evalue".into(), evalue.into()),
537        ],
538    };
539    Block::Div(
540        Box::new(attr),
541        vec![Block::CodeBlock(
542            Box::default(),
543            strip_ansi(&traceback).into(),
544        )],
545    )
546}
547
548/// Pick the richest renderable representation from an output's `data` bundle. An image (or PDF)
549/// representation wins, taken in MIME-name order; otherwise structured JSON — `application/json` or
550/// any `+json` structured-syntax type — then plain text, HTML, LaTeX, and Markdown are tried in that
551/// order. Among several JSON representations the lowest MIME name is taken. An empty or absent bundle
552/// yields no blocks.
553fn data_to_blocks(
554    data: Option<&Value>,
555    metadata: Option<&Value>,
556    media: &mut MediaBag,
557) -> Vec<Block> {
558    let Some(Value::Object(data)) = data else {
559        return Vec::new();
560    };
561    if let Some((mime, value)) = data.iter().find(|(mime, _)| is_image_like(mime)) {
562        return vec![image_block(mime, value, metadata, media)];
563    }
564    if let Some((mime, value)) = data.iter().find(|(mime, _)| is_json_like(mime)) {
565        return vec![non_image_block(mime, value)];
566    }
567    for mime in ["text/plain", "text/html", "text/latex", "text/markdown"] {
568        if let Some(value) = data.get(mime) {
569            return vec![non_image_block(mime, value)];
570        }
571    }
572    Vec::new()
573}
574
575/// Render a non-image output representation. Structured JSON becomes a `json`-classed code block of
576/// its compact form; plain text becomes a code block (control sequences removed); HTML, LaTeX, and
577/// Markdown become raw passthrough blocks.
578fn non_image_block(mime: &str, value: &Value) -> Block {
579    if is_json_like(mime) {
580        return Block::CodeBlock(
581            Box::new(Attr {
582                id: carta_ast::Text::default(),
583                classes: vec!["json".into()],
584                attributes: Vec::new(),
585            }),
586            json_render(value).into(),
587        );
588    }
589    match mime {
590        "text/html" => Block::RawBlock(Format("html".into()), multiline_text(Some(value)).into()),
591        "text/latex" => Block::RawBlock(Format("latex".into()), multiline_text(Some(value)).into()),
592        "text/markdown" => Block::RawBlock(
593            Format("markdown".into()),
594            multiline_text(Some(value)).into(),
595        ),
596        // The fallthrough is plain text; the preference list only routes the cases above here.
597        _ => Block::CodeBlock(
598            Box::default(),
599            strip_ansi(&multiline_text(Some(value))).into(),
600        ),
601    }
602}
603
604/// A `Para` holding a single image whose URL is the content-addressed file name of the decoded
605/// payload, whose bytes are lifted into `media` under that same name. Any entry the output's
606/// `metadata` records under the chosen MIME type becomes an attribute on the image.
607fn image_block(mime: &str, value: &Value, metadata: Option<&Value>, media: &mut MediaBag) -> Block {
608    let bytes = decode_payload(mime, value);
609    let name = content_addressed_name(mime, &bytes);
610    media.insert(name.clone(), Some(mime.to_owned()), bytes);
611    Block::Para(vec![Inline::Image(
612        Box::new(image_attr(mime, metadata)),
613        Vec::new(),
614        Box::new(Target {
615            url: name.into(),
616            title: carta_ast::Text::default(),
617        }),
618    )])
619}
620
621/// The raw bytes of an image payload. An SVG representation is its own UTF-8 source; every other type
622/// is base64-decoded, falling back to the raw source bytes when the payload is not well-formed
623/// base64.
624fn decode_payload(mime: &str, value: &Value) -> Vec<u8> {
625    let payload = multiline_text(Some(value));
626    if mime == "image/svg+xml" {
627        payload.into_bytes()
628    } else {
629        base64_decode(&payload).unwrap_or_else(|| payload.into_bytes())
630    }
631}
632
633/// The image attributes drawn from an output's `metadata`: every key under the entry named for the
634/// chosen MIME type, in sorted order, each value rendered as an attribute string.
635fn image_attr(mime: &str, metadata: Option<&Value>) -> Attr {
636    let mut attributes = Vec::new();
637    if let Some(Value::Object(by_mime)) = metadata
638        && let Some(Value::Object(entry)) = by_mime.get(mime)
639    {
640        for (key, value) in entry {
641            attributes.push((key.clone(), attribute_value(value)));
642        }
643    }
644    Attr {
645        id: carta_ast::Text::default(),
646        classes: Vec::new(),
647        attributes: attributes
648            .into_iter()
649            .map(|(k, v)| (k.into(), v.into()))
650            .collect(),
651    }
652}
653
/// Whether a MIME type names a payload that is referenced as a file: `application/pdf` or any
/// type starting with `image/`. The comparison is case-sensitive.
fn is_image_like(mime: &str) -> bool {
    matches!(mime, "application/pdf") || mime.strip_prefix("image/").is_some()
}
659
/// Whether a MIME type names structured JSON: exactly `application/json`, or any type ending in
/// the `+json` structured-syntax suffix (for example `application/geo+json`).
fn is_json_like(mime: &str) -> bool {
    match mime {
        "application/json" => true,
        other => other.strip_suffix("+json").is_some(),
    }
}
665
/// Map a raw cell's media type to a raw-passthrough format name.
///
/// A handful of media types map to a short format name (note that `application/pdf` maps to
/// `latex`); any other input is returned verbatim.
fn format_from_mime(mime: &str) -> String {
    const SHORT_NAMES: [(&str, &str); 7] = [
        ("text/html", "html"),
        ("text/latex", "latex"),
        ("application/pdf", "latex"),
        ("text/markdown", "markdown"),
        ("text/restructuredtext", "rst"),
        ("text/x-rst", "rst"),
        ("text/asciidoc", "asciidoc"),
    ];

    SHORT_NAMES
        .iter()
        .find(|(media_type, _)| *media_type == mime)
        .map_or(mime, |&(_, short)| short)
        .to_owned()
}
679
680/// Join a JSON value that is either a single string or an array of string lines into one string.
681/// Array elements are concatenated as stored (each line carries its own trailing newline).
682fn multiline_text(value: Option<&Value>) -> String {
683    match value {
684        Some(Value::String(text)) => text.clone(),
685        Some(Value::Array(lines)) => lines.iter().filter_map(Value::as_str).collect(),
686        _ => String::new(),
687    }
688}
689
/// Remove ANSI terminal control sequences from text.
///
/// A control sequence opens with the introducer `ESC [`, continues with any number of parameter
/// and intermediate bytes (`0x20..=0x3F`), and closes with a single final byte (`0x40..=0x7E`);
/// the whole sequence is dropped. An escape that does not open such a sequence is dropped on its
/// own.
///
/// A sequence that is cut short is abandoned at the first character that can be neither a
/// parameter, an intermediate, nor a final byte, and that character is kept. A truncated
/// sequence therefore never swallows the line break or visible text that follows it. A sequence
/// still open at the end of the text is dropped entirely.
fn strip_ansi(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch != '\u{1b}' {
            out.push(ch);
            continue;
        }
        // Only `ESC [` opens a control sequence; a lone escape is dropped by itself.
        if chars.next_if_eq(&'[').is_none() {
            continue;
        }
        // Peek before consuming so a character outside the sequence grammar stays in the
        // stream and is handled by the outer loop (which also lets a fresh ESC restart).
        while let Some(&next) = chars.peek() {
            match next {
                // Parameter and intermediate bytes belong to the sequence.
                '\u{20}'..='\u{3f}' => {
                    chars.next();
                }
                // The final byte closes the sequence.
                '\u{40}'..='\u{7e}' => {
                    chars.next();
                    break;
                }
                // Anything else means the sequence was malformed or truncated.
                _ => break,
            }
        }
    }
    out
}
711
712#[cfg(test)]
713mod tests {
714    use super::*;
715    use carta_core::MediaBag;
716
717    fn read(input: &str) -> Document {
718        IpynbReader
719            .read(input, &ReaderOptions::default())
720            .expect("notebook input parses")
721    }
722
723    fn read_with(input: &str, extensions: carta_core::Extensions) -> Document {
724        let mut options = ReaderOptions::default();
725        options.extensions = extensions;
726        IpynbReader.read(input, &options).expect("notebook parses")
727    }
728
729    fn read_media(input: &str) -> (Document, MediaBag) {
730        IpynbReader
731            .read_media(input, &ReaderOptions::default())
732            .expect("notebook input parses")
733    }
734
735    fn jupyter(document: &Document) -> &BTreeMap<carta_ast::Text, MetaValue> {
736        match document.meta.get("jupyter") {
737            Some(MetaValue::MetaMap(map)) => map,
738            _ => panic!("expected a jupyter metadata map"),
739        }
740    }
741
742    #[test]
743    fn empty_notebook_exposes_only_version_metadata() {
744        let document = read(r#"{"cells": [], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#);
745        assert!(document.blocks.is_empty());
746        let map = jupyter(&document);
747        assert_eq!(
748            map.get("nbformat"),
749            Some(&MetaValue::MetaString("4".to_owned().into()))
750        );
751        assert_eq!(
752            map.get("nbformat_minor"),
753            Some(&MetaValue::MetaString("5".to_owned().into()))
754        );
755    }
756
757    #[test]
758    fn missing_minor_version_defaults_to_zero() {
759        let document = read(r#"{"cells": [], "metadata": {}, "nbformat": 4}"#);
760        assert_eq!(
761            jupyter(&document).get("nbformat_minor"),
762            Some(&MetaValue::MetaString("0".to_owned().into()))
763        );
764    }
765
766    #[test]
767    fn metadata_scalars_normalize_and_recurse() {
768        let document = read(
769            r#"{"cells": [], "metadata": {"afloat": 3.0, "aint": 7, "abool": true,
770               "anull": null, "alist": [1, "two", 3.0], "amap": {"z": 1, "a": 2.0}},
771               "nbformat": 4, "nbformat_minor": 5}"#,
772        );
773        let map = jupyter(&document);
774        assert_eq!(
775            map.get("afloat"),
776            Some(&MetaValue::MetaString("3".to_owned().into()))
777        );
778        assert_eq!(
779            map.get("aint"),
780            Some(&MetaValue::MetaString("7".to_owned().into()))
781        );
782        assert_eq!(map.get("abool"), Some(&MetaValue::MetaBool(true)));
783        assert_eq!(
784            map.get("anull"),
785            Some(&MetaValue::MetaString(carta_ast::Text::default()))
786        );
787        assert_eq!(
788            map.get("alist"),
789            Some(&MetaValue::MetaList(vec![
790                MetaValue::MetaString("1".to_owned().into()),
791                MetaValue::MetaString("two".to_owned().into()),
792                MetaValue::MetaString("3".to_owned().into()),
793            ]))
794        );
795        let Some(MetaValue::MetaMap(nested)) = map.get("amap") else {
796            panic!("expected a nested map");
797        };
798        assert_eq!(
799            nested.get("a"),
800            Some(&MetaValue::MetaString("2".to_owned().into()))
801        );
802        assert_eq!(
803            nested.get("z"),
804            Some(&MetaValue::MetaString("1".to_owned().into()))
805        );
806    }
807
808    #[test]
809    fn markdown_cell_becomes_a_div_with_parsed_blocks() {
810        let document = read(
811            r##"{"cells": [{"cell_type": "markdown", "id": "m1", "metadata": {},
812               "source": ["# Title\n", "\n", "text"]}],
813               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"##,
814        );
815        let Some(Block::Div(attr, blocks)) = document.blocks.first() else {
816            panic!("expected a cell div");
817        };
818        assert_eq!(attr.id, "m1");
819        assert_eq!(attr.classes, vec!["cell".to_owned(), "markdown".to_owned()]);
820        assert!(matches!(blocks.first(), Some(Block::Header(1, _, _))));
821        assert!(matches!(blocks.get(1), Some(Block::Para(_))));
822    }
823
824    #[test]
825    fn markdown_cell_honors_forwarded_extensions() {
826        // A pipe table is recognized only when the table extension is on, confirming the reader's
827        // extensions reach the embedded Markdown.
828        let input = r#"{"cells": [{"cell_type": "markdown", "metadata": {},
829            "source": ["| a | b |\n|---|---|\n| 1 | 2 |\n"]}],
830            "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#;
831        let with_tables = read_with(input, carta_core::presets::GFM);
832        let Some(Block::Div(_, blocks)) = with_tables.blocks.first() else {
833            panic!("expected a cell div");
834        };
835        assert!(matches!(blocks.first(), Some(Block::Table(_))));
836
837        let strict = read_with(input, carta_core::Extensions::empty());
838        let Some(Block::Div(_, blocks)) = strict.blocks.first() else {
839            panic!("expected a cell div");
840        };
841        assert!(!matches!(blocks.first(), Some(Block::Table(_))));
842    }
843
844    #[test]
845    fn markdown_attachment_prefix_is_stripped_from_images() {
846        let document = read(
847            r#"{"cells": [{"cell_type": "markdown", "metadata": {},
848               "attachments": {"a.png": {"image/png": "x"}},
849               "source": ["![alt](attachment:a.png)"]}],
850               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
851        );
852        let Some(Block::Div(_, blocks)) = document.blocks.first() else {
853            panic!("expected a cell div");
854        };
855        let Some(Block::Para(inlines)) = blocks.first() else {
856            panic!("expected a paragraph");
857        };
858        let Some(Inline::Image(_, _, target)) = inlines.first() else {
859            panic!("expected an image");
860        };
861        // A cell without an `id` leaves the bare reference in place.
862        assert_eq!(target.url, "a.png");
863    }
864
865    #[test]
866    fn markdown_attachment_reference_is_scoped_to_the_cell_id() {
867        let document = read(
868            r#"{"cells": [{"cell_type": "markdown", "id": "cell9", "metadata": {},
869               "attachments": {"a.png": {"image/png": "x"}},
870               "source": ["![alt](attachment:a.png)"]}],
871               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
872        );
873        let Some(Block::Div(_, blocks)) = document.blocks.first() else {
874            panic!("expected a cell div");
875        };
876        let Some(Block::Para(inlines)) = blocks.first() else {
877            panic!("expected a paragraph");
878        };
879        let Some(Inline::Image(_, _, target)) = inlines.first() else {
880            panic!("expected an image");
881        };
882        // A cell with an `id` scopes the reference to that cell.
883        assert_eq!(target.url, "cell9-a.png");
884    }
885
886    #[test]
887    fn code_cell_emits_source_then_outputs() {
888        let document = read(
889            r#"{"cells": [{"cell_type": "code", "metadata": {"scrolled": true},
890               "execution_count": 5, "source": ["import os\n", "print(1)"],
891               "outputs": [
892                 {"output_type": "stream", "name": "stdout", "text": ["hello\n"]},
893                 {"output_type": "execute_result", "execution_count": 5,
894                  "data": {"text/plain": ["42"]}, "metadata": {}},
895                 {"output_type": "error", "ename": "E", "evalue": "v",
896                  "traceback": ["line1", "line2"]}
897               ]}],
898               "metadata": {"kernelspec": {"language": "python"}},
899               "nbformat": 4, "nbformat_minor": 5}"#,
900        );
901        let Some(Block::Div(attr, blocks)) = document.blocks.first() else {
902            panic!("expected a cell div");
903        };
904        assert_eq!(
905            attr.attributes,
906            vec![
907                ("execution_count".into(), "5".into()),
908                ("scrolled".into(), "true".into()),
909            ]
910        );
911        // Source code block tagged with the language.
912        let Some(Block::CodeBlock(source_attr, source)) = blocks.first() else {
913            panic!("expected a source code block");
914        };
915        assert_eq!(source_attr.classes, vec!["python".to_owned()]);
916        assert_eq!(source, "import os\nprint(1)");
917
918        // Stream output.
919        let Some(Block::Div(stream_attr, stream_body)) = blocks.get(1) else {
920            panic!("expected a stream div");
921        };
922        assert_eq!(
923            stream_attr.classes,
924            vec![
925                "output".to_owned(),
926                "stream".to_owned(),
927                "stdout".to_owned()
928            ]
929        );
930        assert!(matches!(
931            stream_body.first(),
932            Some(Block::CodeBlock(_, text)) if text == "hello\n"
933        ));
934
935        // execute_result carries its execution count and renders text/plain as a code block.
936        let Some(Block::Div(result_attr, result_body)) = blocks.get(2) else {
937            panic!("expected a result div");
938        };
939        assert_eq!(
940            result_attr.classes,
941            vec!["output".to_owned(), "execute_result".to_owned()]
942        );
943        assert_eq!(
944            result_attr.attributes,
945            vec![("execution_count".into(), "5".into())]
946        );
947        assert!(matches!(
948            result_body.first(),
949            Some(Block::CodeBlock(_, text)) if text == "42"
950        ));
951
952        // error renders its joined traceback with a trailing newline.
953        let Some(Block::Div(error_attr, error_body)) = blocks.get(3) else {
954            panic!("expected an error div");
955        };
956        assert_eq!(
957            error_attr.attributes,
958            vec![("ename".into(), "E".into()), ("evalue".into(), "v".into()),]
959        );
960        assert!(matches!(
961            error_body.first(),
962            Some(Block::CodeBlock(_, text)) if text == "line1\nline2\n"
963        ));
964    }
965
966    #[test]
967    fn null_execution_count_yields_no_attribute() {
968        let document = read(
969            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": null,
970               "source": [], "outputs": []}],
971               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
972        );
973        let Some(Block::Div(attr, _)) = document.blocks.first() else {
974            panic!("expected a cell div");
975        };
976        assert!(attr.attributes.is_empty());
977    }
978
979    #[test]
980    fn image_output_is_content_addressed() {
981        // PNG bytes from base64 are hashed; SVG is hashed as its own text.
982        let document = read(
983            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
984               "source": [], "outputs": [
985                 {"output_type": "display_data", "data": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="},
986                  "metadata": {}}]}],
987               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
988        );
989        let Some(Block::Div(_, body)) = first_output(&document) else {
990            panic!("expected an output div");
991        };
992        let Some(Block::Para(inlines)) = body.first() else {
993            panic!("expected a paragraph");
994        };
995        let Some(Inline::Image(_, _, target)) = inlines.first() else {
996            panic!("expected an image");
997        };
998        assert_eq!(target.url, "22f545ac6b50163ce39bac49094c3f64e0858403.png");
999
1000        let svg = read(
1001            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1002               "source": [], "outputs": [
1003                 {"output_type": "display_data", "data": {"image/svg+xml": ["<svg/>"]},
1004                  "metadata": {}}]}],
1005               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1006        );
1007        let Some(Block::Div(_, body)) = first_output(&svg) else {
1008            panic!("expected an output div");
1009        };
1010        let Some(Block::Para(inlines)) = body.first() else {
1011            panic!("expected a paragraph");
1012        };
1013        let Some(Inline::Image(_, _, target)) = inlines.first() else {
1014            panic!("expected an image");
1015        };
1016        assert_eq!(target.url, "1c3ba3b813e1080e9721846f23a21c09e5c3fd27.svg");
1017    }
1018
1019    #[test]
1020    fn image_output_bytes_are_lifted_into_the_media_bag() {
1021        let (document, media) = read_media(
1022            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1023               "source": [], "outputs": [
1024                 {"output_type": "display_data", "data": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="},
1025                  "metadata": {}}]}],
1026               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1027        );
1028        // The tree references the image by its content-addressed name...
1029        let Some(Block::Div(_, body)) = first_output(&document) else {
1030            panic!("expected an output div");
1031        };
1032        let Some(Block::Para(inlines)) = body.first() else {
1033            panic!("expected a paragraph");
1034        };
1035        let Some(Inline::Image(_, _, target)) = inlines.first() else {
1036            panic!("expected an image");
1037        };
1038        let name = "22f545ac6b50163ce39bac49094c3f64e0858403.png";
1039        assert_eq!(target.url, name);
1040        // ...and the bag holds exactly that name, with the decoded bytes and their MIME type.
1041        assert_eq!(media.len(), 1);
1042        let item = media.get(name).expect("image is in the bag");
1043        assert_eq!(item.mime.as_deref(), Some("image/png"));
1044        assert_eq!(
1045            item.bytes,
1046            carta_core::media::base64_decode("iVBORw0KGgoAAAANSUhEUg==").unwrap()
1047        );
1048    }
1049
1050    #[test]
1051    fn svg_output_is_stored_as_its_source_bytes() {
1052        let (_, media) = read_media(
1053            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1054               "source": [], "outputs": [
1055                 {"output_type": "display_data", "data": {"image/svg+xml": ["<svg/>"]},
1056                  "metadata": {}}]}],
1057               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1058        );
1059        let name = "1c3ba3b813e1080e9721846f23a21c09e5c3fd27.svg";
1060        let item = media.get(name).expect("svg is in the bag");
1061        assert_eq!(item.mime.as_deref(), Some("image/svg+xml"));
1062        assert_eq!(item.bytes, b"<svg/>");
1063    }
1064
1065    #[test]
1066    fn identical_image_outputs_share_one_bag_entry() {
1067        let (_, media) = read_media(
1068            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1069               "source": [], "outputs": [
1070                 {"output_type": "display_data", "data": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="},
1071                  "metadata": {}},
1072                 {"output_type": "display_data", "data": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="},
1073                  "metadata": {}}]}],
1074               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1075        );
1076        // Content addressing means equal bytes collapse to a single entry.
1077        assert_eq!(media.len(), 1);
1078    }
1079
1080    #[test]
1081    fn markdown_attachment_bytes_are_lifted_into_the_media_bag() {
1082        let (_, media) = read_media(
1083            r#"{"cells": [{"cell_type": "markdown", "id": "cell9", "metadata": {},
1084               "attachments": {"a.png": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="}},
1085               "source": ["![alt](attachment:a.png)"]}],
1086               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1087        );
1088        // The attachment is keyed by the same cell-scoped name the image reference resolves to.
1089        let item = media.get("cell9-a.png").expect("attachment is in the bag");
1090        assert_eq!(item.mime.as_deref(), Some("image/png"));
1091        assert_eq!(
1092            item.bytes,
1093            carta_core::media::base64_decode("iVBORw0KGgoAAAANSUhEUg==").unwrap()
1094        );
1095    }
1096
1097    #[test]
1098    fn attachment_without_a_cell_id_uses_the_bare_reference() {
1099        let (_, media) = read_media(
1100            r#"{"cells": [{"cell_type": "markdown", "metadata": {},
1101               "attachments": {"a.png": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="}},
1102               "source": ["![alt](attachment:a.png)"]}],
1103               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1104        );
1105        assert!(media.contains("a.png"));
1106    }
1107
1108    #[test]
1109    fn image_wins_over_text_and_smaller_mime_wins_among_images() {
1110        let document = read(
1111            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1112               "source": [], "outputs": [
1113                 {"output_type": "display_data",
1114                  "data": {"image/png": "iVBORw0KGgoAAAANSUhEUg==", "image/jpeg": "iVBORw0KGgoAAAANSUhEUg==",
1115                           "text/plain": ["p"]},
1116                  "metadata": {}}]}],
1117               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1118        );
1119        let Some(Block::Div(_, body)) = first_output(&document) else {
1120            panic!("expected an output div");
1121        };
1122        let Some(Block::Para(inlines)) = body.first() else {
1123            panic!("expected a paragraph");
1124        };
1125        let Some(Inline::Image(_, _, target)) = inlines.first() else {
1126            panic!("expected an image");
1127        };
1128        // image/jpeg sorts before image/png and both before text/plain.
1129        assert_eq!(target.url, "22f545ac6b50163ce39bac49094c3f64e0858403.jpg");
1130    }
1131
1132    #[test]
1133    fn image_output_metadata_becomes_sorted_attributes() {
1134        let document = read(
1135            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1136               "source": [], "outputs": [
1137                 {"output_type": "display_data", "data": {"image/png": "iVBORw0KGgoAAAANSUhEUg=="},
1138                  "metadata": {"image/png": {"width": 100, "height": 50, "needs_background": "light"}}}]}],
1139               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1140        );
1141        let Some(Block::Div(_, body)) = first_output(&document) else {
1142            panic!("expected an output div");
1143        };
1144        let Some(Block::Para(inlines)) = body.first() else {
1145            panic!("expected a paragraph");
1146        };
1147        let Some(Inline::Image(attr, _, _)) = inlines.first() else {
1148            panic!("expected an image");
1149        };
1150        assert_eq!(
1151            attr.attributes,
1152            vec![
1153                ("height".into(), "50".into()),
1154                ("needs_background".into(), "light".into()),
1155                ("width".into(), "100".into()),
1156            ]
1157        );
1158    }
1159
1160    #[test]
1161    fn structured_json_output_is_compact_and_sorted() {
1162        let document = read(
1163            r#"{"cells": [{"cell_type": "code", "metadata": {}, "execution_count": 1,
1164               "source": [], "outputs": [
1165                 {"output_type": "display_data", "data": {"application/json": {"z": 1, "a": 2.0}},
1166                  "metadata": {}}]}],
1167               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1168        );
1169        let Some(Block::Div(_, body)) = first_output(&document) else {
1170            panic!("expected an output div");
1171        };
1172        let Some(Block::CodeBlock(attr, text)) = body.first() else {
1173            panic!("expected a code block");
1174        };
1175        assert_eq!(attr.classes, vec!["json".to_owned()]);
1176        assert_eq!(text, r#"{"a":2.0,"z":1}"#);
1177    }
1178
1179    #[test]
1180    fn raw_cell_maps_format_to_writer_name() {
1181        let document = read(
1182            r#"{"cells": [
1183                 {"cell_type": "raw", "metadata": {"format": "text/html"}, "source": ["<b>x</b>"]},
1184                 {"cell_type": "raw", "metadata": {}, "source": ["plain"]}],
1185               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1186        );
1187        let Some(Block::Div(attr, body)) = document.blocks.first() else {
1188            panic!("expected a raw cell div");
1189        };
1190        assert_eq!(attr.attributes, vec![("format".into(), "text/html".into())]);
1191        assert!(matches!(
1192            body.first(),
1193            Some(Block::RawBlock(Format(name), text)) if name == "html" && text == "<b>x</b>"
1194        ));
1195        // No declared format falls back to the notebook's own format name.
1196        let Some(Block::Div(_, body)) = document.blocks.get(1) else {
1197            panic!("expected a raw cell div");
1198        };
1199        assert!(matches!(
1200            body.first(),
1201            Some(Block::RawBlock(Format(name), _)) if name == "ipynb"
1202        ));
1203    }
1204
1205    #[test]
1206    fn unknown_cell_kinds_are_dropped() {
1207        let document = read(
1208            r#"{"cells": [{"cell_type": "heading", "level": 2, "metadata": {}, "source": ["H"]}],
1209               "metadata": {}, "nbformat": 4, "nbformat_minor": 5}"#,
1210        );
1211        assert!(document.blocks.is_empty());
1212    }
1213
1214    #[test]
1215    fn terminal_control_sequences_are_removed_from_text_outputs() {
1216        // A control byte is invalid inside a JSON string, so the escape is carried in its JSON
1217        // numeric form. The escape token is assembled from a backslash char here so this source
1218        // holds no literal control byte: a backslash followed by the escape code's hex digits.
1219        let esc = format!("{}u001b", '\\');
1220        let input = format!(
1221            r#"{{"cells": [{{"cell_type": "code", "metadata": {{}}, "execution_count": 1,
1222               "source": [], "outputs": [
1223                 {{"output_type": "stream", "name": "stdout",
1224                  "text": ["{esc}[31mred{esc}[0m"]}}]}}],
1225               "metadata": {{}}, "nbformat": 4, "nbformat_minor": 5}}"#
1226        );
1227        let document = read(&input);
1228        let Some(Block::Div(_, body)) = first_output(&document) else {
1229            panic!("expected an output div");
1230        };
1231        assert!(matches!(
1232            body.first(),
1233            Some(Block::CodeBlock(_, text)) if text == "red"
1234        ));
1235    }
1236
1237    #[test]
1238    fn malformed_input_is_an_error_not_a_panic() {
1239        assert!(
1240            IpynbReader
1241                .read("not json", &ReaderOptions::default())
1242                .is_err()
1243        );
1244        assert!(IpynbReader.read("", &ReaderOptions::default()).is_err());
1245    }
1246
1247    #[test]
1248    fn pre_v4_notebook_is_an_error_not_a_panic() {
1249        let result = IpynbReader.read(
1250            r#"{"nbformat": 3, "nbformat_minor": 0, "worksheets": []}"#,
1251            &ReaderOptions::default(),
1252        );
1253        assert!(matches!(result, Err(Error::UnsupportedFormat(_))));
1254    }
1255
1256    /// The body of the first output div of the first code cell.
1257    fn first_output(document: &Document) -> Option<&Block> {
1258        let Some(Block::Div(_, blocks)) = document.blocks.first() else {
1259            return None;
1260        };
1261        blocks.get(1)
1262    }
1263}