Skip to main content

nbformat/
lib.rs

1pub mod legacy;
2pub mod v3;
3pub mod v4;
4
5use serde::Serialize as _;
6use thiserror::Error;
7
/// Errors returned by this module's notebook parsing and serialization
/// functions.
#[derive(Error, Debug)]
pub enum NotebookError {
    /// The notebook's `(nbformat, nbformat_minor)` pair is not supported
    /// for the requested operation. Fields are `(major, minor)`.
    #[error("Unsupported notebook version: {0}.{1}")]
    UnsupportedVersion(i32, i32),
    /// A `serde_json` operation failed. Converted automatically from
    /// `serde_json::Error` via `#[from]`, so `?` can be used on serde calls.
    #[error("JSON parsing error: {0}")]
    JsonError(#[from] serde_json::Error),
    /// A free-form validation failure; the string is the human-readable
    /// description.
    #[error("Validation error: {0}")]
    ValidationError(String),
}
17
/// A v4.5 spec violation detected during parse.
///
/// Currently only `MissingCellId` is emitted; the enum is
/// `#[non_exhaustive]` so variants can be added later without breaking
/// downstream `match` expressions (they must already carry a wildcard arm).
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum Quirk {
    /// A 4.5 cell lacked a required `id` field. An `id` that is absent,
    /// not a JSON string, or the empty string all count as missing.
    /// `cell_index` is the cell's position in the on-disk `cells` array.
    MissingCellId { cell_index: usize },
}
29
/// A v4.5 notebook that violated the 4.5 spec on load.
///
/// Missing cell ids have already been filled with fresh UUIDs by the
/// lenient deserializer — `notebook` is safe to inspect, but the bytes
/// on disk did not carry these ids. Callers must explicitly promote
/// this via [`V4Quirks::repair`] before the result is considered a
/// spec-compliant `v4::Notebook`.
///
/// Both fields are private: the notebook is only reachable through
/// [`V4Quirks::notebook`] (borrow) or [`V4Quirks::repair`] (consume).
#[derive(Debug, Clone)]
pub struct V4Quirks {
    /// The deserialized notebook, with fabricated ids already in place.
    notebook: v4::Notebook,
    /// Every violation found on the raw JSON, in document order.
    quirks: Vec<Quirk>,
}
42
43impl V4Quirks {
44    /// The quirks detected during parse, in document order.
45    pub fn quirks(&self) -> &[Quirk] {
46        &self.quirks
47    }
48
49    /// Borrow the parsed notebook. Fabricated cell ids are already
50    /// present in the returned reference.
51    pub fn notebook(&self) -> &v4::Notebook {
52        &self.notebook
53    }
54
55    /// Consume and promote to a valid `v4::Notebook`.
56    ///
57    /// Because the lenient deserializer already filled missing ids
58    /// with fresh UUIDs, this is a type-system promotion, not a
59    /// runtime mutation. The fabricated ids become authoritative.
60    /// Callers that want stable ids across future loads should
61    /// persist the repaired notebook back to disk.
62    pub fn repair(self) -> v4::Notebook {
63        self.notebook
64    }
65}
66
/// A parsed notebook, tagged by the format family it was loaded as.
///
/// `#[non_exhaustive]`: code outside this crate must include a wildcard
/// arm when matching.
#[derive(Debug)]
#[non_exhaustive]
pub enum Notebook {
    /// An nbformat 4.5 notebook for which no quirks were detected.
    V4(v4::Notebook),
    /// An nbformat 4.5 notebook with at least one detected quirk; see
    /// [`V4Quirks`].
    V4QuirksMode(V4Quirks),
    /// An nbformat 4.0–4.4 notebook.
    Legacy(legacy::Notebook),
    /// An nbformat 3.x notebook.
    V3(v3::Notebook),
}
75
76/// Walk a raw v4.5 notebook value and report spec violations that
77/// the lenient deserializer would otherwise hide. This runs BEFORE
78/// serde deserialization because the `default_cell_id` fallback
79/// makes fabricated cell ids indistinguishable from real ones
80/// after the fact.
81fn detect_v45_quirks(value: &serde_json::Value) -> Vec<Quirk> {
82    let mut quirks = Vec::new();
83
84    let Some(cells) = value.get("cells").and_then(|v| v.as_array()) else {
85        return quirks;
86    };
87
88    for (cell_index, cell) in cells.iter().enumerate() {
89        let has_non_empty_id = cell
90            .get("id")
91            .and_then(|v| v.as_str())
92            .map(|s| !s.is_empty())
93            .unwrap_or(false);
94
95        if !has_non_empty_id {
96            quirks.push(Quirk::MissingCellId { cell_index });
97        }
98    }
99
100    quirks
101}
102
103pub fn parse_notebook(json: &str) -> Result<Notebook, NotebookError> {
104    let value: serde_json::Value = serde_json::from_str(json)?;
105    let nbformat = value["nbformat"].as_i64().unwrap_or(0) as i32;
106    let nbformat_minor = value["nbformat_minor"].as_i64().unwrap_or(0) as i32;
107
108    match (nbformat, nbformat_minor) {
109        (4, 5) => {
110            let quirks = detect_v45_quirks(&value);
111            let notebook = serde_json::from_value::<v4::Notebook>(value)?;
112            if quirks.is_empty() {
113                Ok(Notebook::V4(notebook))
114            } else {
115                Ok(Notebook::V4QuirksMode(V4Quirks { notebook, quirks }))
116            }
117        }
118        (4, 0) | (4, 1) | (4, 2) | (4, 3) | (4, 4) => Ok(Notebook::Legacy(
119            serde_json::from_value::<legacy::Notebook>(value)?,
120        )),
121        (3, _) => Ok(Notebook::V3(serde_json::from_value::<v3::Notebook>(value)?)),
122        _ => Err(NotebookError::UnsupportedVersion(nbformat, nbformat_minor)),
123    }
124}
125
126/// Recursively rebuild every `Value::Object` with its keys in sorted order.
127///
128/// This mirrors Python `nbformat.write`'s use of `json.dumps(..., sort_keys=True)`.
129/// We do this explicitly rather than relying on serde_json's internal map type
130/// because the `preserve_order` feature — which some downstream workspaces
131/// enable — switches the `Map` backing from `BTreeMap` (sorted) to `IndexMap`
132/// (insertion order). Applying the sort ourselves produces identical output
133/// regardless of that feature flag.
134fn sort_value_keys(value: serde_json::Value) -> serde_json::Value {
135    match value {
136        serde_json::Value::Object(map) => {
137            let mut entries: Vec<(String, serde_json::Value)> = map.into_iter().collect();
138            entries.sort_by(|a, b| a.0.cmp(&b.0));
139            let mut sorted = serde_json::Map::new();
140            for (k, v) in entries {
141                sorted.insert(k, sort_value_keys(v));
142            }
143            serde_json::Value::Object(sorted)
144        }
145        serde_json::Value::Array(items) => {
146            serde_json::Value::Array(items.into_iter().map(sort_value_keys).collect())
147        }
148        other => other,
149    }
150}
151
152pub fn serialize_notebook(notebook: &Notebook) -> Result<String, NotebookError> {
153    match notebook {
154        Notebook::V4(notebook) => {
155            let value = sort_value_keys(serde_json::to_value(notebook)?);
156            let mut buf = Vec::new();
157            let formatter = serde_json::ser::PrettyFormatter::with_indent(b" ");
158            let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter);
159            value.serialize(&mut ser)?;
160
161            // Append a newline to the buffer to match the python implementation of nbformat
162            buf.append(&mut b"\n".to_vec());
163
164            let notebook_json = String::from_utf8(buf)
165                .map_err(|e| NotebookError::ValidationError(e.to_string()))?;
166
167            Ok(notebook_json)
168        }
169        Notebook::V4QuirksMode(_) => Err(NotebookError::ValidationError(
170            "v4.5 notebook has quirks — call V4Quirks::repair() before serializing".to_string(),
171        )),
172        Notebook::Legacy(notebook) => Err(NotebookError::UnsupportedVersion(
173            notebook.nbformat,
174            notebook.nbformat_minor,
175        )),
176        Notebook::V3(notebook) => Err(NotebookError::UnsupportedVersion(
177            notebook.nbformat,
178            notebook.nbformat_minor.unwrap_or(0),
179        )),
180    }
181}
182
183pub fn upgrade_legacy_notebook(legacy_notebook: legacy::Notebook) -> anyhow::Result<v4::Notebook> {
184    let cells: Vec<v4::Cell> = legacy_notebook
185        .cells
186        .into_iter()
187        .map(|cell: legacy::Cell| match cell {
188            legacy::Cell::Markdown {
189                id,
190                metadata,
191                source,
192                attachments,
193            } => v4::Cell::Markdown {
194                id: id.unwrap_or_else(|| uuid::Uuid::new_v4().into()),
195                metadata,
196                source,
197                attachments,
198            },
199            legacy::Cell::Code {
200                id,
201                metadata,
202                execution_count,
203                source,
204                outputs,
205            } => v4::Cell::Code {
206                id: id.unwrap_or_else(|| uuid::Uuid::new_v4().into()),
207                metadata,
208                execution_count,
209                source,
210                outputs,
211            },
212            legacy::Cell::Raw {
213                id,
214                metadata,
215                source,
216            } => v4::Cell::Raw {
217                id: id.unwrap_or_else(|| uuid::Uuid::new_v4().into()),
218                metadata,
219                source,
220            },
221        })
222        .collect();
223
224    // If any of the cell IDs are not unique, bail
225    let mut seen_ids = std::collections::HashSet::new();
226    for cell in &cells {
227        if !seen_ids.insert(cell.id()) {
228            return Err(anyhow::anyhow!("Duplicate Cell ID found: {}", cell.id()));
229        }
230    }
231
232    Ok(v4::Notebook {
233        cells,
234        metadata: legacy_notebook.metadata,
235        nbformat: 4,
236        nbformat_minor: 5,
237    })
238}
239
240pub fn upgrade_v3_notebook(v3_notebook: v3::Notebook) -> anyhow::Result<v4::Notebook> {
241    let mut all_cells: Vec<v3::Cell> = Vec::new();
242
243    if let Some(worksheets) = v3_notebook.worksheets {
244        for worksheet in worksheets {
245            all_cells.extend(worksheet.cells);
246        }
247    }
248
249    let cells: Vec<v4::Cell> = all_cells
250        .into_iter()
251        .map(|cell: v3::Cell| match cell {
252            v3::Cell::Heading {
253                level,
254                metadata,
255                source,
256            } => {
257                let heading_prefix = "#".repeat(level as usize);
258                // v3 heading source lines are plain text with no markdown prefix.
259                // Join them into a single line and prepend the heading marker once.
260                let joined = source.join("");
261                let new_source = if joined.trim().is_empty() {
262                    vec![format!("{}", heading_prefix)]
263                } else {
264                    vec![format!("{} {}", heading_prefix, joined)]
265                };
266                v4::Cell::Markdown {
267                    id: uuid::Uuid::new_v4().into(),
268                    metadata,
269                    source: new_source,
270                    attachments: None,
271                }
272            }
273            v3::Cell::Markdown {
274                metadata,
275                source,
276                attachments,
277            } => v4::Cell::Markdown {
278                id: uuid::Uuid::new_v4().into(),
279                metadata,
280                source,
281                attachments,
282            },
283            v3::Cell::Code {
284                metadata,
285                prompt_number,
286                input,
287                language: _,
288                outputs,
289            } => v4::Cell::Code {
290                id: uuid::Uuid::new_v4().into(),
291                metadata,
292                execution_count: prompt_number,
293                source: input.unwrap_or_default(),
294                outputs: outputs.into_iter().map(convert_v3_output).collect(),
295            },
296            v3::Cell::Raw { metadata, source } => v4::Cell::Raw {
297                id: uuid::Uuid::new_v4().into(),
298                metadata,
299                source,
300            },
301        })
302        .collect();
303
304    // All v3 cells are assigned fresh UUIDs above, so duplicate IDs cannot occur.
305
306    let metadata = convert_v3_metadata(v3_notebook.metadata.as_ref());
307
308    Ok(v4::Notebook {
309        cells,
310        metadata,
311        nbformat: 4,
312        nbformat_minor: 5,
313    })
314}
315
316fn convert_v3_metadata(v3_metadata: Option<&serde_json::Value>) -> v4::Metadata {
317    let mut metadata = v4::Metadata::default();
318
319    if let Some(v3_metadata) = v3_metadata {
320        if let Some(obj) = v3_metadata.as_object() {
321            // Extract language from language_info first so we can use it in kernelspec.
322            let language = obj
323                .get("language_info")
324                .and_then(|li| li.get("name"))
325                .and_then(|v| v.as_str())
326                .map(|s| s.to_string());
327
328            if let Some(kernel_info) = obj.get("kernel_info") {
329                if let Some(name) = kernel_info.get("name").and_then(|v| v.as_str()) {
330                    metadata.kernelspec = Some(v4::KernelSpec {
331                        display_name: name.to_string(),
332                        name: name.to_string(),
333                        // Use the actual language from language_info rather than
334                        // assuming Python.
335                        language: language.clone(),
336                        additional: std::collections::HashMap::new(),
337                    });
338                }
339            }
340
341            if let Some(language_info) = obj.get("language_info") {
342                if let Some(name) = language_info.get("name").and_then(|v| v.as_str()) {
343                    let version = language_info
344                        .get("version")
345                        .and_then(|v| v.as_str())
346                        .map(|s| s.to_string());
347                    metadata.language_info = Some(v4::LanguageInfo {
348                        name: name.to_string(),
349                        version,
350                        codemirror_mode: None,
351                        additional: std::collections::HashMap::new(),
352                    });
353                }
354            }
355
356            for (key, value) in obj {
357                if key != "kernel_info" && key != "language_info" {
358                    metadata.additional.insert(key.clone(), value.clone());
359                }
360            }
361        }
362    }
363
364    metadata
365}
366
367fn map_v3_media_fields(
368    fields: &serde_json::Map<String, serde_json::Value>,
369    skip_keys: &[&str],
370) -> Vec<jupyter_protocol::media::MediaType> {
371    fields
372        .iter()
373        .filter(|(k, _)| !skip_keys.contains(&k.as_str()))
374        .filter_map(|(k, v)| {
375            let content = v3::join_media_value(v)?;
376            let media_type = match k.as_str() {
377                "text" => jupyter_protocol::media::MediaType::Plain(content),
378                "html" => jupyter_protocol::media::MediaType::Html(content),
379                "png" => jupyter_protocol::media::MediaType::Png(content),
380                "jpeg" => jupyter_protocol::media::MediaType::Jpeg(content),
381                "svg" => jupyter_protocol::media::MediaType::Svg(content),
382                "latex" => jupyter_protocol::media::MediaType::Latex(content),
383                "javascript" => jupyter_protocol::media::MediaType::Javascript(content),
384                "json" => {
385                    let parsed = serde_json::from_str(&content)
386                        .unwrap_or(serde_json::Value::String(content));
387                    return Some(jupyter_protocol::media::MediaType::Json(parsed));
388                }
389                _ => jupyter_protocol::media::MediaType::Other((
390                    k.clone(),
391                    serde_json::Value::String(content),
392                )),
393            };
394            Some(media_type)
395        })
396        .collect()
397}
398
399fn convert_v3_output(v3_output: v3::Output) -> v4::Output {
400    match v3_output {
401        v3::Output::Stream { name, stream, text } => v4::Output::Stream {
402            name: name.unwrap_or_else(|| stream.unwrap_or_else(|| "stdout".to_string())),
403            text: v4::MultilineString(text.join("")),
404        },
405        v3::Output::PyOut {
406            prompt_number,
407            metadata,
408            extra_fields,
409        } => {
410            let data = map_v3_media_fields(&extra_fields, &["output_type"]);
411
412            let metadata = match metadata {
413                serde_json::Value::Object(map) => map,
414                _ => serde_json::Map::new(),
415            };
416            let execution_count =
417                jupyter_protocol::ExecutionCount::new(prompt_number.unwrap_or(0).max(0) as usize);
418            v4::Output::ExecuteResult(v4::ExecuteResult {
419                execution_count,
420                data: jupyter_protocol::media::Media::new(data),
421                metadata,
422            })
423        }
424        v3::Output::DisplayData {
425            metadata: _,
426            extra_fields,
427        } => {
428            // v3 display_data also stores media as flat top-level keys. Skip the
429            // structural fields that are not media.
430            let media_vec = map_v3_media_fields(&extra_fields, &["output_type", "metadata"]);
431            v4::Output::DisplayData(v4::DisplayData {
432                data: jupyter_protocol::media::Media::new(media_vec),
433                metadata: serde_json::Map::new(),
434            })
435        }
436        v3::Output::PyErr {
437            ename,
438            evalue,
439            traceback,
440        } => v4::Output::Error(v4::ErrorOutput {
441            ename: ename.unwrap_or_default(),
442            evalue: evalue.unwrap_or_default(),
443            traceback,
444        }),
445    }
446}
447
#[cfg(test)]
mod sort_value_keys_tests {
    use super::sort_value_keys;
    use serde_json::{json, Value};

    /// Keys of a JSON object in iteration order; panics if `v` is not an object.
    fn top_level_keys(v: &Value) -> Vec<&str> {
        let object = v.as_object().expect("expected object");
        object.keys().map(|key| key.as_str()).collect()
    }

    #[test]
    fn sorts_top_level_keys() {
        let input = json!({
            "zebra": 1,
            "apple": 2,
            "mango": 3,
        });
        let sorted = sort_value_keys(input);
        assert_eq!(top_level_keys(&sorted), ["apple", "mango", "zebra"]);
    }

    #[test]
    fn sorts_nested_object_keys() {
        let input = json!({
            "outer": {
                "zebra": 1,
                "apple": 2,
            }
        });
        let sorted = sort_value_keys(input);
        let inner = sorted.get("outer").unwrap();
        assert_eq!(top_level_keys(inner), ["apple", "zebra"]);
    }

    #[test]
    fn sorts_keys_inside_arrays() {
        let input = json!({
            "cells": [
                { "zebra": 1, "apple": 2 },
                { "mango": 3, "banana": 4 },
            ]
        });
        let sorted = sort_value_keys(input);
        let cells = sorted.get("cells").unwrap().as_array().unwrap();

        let expected = [["apple", "zebra"], ["banana", "mango"]];
        for (cell, keys) in cells.iter().zip(expected.iter()) {
            assert_eq!(top_level_keys(cell), keys);
        }
    }

    #[test]
    fn preserves_array_element_order() {
        let sorted = sort_value_keys(json!({
            "list": [3, 1, 2],
        }));
        let list = sorted.get("list").unwrap().as_array().unwrap();
        let values: Vec<i64> = list.iter().map(|item| item.as_i64().unwrap()).collect();
        assert_eq!(values, [3, 1, 2]);
    }

    #[test]
    fn leaves_scalars_untouched() {
        // Null, bool, number and string must all come back unchanged.
        for scalar in vec![json!(null), json!(true), json!(42), json!("hello")] {
            assert_eq!(sort_value_keys(scalar.clone()), scalar);
        }
    }
}
513}