use crate::{Document, Error, Extractor, Result};
use serde_json::Value;
use std::path::Path;
/// Extractor for Jupyter notebook (`.ipynb`) files.
///
/// The type carries no state, so it is zero-sized and trivially copyable;
/// every instance behaves identically.
#[derive(Debug, Default, Clone, Copy)]
pub struct IpynbExtractor;
impl IpynbExtractor {
#[must_use]
pub fn new() -> Self {
Self
}
}
impl Extractor for IpynbExtractor {
fn extensions(&self) -> &[&'static str] {
&["ipynb"]
}
fn name(&self) -> &'static str {
"ipynb-builtin"
}
fn extract(&self, path: &Path) -> Result<Document> {
let bytes = std::fs::read(path)?;
self.extract_bytes(&bytes, "ipynb")
}
fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
let v: Value = serde_json::from_slice(bytes)
.map_err(|e| Error::ParseError(format!("notebook is not valid JSON: {e}")))?;
Ok(notebook_to_document(&v))
}
}
/// Renders a parsed notebook JSON value into a [`Document`].
///
/// * The kernel language is read from `/metadata/kernelspec/language`,
///   falling back to `/metadata/language_info/name`; when neither is a
///   string, no language is recorded.
/// * Every entry of the top-level `cells` array whose trimmed source is
///   non-empty contributes one section; sections are separated by a blank
///   line. `code` cells are wrapped in a fenced code block, every other cell
///   type (including unknown ones) is emitted as-is.
/// * `title` comes from `/metadata/title` when it is a string.
/// * `metadata` receives `kernel_language` (when a language was found) and
///   `kernel_display_name` (from `/metadata/kernelspec/display_name`).
///
/// A missing or non-array `cells` value yields empty markdown.
fn notebook_to_document(notebook: &Value) -> Document {
    let language = notebook
        .pointer("/metadata/kernelspec/language")
        .and_then(Value::as_str)
        .or_else(|| {
            notebook
                .pointer("/metadata/language_info/name")
                .and_then(Value::as_str)
        })
        .unwrap_or("");

    let mut markdown = String::new();
    if let Some(cells) = notebook.get("cells").and_then(Value::as_array) {
        for cell in cells {
            let source = cell_source(cell);
            let body = source.trim();
            if body.is_empty() {
                continue;
            }
            if !markdown.is_empty() {
                markdown.push_str("\n\n");
            }
            match cell.get("cell_type").and_then(Value::as_str) {
                Some("code") => push_fenced_code(&mut markdown, language, body),
                // Markdown, raw, unknown, and untyped cells are all passed
                // through as plain text.
                _ => markdown.push_str(body),
            }
        }
    }

    let title = notebook
        .pointer("/metadata/title")
        .and_then(Value::as_str)
        .map(str::to_string);

    let mut metadata = std::collections::HashMap::new();
    if !language.is_empty() {
        metadata.insert("kernel_language".into(), language.to_string());
    }
    if let Some(kernel_name) = notebook
        .pointer("/metadata/kernelspec/display_name")
        .and_then(Value::as_str)
    {
        metadata.insert("kernel_display_name".into(), kernel_name.to_string());
    }

    Document {
        markdown,
        title,
        metadata,
    }
}

/// Appends `code` to `out` as a backtick-fenced code block.
///
/// The fence is one backtick longer than the longest backtick run inside
/// `code` (and never shorter than three), because a fenced block is only
/// closed by a run of at least as many backticks as opened it. This keeps
/// code that itself contains a triple-backtick sequence from terminating the
/// block early.
///
/// `language` is used as the info string unless it contains a backtick or a
/// line break, either of which would make the opening fence invalid.
fn push_fenced_code(out: &mut String, language: &str, code: &str) {
    // Splitting on every non-backtick character leaves only the backtick
    // runs (plus empty pieces), so the longest piece is the longest run.
    let longest_run = code
        .split(|c: char| c != '`')
        .map(str::len)
        .max()
        .unwrap_or(0);
    let fence = "`".repeat(longest_run.max(2) + 1);

    out.push_str(&fence);
    let hint_is_safe = !language.contains(|c: char| matches!(c, '`' | '\n' | '\r'));
    if hint_is_safe {
        out.push_str(language);
    }
    out.push('\n');
    out.push_str(code);
    out.push('\n');
    out.push_str(&fence);
}
/// Returns the text stored under a cell's `source` key.
///
/// `source` may be a single string, returned as-is, or an array whose string
/// entries are concatenated in order with no separator (non-string entries
/// are skipped). A missing `source`, or one of any other JSON type, yields an
/// empty string.
fn cell_source(cell: &Value) -> String {
    match cell.get("source") {
        Some(Value::Array(fragments)) => fragments
            .iter()
            .filter_map(Value::as_str)
            .collect::<String>(),
        Some(Value::String(text)) => text.to_owned(),
        _ => String::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Runs the notebook-to-document conversion on an owned JSON value.
    fn render(notebook: Value) -> Document {
        notebook_to_document(&notebook)
    }

    /// Looks up a metadata entry of `doc` as a string slice.
    fn meta<'a>(doc: &'a Document, key: &str) -> Option<&'a str> {
        doc.metadata.get(key).map(String::as_str)
    }

    #[test]
    fn extensions_is_ipynb_only() {
        let extractor = IpynbExtractor;
        assert_eq!(extractor.extensions(), &["ipynb"]);
    }

    #[test]
    fn name_identifies_backend() {
        let extractor = IpynbExtractor;
        assert_eq!(extractor.name(), "ipynb-builtin");
    }

    #[test]
    fn empty_notebook_yields_empty_markdown() {
        let doc = render(json!({
            "cells": [],
            "metadata": {},
            "nbformat": 4,
            "nbformat_minor": 5
        }));
        assert_eq!(doc.markdown, "");
        assert_eq!(doc.title, None);
    }

    #[test]
    fn markdown_cells_pass_through_verbatim() {
        let doc = render(json!({
            "cells": [{ "cell_type": "markdown", "source": "# Hello\n\nworld" }]
        }));
        assert_eq!(doc.markdown, "# Hello\n\nworld");
    }

    #[test]
    fn source_can_be_array_of_lines() {
        let doc = render(json!({
            "cells": [{ "cell_type": "markdown", "source": ["# Hello\n", "\n", "world"] }]
        }));
        assert_eq!(doc.markdown, "# Hello\n\nworld");
    }

    #[test]
    fn code_cells_get_fenced_blocks_with_language_hint() {
        let doc = render(json!({
            "cells": [{ "cell_type": "code", "source": "print('hi')" }],
            "metadata": {
                "kernelspec": { "language": "python", "display_name": "Python 3" }
            }
        }));
        assert_eq!(doc.markdown, "```python\nprint('hi')\n```");
        assert_eq!(meta(&doc, "kernel_language"), Some("python"));
        assert_eq!(meta(&doc, "kernel_display_name"), Some("Python 3"));
    }

    #[test]
    fn code_cells_without_language_use_unhinted_fence() {
        let doc = render(json!({
            "cells": [{ "cell_type": "code", "source": "let x = 1;" }]
        }));
        assert_eq!(doc.markdown, "```\nlet x = 1;\n```");
        assert!(!doc.metadata.contains_key("kernel_language"));
    }

    #[test]
    fn language_info_falls_back_when_kernelspec_missing() {
        let doc = render(json!({
            "cells": [{ "cell_type": "code", "source": "SELECT 1;" }],
            "metadata": { "language_info": { "name": "sql" } }
        }));
        assert!(doc.markdown.starts_with("```sql\n"));
    }

    #[test]
    fn empty_cells_are_skipped() {
        let doc = render(json!({
            "cells": [
                { "cell_type": "markdown", "source": "first" },
                { "cell_type": "code", "source": "   " },
                { "cell_type": "markdown", "source": "" },
                { "cell_type": "markdown", "source": "second" }
            ]
        }));
        assert_eq!(doc.markdown, "first\n\nsecond");
    }

    #[test]
    fn raw_cells_pass_through_verbatim() {
        let doc = render(json!({
            "cells": [{ "cell_type": "raw", "source": "<svg>...</svg>" }]
        }));
        assert_eq!(doc.markdown, "<svg>...</svg>");
    }

    #[test]
    fn unknown_cell_types_emit_as_opaque_text() {
        let doc = render(json!({
            "cells": [{ "cell_type": "future-thing", "source": "preserve me" }]
        }));
        assert_eq!(doc.markdown, "preserve me");
    }

    #[test]
    fn malformed_json_returns_typed_error() {
        let extractor = IpynbExtractor;
        let outcome = extractor.extract_bytes(b"{ not json", "ipynb");
        assert!(matches!(outcome, Err(Error::ParseError(_))));
    }

    #[test]
    fn missing_file_returns_io_error() {
        let extractor = IpynbExtractor;
        let outcome = extractor.extract(Path::new("/nonexistent.ipynb"));
        assert!(matches!(outcome, Err(Error::Io(_))));
    }

    #[test]
    fn title_surfaces_when_metadata_has_it() {
        let doc = render(json!({
            "cells": [{ "cell_type": "markdown", "source": "body" }],
            "metadata": { "title": "My Notebook" }
        }));
        assert_eq!(doc.title.as_deref(), Some("My Notebook"));
    }
}