#[cfg(feature = "office")]
use crate::Result;
#[cfg(feature = "office")]
use crate::core::config::ExtractionConfig;
#[cfg(feature = "office")]
use crate::plugins::{DocumentExtractor, Plugin};
#[cfg(feature = "office")]
use crate::types::internal::InternalDocument;
#[cfg(feature = "office")]
use crate::types::internal_builder::InternalDocumentBuilder;
#[cfg(feature = "office")]
use crate::types::uri::Uri;
#[cfg(feature = "office")]
use crate::types::{ExtractedImage, Metadata};
#[cfg(feature = "office")]
use ahash::AHashMap;
#[cfg(feature = "office")]
use async_trait::async_trait;
#[cfg(feature = "office")]
use base64::Engine;
#[cfg(feature = "office")]
use bytes::Bytes;
#[cfg(feature = "office")]
use serde_json::{Value, json};
#[cfg(feature = "office")]
use std::borrow::Cow;
#[cfg(feature = "office")]
type NotebookContent = (String, AHashMap<Cow<'static, str>, Value>, Vec<ExtractedImage>, Value);
#[cfg(feature = "office")]
pub struct JupyterExtractor;
#[cfg(feature = "office")]
impl JupyterExtractor {
pub fn new() -> Self {
Self
}
fn extract_notebook(content: &[u8], plain: bool) -> Result<NotebookContent> {
let notebook: Value = serde_json::from_slice(content)
.map_err(|e| crate::KreuzbergError::parsing(format!("Failed to parse JSON: {}", e)))?;
let mut extracted_content = String::new();
let mut metadata = AHashMap::new();
let mut images = Vec::new();
if let Some(notebook_metadata) = notebook.get("metadata").and_then(|m| m.as_object()) {
if let Some(kernelspec) = notebook_metadata.get("kernelspec") {
metadata.insert(Cow::Borrowed("kernelspec"), kernelspec.clone());
}
if let Some(language_info) = notebook_metadata.get("language_info") {
metadata.insert(Cow::Borrowed("language_info"), language_info.clone());
if let Some(obj) = language_info.as_object() {
if let Some(name) = obj.get("name") {
metadata.insert(Cow::Borrowed("language_name"), name.clone());
}
if let Some(version) = obj.get("version") {
metadata.insert(Cow::Borrowed("language_version"), version.clone());
}
if let Some(mimetype) = obj.get("mimetype") {
metadata.insert(Cow::Borrowed("language_mimetype"), mimetype.clone());
}
}
}
}
if let Some(nbformat) = notebook.get("nbformat") {
metadata.insert(Cow::Borrowed("nbformat"), nbformat.clone());
}
if let Some(nbformat_minor) = notebook.get("nbformat_minor") {
metadata.insert(Cow::Borrowed("nbformat_minor"), nbformat_minor.clone());
}
if let Some(cells) = notebook.get("cells").and_then(|c| c.as_array()) {
metadata.insert(Cow::Borrowed("cell_count"), json!(cells.len()));
}
if let Some(cells) = notebook.get("cells").and_then(|c| c.as_array()) {
let mut cells_meta: Vec<Value> = Vec::with_capacity(cells.len());
for (cell_idx, cell) in cells.iter().enumerate() {
let cell_type = cell.get("cell_type").and_then(|t| t.as_str()).unwrap_or("unknown");
let mut cell_entry = serde_json::Map::new();
cell_entry.insert("index".into(), json!(cell_idx));
cell_entry.insert("cell_type".into(), json!(cell_type));
if cell_type == "code"
&& let Some(exec_count) = cell.get("execution_count")
{
cell_entry.insert("execution_count".into(), exec_count.clone());
}
if let Some(tags) = cell
.get("metadata")
.and_then(|m| m.get("tags"))
.and_then(|t| t.as_array())
&& !tags.is_empty()
{
cell_entry.insert("tags".into(), Value::Array(tags.clone()));
}
cells_meta.push(Value::Object(cell_entry));
Self::extract_cell(cell, cell_idx, &mut extracted_content, &mut images, plain)?;
}
metadata.insert(Cow::Borrowed("cells"), json!(cells_meta));
}
Ok((extracted_content, metadata, images, notebook))
}
fn extract_cell(
cell: &Value,
cell_idx: usize,
content: &mut String,
images: &mut Vec<ExtractedImage>,
plain: bool,
) -> Result<()> {
let cell_type = cell.get("cell_type").and_then(|t| t.as_str()).unwrap_or("unknown");
match cell_type {
"markdown" => Self::extract_markdown_cell(cell, content)?,
"code" => Self::extract_code_cell(cell, cell_idx, content, images, plain)?,
"raw" => Self::extract_raw_cell(cell, content)?,
_ => {}
}
if !content.ends_with('\n') {
content.push('\n');
}
content.push('\n');
Ok(())
}
fn extract_markdown_cell(cell: &Value, content: &mut String) -> Result<()> {
if let Some(source) = cell.get("source") {
let cell_text = Self::extract_source(source);
content.push_str(&cell_text);
}
Ok(())
}
fn extract_code_cell(
cell: &Value,
cell_idx: usize,
content: &mut String,
images: &mut Vec<ExtractedImage>,
plain: bool,
) -> Result<()> {
if let Some(source) = cell.get("source") {
let cell_text = Self::extract_source(source);
content.push_str(&cell_text);
if !cell_text.ends_with('\n') {
content.push('\n');
}
}
if let Some(outputs) = cell.get("outputs").and_then(|o| o.as_array()) {
for output in outputs {
Self::extract_output(output, cell_idx, content, images, plain)?;
}
}
Ok(())
}
fn extract_raw_cell(cell: &Value, content: &mut String) -> Result<()> {
if let Some(source) = cell.get("source") {
let cell_text = Self::extract_source(source);
content.push_str(&cell_text);
}
Ok(())
}
fn extract_source(source: &Value) -> String {
match source {
Value::String(s) => s.clone(),
Value::Array(arr) => arr.iter().filter_map(|v| v.as_str()).collect::<String>(),
_ => String::new(),
}
}
fn extract_output(
output: &Value,
cell_idx: usize,
content: &mut String,
images: &mut Vec<ExtractedImage>,
plain: bool,
) -> Result<()> {
let output_type = output.get("output_type").and_then(|t| t.as_str()).unwrap_or("unknown");
match output_type {
"stream" => Self::extract_stream_output(output, content)?,
"execute_result" | "display_data" => {
Self::extract_data_output(output, cell_idx, content, images, plain)?;
}
"error" => Self::extract_error_output(output, content)?,
_ => {}
}
Ok(())
}
fn extract_stream_output(output: &Value, content: &mut String) -> Result<()> {
if let Some(text) = output.get("text") {
let text_content = Self::extract_source(text);
content.push_str(&text_content);
}
Ok(())
}
fn extract_data_output(
output: &Value,
cell_idx: usize,
content: &mut String,
images: &mut Vec<ExtractedImage>,
plain_mode: bool,
) -> Result<()> {
if let Some(data) = output.get("data").and_then(|d| d.as_object()) {
if let Some(plain) = data.get("text/plain") {
let text = Self::extract_source(plain);
if !text.is_empty() {
content.push_str(&text);
if !text.ends_with('\n') {
content.push('\n');
}
}
}
if !plain_mode {
for mime_type in &["text/markdown", "text/html"] {
if let Some(mime_content) = data.get(*mime_type) {
let mime_text = Self::extract_source(mime_content);
if !mime_text.is_empty() {
content.push_str(&mime_text);
if !mime_text.ends_with('\n') {
content.push('\n');
}
}
}
}
}
for mime_type in &["image/png", "image/jpeg", "image/gif", "image/webp"] {
if let Some(image_value) = data.get(*mime_type) {
let base64_str = Self::extract_source(image_value);
let cleaned = base64_str.replace(['\n', '\r'], "");
if let Ok(decoded) = base64::engine::general_purpose::STANDARD.decode(&cleaned) {
let format = match *mime_type {
"image/png" => "png",
"image/jpeg" => "jpeg",
"image/gif" => "gif",
"image/webp" => "webp",
_ => "unknown",
};
images.push(ExtractedImage {
data: Bytes::from(decoded),
format: Cow::Borrowed(format),
image_index: images.len(),
page_number: Some(cell_idx + 1),
width: None,
height: None,
colorspace: None,
bits_per_component: None,
is_mask: false,
description: Some(format!("Notebook cell {} output", cell_idx)),
ocr_result: None,
bounding_box: None,
source_path: None,
});
content.push_str(&format!("[Image: {}]\n", mime_type));
}
}
}
if data.contains_key("image/svg+xml") {
content.push_str("[Image: image/svg+xml]\n");
}
if let Some(json_content) = data.get("application/json")
&& let Ok(formatted) = serde_json::to_string_pretty(json_content)
{
content.push_str(&formatted);
content.push('\n');
}
}
Ok(())
}
fn scan_markdown_inline(text: &str) -> (String, Vec<crate::types::TextAnnotation>) {
use crate::types::TextAnnotation;
use crate::types::document_structure::AnnotationKind;
let mut out = String::with_capacity(text.len());
let mut annotations = Vec::new();
let bytes = text.as_bytes();
let len = bytes.len();
let mut i = 0;
while i < len {
if i + 1 < len && bytes[i] == b'*' && bytes[i + 1] == b'*' {
if let Some(end) = Self::find_closing(bytes, i + 2, b"**") {
let inner = &text[i + 2..end];
let start = out.len() as u32;
out.push_str(inner);
let ann_end = out.len() as u32;
annotations.push(TextAnnotation {
start,
end: ann_end,
kind: AnnotationKind::Bold,
});
i = end + 2;
continue;
}
}
if bytes[i] == b'*' && (i == 0 || bytes[i - 1] != b'*') {
if i + 1 < len
&& bytes[i + 1] != b'*'
&& let Some(end) = Self::find_closing_single_star(bytes, i + 1)
{
let inner = &text[i + 1..end];
let start = out.len() as u32;
out.push_str(inner);
let ann_end = out.len() as u32;
annotations.push(TextAnnotation {
start,
end: ann_end,
kind: AnnotationKind::Italic,
});
i = end + 1;
continue;
}
}
if bytes[i] == b'`' && (i + 1 >= len || bytes[i + 1] != b'`') {
if let Some(end) = Self::find_closing_byte(bytes, i + 1, b'`') {
let inner = &text[i + 1..end];
let start = out.len() as u32;
out.push_str(inner);
let ann_end = out.len() as u32;
annotations.push(TextAnnotation {
start,
end: ann_end,
kind: AnnotationKind::Code,
});
i = end + 1;
continue;
}
}
out.push(bytes[i] as char);
i += 1;
}
(out, annotations)
}
fn find_closing(bytes: &[u8], start: usize, delim: &[u8; 2]) -> Option<usize> {
let mut i = start;
while i + 1 < bytes.len() {
if bytes[i] == delim[0] && bytes[i + 1] == delim[1] {
return Some(i);
}
i += 1;
}
None
}
fn find_closing_single_star(bytes: &[u8], start: usize) -> Option<usize> {
let mut i = start;
while i < bytes.len() {
if bytes[i] == b'*' && (i + 1 >= bytes.len() || bytes[i + 1] != b'*') {
return Some(i);
}
i += 1;
}
None
}
fn find_closing_byte(bytes: &[u8], start: usize, delim: u8) -> Option<usize> {
bytes[start..].iter().position(|&b| b == delim).map(|p| start + p)
}
fn extract_markdown_links(text: &str) -> Vec<Uri> {
use pulldown_cmark::{Event, Parser, Tag, TagEnd};
let parser = Parser::new(text);
let mut uris = Vec::new();
let mut current_text = String::new();
let mut current_url: Option<String> = None;
let mut in_link = false;
let mut in_image = false;
for event in parser {
match event {
Event::Start(Tag::Link { dest_url, .. }) => {
in_link = true;
current_text.clear();
current_url = Some(dest_url.into_string());
}
Event::Start(Tag::Image { dest_url, .. }) => {
in_image = true;
current_text.clear();
current_url = Some(dest_url.into_string());
}
Event::Text(text) if in_link || in_image => {
current_text.push_str(&text);
}
Event::End(TagEnd::Link) => {
if let Some(url) = current_url.take()
&& !url.is_empty()
{
let label_opt = if current_text.is_empty() {
None
} else {
Some(current_text.clone())
};
uris.push(Uri::hyperlink(&url, label_opt));
}
in_link = false;
current_text.clear();
}
Event::End(TagEnd::Image) => {
if let Some(url) = current_url.take()
&& !url.is_empty()
{
let label_opt = if current_text.is_empty() {
None
} else {
Some(current_text.clone())
};
uris.push(Uri::image(&url, label_opt));
}
in_image = false;
current_text.clear();
}
_ => {}
}
}
uris
}
fn parse_heading_line(line: &str) -> Option<(u8, &str)> {
let trimmed = line.trim_start();
let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
if hashes == 0 || hashes > 6 {
return None;
}
let rest = &trimmed[hashes..];
if !rest.is_empty() && !rest.starts_with(' ') {
return None;
}
Some((hashes as u8, rest.trim()))
}
fn collect_output_text(output: &Value) -> String {
let mut text = String::new();
let output_type = output.get("output_type").and_then(|t| t.as_str()).unwrap_or("");
match output_type {
"stream" => {
if let Some(t) = output.get("text") {
text.push_str(&Self::extract_source(t));
}
}
"execute_result" | "display_data" => {
if let Some(data) = output.get("data").and_then(|d| d.as_object())
&& let Some(plain) = data.get("text/plain")
{
text.push_str(&Self::extract_source(plain));
}
}
"error" => {
let ename = output.get("ename").and_then(|e| e.as_str()).unwrap_or("Unknown");
let evalue = output.get("evalue").and_then(|e| e.as_str()).unwrap_or("");
text.push_str(&format!("Error ({}): {}", ename, evalue));
}
_ => {}
}
text
}
fn build_internal_document(notebook: &Value) -> Option<InternalDocument> {
let cells = notebook.get("cells")?.as_array()?;
let kernel_lang = notebook
.get("metadata")
.and_then(|m| m.get("kernelspec"))
.and_then(|k| k.get("language"))
.and_then(|l| l.as_str())
.or_else(|| {
notebook
.get("metadata")
.and_then(|m| m.get("language_info"))
.and_then(|l| l.get("name"))
.and_then(|n| n.as_str())
});
let mut builder = InternalDocumentBuilder::new("jupyter");
for cell in cells {
let cell_type = cell.get("cell_type").and_then(|t| t.as_str()).unwrap_or("unknown");
let source_text = Self::extract_source(cell.get("source").unwrap_or(&Value::Null));
let trimmed = source_text.trim();
if trimmed.is_empty() {
continue;
}
match cell_type {
"markdown" => {
let link_uris = Self::extract_markdown_links(trimmed);
for uri in link_uris {
builder.push_uri(uri);
}
let mut para_buf = String::new();
for line in trimmed.lines() {
if let Some((level, heading_text)) = Self::parse_heading_line(line) {
let flushed = para_buf.trim();
if !flushed.is_empty() {
let (stripped, annotations) = Self::scan_markdown_inline(flushed);
builder.push_paragraph(&stripped, annotations, None, None);
}
para_buf.clear();
if !heading_text.is_empty() {
builder.push_heading(level, heading_text, None, None);
}
} else {
if !para_buf.is_empty() {
para_buf.push('\n');
}
para_buf.push_str(line);
}
}
let flushed = para_buf.trim();
if !flushed.is_empty() {
let (stripped, annotations) = Self::scan_markdown_inline(flushed);
builder.push_paragraph(&stripped, annotations, None, None);
}
}
"code" => {
let idx = builder.push_code(trimmed, kernel_lang, None, None);
let mut attrs = AHashMap::new();
if let Some(exec_count) = cell.get("execution_count") {
match exec_count {
Value::Number(n) => {
attrs.insert("execution_count".to_string(), n.to_string());
}
Value::Null => {
attrs.insert("execution_count".to_string(), "null".to_string());
}
_ => {}
}
}
if let Some(tags) = cell
.get("metadata")
.and_then(|m| m.get("tags"))
.and_then(|t| t.as_array())
&& !tags.is_empty()
{
let tag_strs: Vec<&str> = tags.iter().filter_map(|v| v.as_str()).collect();
attrs.insert("tags".to_string(), tag_strs.join(","));
}
if !attrs.is_empty() {
builder.set_attributes(idx, attrs);
}
if let Some(outputs) = cell.get("outputs").and_then(|o| o.as_array()) {
for output in outputs {
let output_text = Self::collect_output_text(output);
let output_trimmed = output_text.trim();
if !output_trimmed.is_empty() {
builder.push_paragraph(output_trimmed, vec![], None, None);
}
}
}
}
_ => {
builder.push_paragraph(trimmed, vec![], None, None);
}
}
}
Some(builder.build())
}
fn extract_error_output(output: &Value, content: &mut String) -> Result<()> {
let ename = output.get("ename").and_then(|e| e.as_str()).unwrap_or("Unknown");
let evalue = output.get("evalue").and_then(|e| e.as_str()).unwrap_or("");
content.push_str(&format!("Error ({}): {}\n", ename, evalue));
if let Some(traceback) = output.get("traceback").and_then(|t| t.as_array()) {
content.push_str("Traceback:\n");
for line in traceback {
if let Some(line_str) = line.as_str() {
content.push_str(line_str);
content.push('\n');
}
}
}
Ok(())
}
}
#[cfg(feature = "office")]
impl Default for JupyterExtractor {
fn default() -> Self {
Self::new()
}
}
#[cfg(feature = "office")]
impl Plugin for JupyterExtractor {
fn name(&self) -> &str {
"jupyter-extractor"
}
fn version(&self) -> String {
env!("CARGO_PKG_VERSION").to_string()
}
fn initialize(&self) -> Result<()> {
Ok(())
}
fn shutdown(&self) -> Result<()> {
Ok(())
}
fn description(&self) -> &str {
"Extracts content from Jupyter notebooks (.ipynb files)"
}
fn author(&self) -> &str {
"Kreuzberg Team"
}
}
#[cfg(feature = "office")]
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for JupyterExtractor {
#[cfg_attr(
feature = "otel",
tracing::instrument(
skip(self, content, config),
fields(
extractor.name = self.name(),
content.size_bytes = content.len(),
)
)
)]
async fn extract_bytes(
&self,
content: &[u8],
mime_type: &str,
config: &ExtractionConfig,
) -> Result<InternalDocument> {
let plain = matches!(
config.output_format,
crate::core::config::OutputFormat::Plain | crate::core::config::OutputFormat::Structured
);
let (_extracted_content, additional_metadata, extracted_images, notebook_json) =
Self::extract_notebook(content, plain)?;
let mut metadata_additional = AHashMap::new();
let meta_language = additional_metadata
.get(&Cow::Borrowed("language_name"))
.and_then(|v| v.as_str().map(|s| s.to_string()));
for (key, value) in additional_metadata {
metadata_additional.insert(key, json!(value));
}
let images = extracted_images;
let mut doc = Self::build_internal_document(¬ebook_json)
.unwrap_or_else(|| InternalDocumentBuilder::new("jupyter").build());
doc.mime_type = Cow::Owned(mime_type.to_string());
doc.metadata = Metadata {
language: meta_language,
additional: metadata_additional,
..Default::default()
};
doc.images = images;
Ok(doc)
}
fn supported_mime_types(&self) -> &[&str] {
&["application/x-ipynb+json"]
}
fn priority(&self) -> i32 {
50
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_jupyter_extractor_plugin_interface() {
let extractor = JupyterExtractor::new();
assert_eq!(extractor.name(), "jupyter-extractor");
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
assert_eq!(extractor.priority(), 50);
assert!(extractor.supported_mime_types().contains(&"application/x-ipynb+json"));
}
#[test]
fn test_extract_execution_count_and_tags() {
let notebook_json = r#"{
"cells": [
{
"cell_type": "code",
"source": ["print('hello')"],
"execution_count": 5,
"outputs": [],
"metadata": {"tags": ["test-tag", "important"]}
}
],
"metadata": {
"kernelspec": {"name": "python3", "language": "python"},
"language_info": {"name": "python", "version": "3.10.0", "mimetype": "text/x-python"}
},
"nbformat": 4,
"nbformat_minor": 5
}"#;
let (_, metadata, _, _) = JupyterExtractor::extract_notebook(notebook_json.as_bytes(), false).unwrap();
let cells = metadata.get(&Cow::Borrowed("cells"));
assert!(cells.is_some(), "Should have cells metadata array");
let cells_arr = cells.unwrap().as_array().expect("cells should be an array");
assert_eq!(cells_arr.len(), 1);
let cell0 = &cells_arr[0];
assert_eq!(cell0["index"], json!(0));
assert_eq!(cell0["cell_type"], json!("code"));
assert_eq!(cell0["execution_count"], json!(5));
assert_eq!(cell0["tags"], json!(["test-tag", "important"]));
assert_eq!(metadata.get(&Cow::Borrowed("cell_count")), Some(&json!(1)));
assert_eq!(metadata.get(&Cow::Borrowed("language_name")), Some(&json!("python")));
assert_eq!(metadata.get(&Cow::Borrowed("language_version")), Some(&json!("3.10.0")));
assert_eq!(
metadata.get(&Cow::Borrowed("language_mimetype")),
Some(&json!("text/x-python"))
);
assert_eq!(metadata.get(&Cow::Borrowed("nbformat_minor")), Some(&json!(5)));
}
#[test]
fn test_extract_error_output_content() {
let notebook_json = r#"{
"cells": [
{
"cell_type": "code",
"source": ["1/0"],
"execution_count": 1,
"outputs": [
{
"output_type": "error",
"ename": "ZeroDivisionError",
"evalue": "division by zero",
"traceback": ["Traceback line 1", "Traceback line 2"]
}
],
"metadata": {}
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 0
}"#;
let (content, _, _, _) = JupyterExtractor::extract_notebook(notebook_json.as_bytes(), false).unwrap();
assert!(
content.contains("Error (ZeroDivisionError): division by zero"),
"Should contain error name and value"
);
assert!(content.contains("Traceback:"), "Should contain traceback header");
assert!(content.contains("Traceback line 1"), "Should contain traceback lines");
}
}