use serde_json::{json, Value};
use crate::document::{DoclingDocument, Node, Table};
/// Schema version string written to the top-level `"version"` field by `to_json`.
const SCHEMA_VERSION: &str = "1.10.0";
/// Canonical spellings of the language names that `code_language` can return.
///
/// Lookup is ASCII case-insensitive; any name not listed here is reported as
/// `"unknown"`.
// NOTE(review): presumably mirrors the code-language labels accepted by the
// target DoclingDocument schema version — confirm when bumping SCHEMA_VERSION.
const CODE_LANGUAGES: &[&str] = &[
"Ada",
"Awk",
"Bash",
"bc",
"C",
"C#",
"C++",
"CMake",
"COBOL",
"CSS",
"Ceylon",
"Clojure",
"Crystal",
"Cuda",
"Cython",
"D",
"Dart",
"dc",
"Dockerfile",
"DocLang",
"Elixir",
"Erlang",
"FORTRAN",
"Forth",
"Go",
"HTML",
"Haskell",
"Haxe",
"Java",
"JavaScript",
"JSON",
"Julia",
"Kotlin",
"Latex",
"Lisp",
"Lua",
"Matlab",
"MoonScript",
"Nim",
"OCaml",
"ObjectiveC",
"Octave",
"PHP",
"Pascal",
"Perl",
"Prolog",
"Python",
"Racket",
"Ruby",
"Rust",
"SML",
"SQL",
"Scala",
"Scheme",
"Swift",
"Tikz",
"TypeScript",
"VisualBasic",
"XML",
"YAML",
];
/// Maps a free-form language tag onto its canonical spelling in `CODE_LANGUAGES`.
///
/// The comparison ignores ASCII case. A missing tag, or one that matches no
/// entry, yields `"unknown"`.
fn code_language(lang: Option<&str>) -> &'static str {
    lang.and_then(|requested| {
        CODE_LANGUAGES
            .iter()
            .copied()
            .find(|known| known.eq_ignore_ascii_case(requested))
    })
    .unwrap_or("unknown")
}
pub fn to_json(doc: &DoclingDocument) -> Value {
let mut b = Builder::default();
let body = b.walk_into(&doc.nodes, "#/body");
json!({
"schema_name": "DoclingDocument",
"version": SCHEMA_VERSION,
"name": doc.name,
"origin": {
"mimetype": "text/plain",
"binary_hash": fnv1a(&doc.name),
"filename": doc.name,
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified",
},
"body": {
"self_ref": "#/body",
"children": body,
"content_layer": "body",
"name": "_root_",
"label": "unspecified",
},
"groups": b.groups,
"texts": b.texts,
"pictures": b.pictures,
"tables": b.tables,
"key_value_items": [],
"form_items": [],
"pages": {},
})
}
/// Accumulates the flat item arrays of the exported document while the node
/// tree is walked. An item's position in its array is the index used in its
/// `self_ref` (`#/texts/N`, `#/groups/N`, `#/tables/N`, `#/pictures/N`).
#[derive(Default)]
struct Builder {
/// Text-like items: titles, section headers, paragraphs, formulas, code,
/// list items and picture captions.
texts: Vec<Value>,
/// List groups and generic groups; a slot may temporarily hold `Value::Null`
/// while the group's children are still being built.
groups: Vec<Value>,
/// Table items.
tables: Vec<Value>,
/// Picture items.
pictures: Vec<Value>,
}
impl Builder {
/// Emits the JSON item for a single non-list node and returns its reference.
///
/// Level-1 headings become `title` items; other headings become
/// `section_header` items whose `level` is the heading level minus one
/// (saturating). A paragraph whose trimmed text is wrapped in `$$…$$` with a
/// non-empty interior becomes a formula. `Node::ListItem` yields `None`:
/// nothing is emitted for it here (list runs are grouped by `walk_into`).
fn add_node(&mut self, node: &Node, parent: &str) -> Option<String> {
    let self_ref = match node {
        Node::ListItem { .. } => return None,
        Node::Heading { level: 1, text } => self.add_text("title", text, parent, json!({})),
        Node::Heading { level, text } => {
            let extra = json!({ "level": level.saturating_sub(1) });
            self.add_text("section_header", text, parent, extra)
        }
        Node::Paragraph { text } => {
            // Display math: `$$ ... $$` around a non-empty body.
            let formula = text
                .trim()
                .strip_prefix("$$")
                .and_then(|rest| rest.strip_suffix("$$"))
                .filter(|latex| !latex.is_empty());
            match formula {
                Some(latex) => self.add_formula(latex, parent),
                None => self.add_text("text", text, parent, json!({})),
            }
        }
        Node::Code { language, text } => self.add_code(text, language.as_deref(), parent),
        Node::Table(table) => self.add_table(table, parent),
        Node::Picture { caption, image } => {
            self.add_picture(caption.as_deref(), image.as_ref(), parent)
        }
        Node::Group { label, children } => self.add_group(label, children, parent),
    };
    Some(self_ref)
}
/// Appends a text item labelled `label` under `parent` and returns its
/// `#/texts/N` reference.
///
/// `text` is passed through `unescape_text` and stored as both `orig` and
/// `text`. Fields of `extra` (when it is a JSON object) are merged into the
/// item, overriding same-named defaults.
fn add_text(&mut self, label: &str, text: &str, parent: &str, extra: Value) -> String {
    let index = self.texts.len();
    let self_ref = format!("#/texts/{index}");
    let content = unescape_text(text);
    let mut entry = json!({
        "self_ref": self_ref,
        "parent": { "$ref": parent },
        "children": [],
        "content_layer": "body",
        "label": label,
        "prov": [],
        "orig": content,
        "text": content,
    });
    merge(&mut entry, extra);
    self.texts.push(entry);
    self_ref
}
/// Appends a `formula` text item holding `latex` verbatim (no unescaping)
/// under `parent` and returns its `#/texts/N` reference.
fn add_formula(&mut self, latex: &str, parent: &str) -> String {
    let index = self.texts.len();
    let self_ref = format!("#/texts/{index}");
    let entry = json!({
        "self_ref": self_ref,
        "parent": { "$ref": parent },
        "children": [],
        "content_layer": "body",
        "label": "formula",
        "prov": [],
        "orig": latex,
        "text": latex,
    });
    self.texts.push(entry);
    self_ref
}
fn add_code(&mut self, text: &str, language: Option<&str>, parent: &str) -> String {
let self_ref = format!("#/texts/{}", self.texts.len());
let raw = unescape_text(text);
self.texts.push(json!({
"self_ref": self_ref,
"parent": { "$ref": parent },
"children": [],
"content_layer": "body",
"label": "code",
"prov": [],
"orig": raw,
"text": raw,
"captions": [],
"references": [],
"footnotes": [],
"code_language": code_language(language),
}));
self_ref
}
/// Emits one `list` group for `items` under `parent` and returns its
/// `#/groups/N` reference.
///
/// The nesting level of the first item is the list's base level. Every item
/// at (or above) that level becomes a direct child; the deeper items that
/// immediately follow it are emitted as nested lists attached to that item's
/// `children`.
///
/// # Panics
/// Panics if `items` is empty, or (via `add_list_item`) if a base-level entry
/// is not a `Node::ListItem`.
fn add_list(&mut self, items: &[Node], parent: &str) -> String {
    // Reserve the group's slot first so nested lists get later indices.
    let slot = self.groups.len();
    let self_ref = format!("#/groups/{slot}");
    self.groups.push(Value::Null);

    let base = level_of(&items[0]);
    let mut children = Vec::new();
    let mut cursor = 0;
    while cursor < items.len() {
        if level_of(&items[cursor]) > base {
            cursor += 1;
            continue;
        }
        let item_ref = self.add_list_item(&items[cursor], &self_ref);

        // Deeper items directly after this one belong to it.
        let first_nested = cursor + 1;
        let nested_len = items[first_nested..]
            .iter()
            .take_while(|node| level_of(node) > base)
            .count();
        let next = first_nested + nested_len;
        if nested_len > 0 {
            let mut nested = Vec::new();
            self.add_sibling_lists(&items[first_nested..next], &item_ref, &mut nested);
            if let Some(idx) = ref_index(&item_ref) {
                self.texts[idx]["children"]
                    .as_array_mut()
                    .unwrap()
                    .extend(nested);
            }
        }

        children.push(json!({ "$ref": item_ref }));
        cursor = next;
    }

    // Fill the reserved slot now that all children are known.
    self.groups[slot] = json!({
        "self_ref": self_ref,
        "parent": { "$ref": parent },
        "children": children,
        "content_layer": "body",
        "name": "list",
        "label": "list",
    });
    self_ref
}
/// Appends a `list_item` text item for `node` under `parent` and returns its
/// `#/texts/N` reference.
///
/// Ordered items get the marker `"<number>."`; unordered items get `"-"`.
///
/// # Panics
/// Panics if `node` is not a `Node::ListItem`.
fn add_list_item(&mut self, node: &Node, parent: &str) -> String {
    let (ordered, number, text) = match node {
        Node::ListItem {
            ordered,
            number,
            text,
            ..
        } => (ordered, number, text),
        _ => unreachable!(),
    };
    let index = self.texts.len();
    let self_ref = format!("#/texts/{index}");
    let content = unescape_text(text);
    let marker = match *ordered {
        true => format!("{number}."),
        false => String::from("-"),
    };
    let entry = json!({
        "self_ref": self_ref,
        "parent": { "$ref": parent },
        "children": [],
        "content_layer": "body",
        "label": "list_item",
        "prov": [],
        "orig": content,
        "text": content,
        "enumerated": ordered,
        "marker": marker,
    });
    self.texts.push(entry);
    self_ref
}
fn add_table(&mut self, t: &Table, parent: &str) -> String {
let self_ref = format!("#/tables/{}", self.tables.len());
let num_rows = t.rows.len();
let num_cols = t.rows.iter().map(Vec::len).max().unwrap_or(0);
let mut grid = Vec::with_capacity(num_rows);
let mut cells = Vec::new();
for (r, row) in t.rows.iter().enumerate() {
let mut grid_row = Vec::with_capacity(num_cols);
for c in 0..num_cols {
let text = row.get(c).map(|s| unescape_text(s)).unwrap_or_default();
let cell = json!({
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": r,
"end_row_offset_idx": r + 1,
"start_col_offset_idx": c,
"end_col_offset_idx": c + 1,
"text": text,
"column_header": r == 0,
"row_header": false,
"row_section": false,
"fillable": false,
});
grid_row.push(cell.clone());
cells.push(cell);
}
grid.push(grid_row);
}
self.tables.push(json!({
"self_ref": self_ref,
"parent": { "$ref": parent },
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": cells,
"num_rows": num_rows,
"num_cols": num_cols,
"grid": grid,
},
"annotations": [],
}));
self_ref
}
fn add_picture(
&mut self,
caption: Option<&str>,
image: Option<&crate::PictureImage>,
parent: &str,
) -> String {
let self_ref = format!("#/pictures/{}", self.pictures.len());
let mut captions = Vec::new();
if let Some(cap) = caption.filter(|c| !c.is_empty()) {
let cap_ref = self.add_text("caption", cap, &self_ref, json!({}));
captions.push(json!({ "$ref": cap_ref }));
}
let mut item = json!({
"self_ref": self_ref,
"parent": { "$ref": parent },
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": captions,
"references": [],
"footnotes": [],
"annotations": [],
});
if let Some(img) = image {
item["image"] = json!({
"mimetype": img.mimetype,
"dpi": 72,
"size": { "width": img.width, "height": img.height },
"uri": img.data_uri(),
});
}
self.pictures.push(item);
self_ref
}
/// Emits a group labelled `label` containing `nodes` under `parent` and
/// returns its `#/groups/N` reference.
///
/// The group's `name` equals its label, except that the `"inline"` label is
/// named `"group"`.
fn add_group(&mut self, label: &str, nodes: &[Node], parent: &str) -> String {
    // Reserve the slot before walking the children so this group keeps the
    // lower index.
    let slot = self.groups.len();
    let self_ref = format!("#/groups/{slot}");
    self.groups.push(Value::Null);

    let children = self.walk_into(nodes, &self_ref);
    let name = match label {
        "inline" => "group",
        other => other,
    };
    self.groups[slot] = json!({
        "self_ref": self_ref,
        "parent": { "$ref": parent },
        "children": children,
        "content_layer": "body",
        "name": name,
        "label": label,
    });
    self_ref
}
/// Emits all `nodes` as children of `parent` and returns their `$ref`
/// objects in document order.
///
/// Each maximal run of consecutive `Node::ListItem`s is handed to
/// `add_sibling_lists`; every other node goes through `add_node`.
fn walk_into(&mut self, nodes: &[Node], parent: &str) -> Vec<Value> {
    let mut refs = Vec::new();
    let mut rest = nodes;
    while let Some((head, tail)) = rest.split_first() {
        if matches!(head, Node::ListItem { .. }) {
            // Length of the leading run of list items.
            let run_len = rest
                .iter()
                .position(|node| !matches!(node, Node::ListItem { .. }))
                .unwrap_or(rest.len());
            let (run, after) = rest.split_at(run_len);
            self.add_sibling_lists(run, parent, &mut refs);
            rest = after;
        } else {
            if let Some(item_ref) = self.add_node(head, parent) {
                refs.push(json!({ "$ref": item_ref }));
            }
            rest = tail;
        }
    }
    refs
}
/// Splits a run of list items into one or more sibling lists under `parent`,
/// appending a `$ref` object for each emitted list to `out`.
///
/// Only items at the level of the first item are considered for splitting.
/// A new list starts at such an item when it is flagged `first_in_list`, when
/// it switches between ordered and unordered, or when an ordered item's
/// number is not exactly one more than the previous base-level item's number.
/// An empty `run` emits nothing.
fn add_sibling_lists(&mut self, run: &[Node], parent: &str, out: &mut Vec<Value>) {
    let Some(first) = run.first() else {
        return;
    };
    let base = level_of(first);
    let mut seg = 0;
    let mut prev: Option<(bool, u64)> = None;
    for (k, node) in run.iter().enumerate() {
        let Node::ListItem {
            ordered,
            number,
            first_in_list,
            level,
            ..
        } = node
        else {
            continue;
        };
        if *level != base {
            continue;
        }
        if let Some((prev_ordered, prev_number)) = prev {
            // `checked_add` keeps a previous number of `u64::MAX` from
            // overflowing; nothing can follow it, so the sequence breaks.
            let breaks_sequence = *ordered && prev_number.checked_add(1) != Some(*number);
            if k > seg && (*first_in_list || prev_ordered != *ordered || breaks_sequence) {
                let list_ref = self.add_list(&run[seg..k], parent);
                out.push(json!({ "$ref": list_ref }));
                seg = k;
            }
        }
        prev = Some((*ordered, *number));
    }
    let list_ref = self.add_list(&run[seg..], parent);
    out.push(json!({ "$ref": list_ref }));
}
}
/// Returns the nesting level of a list item, or `0` for any other node.
fn level_of(node: &Node) -> u8 {
    if let Node::ListItem { level, .. } = node {
        *level
    } else {
        0
    }
}
/// Extracts the numeric index that follows the last `/` of a reference such
/// as `#/groups/3` (the whole string is used when it contains no `/`).
///
/// # Panics
/// Panics if that trailing segment is not a valid `usize`.
fn group_index(self_ref: &str) -> usize {
    let tail = match self_ref.rfind('/') {
        Some(slash) => &self_ref[slash + 1..],
        None => self_ref,
    };
    tail.parse().unwrap()
}
/// Extracts the numeric index that follows the last `/` of a reference such
/// as `#/texts/3` (the whole string is used when it contains no `/`).
///
/// Returns `None` when that trailing segment is not a valid `usize`.
fn ref_index(self_ref: &str) -> Option<usize> {
    self_ref
        .rsplit_once('/')
        .map_or(self_ref, |(_, tail)| tail)
        .parse()
        .ok()
}
fn merge(target: &mut Value, extra: Value) {
if let (Some(t), Some(e)) = (target.as_object_mut(), extra.as_object()) {
for (k, v) in e {
t.insert(k.clone(), v.clone());
}
}
}
/// Decodes the escapes that may appear in node text before it is exported.
///
/// The HTML entities `&lt;`, `&gt;` and `&amp;` become `<`, `>` and `&`, and
/// the Markdown escape `\_` becomes `_`. All other text is returned unchanged.
fn unescape_text(s: &str) -> String {
    // `&amp;` is decoded after the angle-bracket entities so that a
    // double-escaped `&amp;lt;` yields the literal text `&lt;`, not `<`.
    s.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
        .replace("\\_", "_")
}
/// Computes the 64-bit FNV-1a hash of the UTF-8 bytes of `s`.
fn fnv1a(s: &str) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const PRIME: u64 = 0x100000001b3;
    s.bytes().fold(OFFSET_BASIS, |hash, byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
#[cfg(test)]
mod tests {
use crate::{DoclingDocument, ImageMode, Node, PictureImage, Table};
use serde_json::Value;
/// Builds a document named `t` holding a single captioned picture whose
/// image payload is the six bytes `foobar`, declared as a 4x2 PNG.
fn doc_with_image() -> DoclingDocument {
    let image = PictureImage {
        mimetype: "image/png".into(),
        width: 4,
        height: 2,
        data: b"foobar".to_vec(),
    };
    let picture = Node::Picture {
        caption: Some("Fig 1".into()),
        image: Some(image),
    };
    let mut doc = DoclingDocument::new("t");
    doc.push(picture);
    doc
}
/// Checks how a picture's image appears in each markdown image mode and in
/// the JSON export.
#[test]
fn picture_image_in_markdown_modes_and_json() {
    let doc = doc_with_image();

    // Default markdown export only emits a placeholder comment.
    assert!(doc.export_to_markdown().contains("<!-- image -->"));

    // Embedded mode: the markdown must carry the image inline as a data URI
    // and produce no side files. (`contains("")` would always be true, so the
    // expected payload is asserted explicitly.)
    let (md, files) = doc.export_to_markdown_with_images(ImageMode::Embedded, "artifacts");
    assert!(
        md.contains("data:image/png;base64,Zm9vYmFy"),
        "got:\n{md}"
    );
    assert!(files.is_empty());

    // Referenced mode: the markdown must point at the artifact file that is
    // returned alongside it.
    let (md, files) = doc.export_to_markdown_with_images(ImageMode::Referenced, "artifacts");
    assert!(
        md.contains("artifacts/image_000000.png"),
        "got:\n{md}"
    );
    assert_eq!(
        files,
        vec![("artifacts/image_000000.png".to_string(), b"foobar".to_vec())]
    );

    // JSON export embeds the image metadata and data URI on the picture item.
    let v: Value = serde_json::from_str(&doc.export_to_json()).unwrap();
    assert_eq!(v["pictures"][0]["image"]["mimetype"], "image/png");
    assert_eq!(v["pictures"][0]["image"]["size"]["width"], 4);
    assert_eq!(
        v["pictures"][0]["image"]["uri"],
        "data:image/png;base64,Zm9vYmFy"
    );
}
/// End-to-end check of the JSON export for a small document containing a
/// title, a section header, a paragraph, a two-item bullet list and a table.
#[test]
fn exports_docling_schema() {
    let mut doc = DoclingDocument::new("t");
    doc.push(Node::Heading {
        level: 1,
        text: "Title".into(),
    });
    doc.push(Node::Heading {
        level: 2,
        text: "Sec".into(),
    });
    doc.push(Node::Paragraph {
        text: "Body & more".into(),
    });
    doc.push(Node::ListItem {
        ordered: false,
        number: 0,
        first_in_list: true,
        text: "one".into(),
        level: 0,
    });
    doc.push(Node::ListItem {
        ordered: false,
        number: 0,
        first_in_list: false,
        text: "two".into(),
        level: 0,
    });
    doc.push(Node::Table(Table {
        rows: vec![vec!["A".into(), "B".into()]],
    }));

    let exported = doc.export_to_json();
    let v: Value = serde_json::from_str(&exported).unwrap();

    // Envelope.
    assert_eq!(v["schema_name"], "DoclingDocument");
    assert_eq!(v["version"], "1.10.0");

    // Headings and paragraph.
    assert_eq!(v["texts"][0]["label"], "title");
    assert_eq!(v["texts"][1]["label"], "section_header");
    assert_eq!(v["texts"][1]["level"], 1);
    assert_eq!(v["texts"][2]["text"], "Body & more");

    // Both bullets land in a single list group.
    assert_eq!(v["groups"][0]["label"], "list");
    assert_eq!(v["groups"][0]["children"].as_array().unwrap().len(), 2);
    assert_eq!(v["texts"][3]["parent"]["$ref"], "#/groups/0");
    assert_eq!(v["texts"][3]["marker"], "-");

    // Table shape and header flag on the first row.
    assert_eq!(v["tables"][0]["data"]["num_cols"], 2);
    assert_eq!(v["tables"][0]["data"]["grid"][0][0]["column_header"], true);
}
}