use roxmltree::{Document, Node};
use crate::error::{KreuzbergError, Result};
use crate::text::utf8_validation;
use super::elements::{
ElementPosition, Formatting, ImageReference, ListElement, ListItem, ParsedContent, Run, SlideElement, TableCell,
TableElement, TableRow, TextElement,
};
use crate::extraction::ooxml_constants::{DRAWINGML_NAMESPACE, PRESENTATIONML_NAMESPACE, RELATIONSHIPS_NAMESPACE};
pub(super) fn parse_slide_xml(xml_data: &[u8]) -> Result<Vec<SlideElement>> {
let xml_str = utf8_validation::from_utf8(xml_data)
.map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
let doc =
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse slide XML: {}", e)))?;
let root = doc.root_element();
let ns = root.tag_name().namespace();
let c_sld = root
.descendants()
.find(|n| n.tag_name().name() == "cSld" && n.tag_name().namespace() == ns)
.ok_or_else(|| KreuzbergError::parsing("No <p:cSld> tag found".to_string()))?;
let sp_tree = c_sld
.children()
.find(|n| n.tag_name().name() == "spTree" && n.tag_name().namespace() == ns)
.ok_or_else(|| KreuzbergError::parsing("No <p:spTree> tag found".to_string()))?;
let mut elements = Vec::new();
for child_node in sp_tree.children().filter(|n| n.is_element()) {
elements.extend(parse_group(&child_node)?);
}
Ok(elements)
}
fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
let mut elements = Vec::new();
let tag_name = node.tag_name().name();
let namespace = node.tag_name().namespace().unwrap_or("");
if namespace != PRESENTATIONML_NAMESPACE {
return Ok(elements);
}
let position = extract_position(node);
match tag_name {
"sp" => {
let position = extract_position(node);
if let Some(content) = parse_sp(node)? {
match content {
ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
}
}
}
"graphicFrame" => {
if let Some(graphic_element) = parse_graphic_frame(node)? {
elements.push(SlideElement::Table(graphic_element, position));
}
}
"pic" => {
let image_reference = parse_pic(node)?;
elements.push(SlideElement::Image(image_reference, position));
}
"grpSp" => {
for child in node.children().filter(|n| n.is_element()) {
elements.extend(parse_group(&child)?);
}
}
_ => elements.push(SlideElement::Unknown),
}
Ok(elements)
}
fn is_title_placeholder(sp_node: &Node) -> bool {
let nv_sp_pr = sp_node
.children()
.find(|n| n.tag_name().name() == "nvSpPr" && n.tag_name().namespace() == Some(PRESENTATIONML_NAMESPACE));
if let Some(nv_sp_pr) = nv_sp_pr {
let nv_pr = nv_sp_pr
.children()
.find(|n| n.tag_name().name() == "nvPr" && n.tag_name().namespace() == Some(PRESENTATIONML_NAMESPACE));
if let Some(nv_pr) = nv_pr {
let ph = nv_pr
.children()
.find(|n| n.tag_name().name() == "ph" && n.tag_name().namespace() == Some(PRESENTATIONML_NAMESPACE));
if let Some(ph) = ph {
return matches!(ph.attribute("type"), Some("title") | Some("ctrTitle"));
}
}
}
false
}
fn parse_sp(sp_node: &Node) -> Result<Option<ParsedContent>> {
let tx_body_node = match sp_node
.children()
.find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(PRESENTATIONML_NAMESPACE))
{
Some(node) => node,
None => return Ok(None), };
let is_title = is_title_placeholder(sp_node);
let is_list = !is_title
&& tx_body_node.descendants().any(|n| {
n.is_element()
&& n.tag_name().name() == "pPr"
&& n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
&& (n.attribute("lvl").is_some()
|| n.children().any(|child| {
child.is_element()
&& (child.tag_name().name() == "buAutoNum" || child.tag_name().name() == "buChar")
}))
});
if is_list {
Ok(Some(ParsedContent::List(parse_list(&tx_body_node)?)))
} else {
let mut text_el = parse_text(&tx_body_node)?;
text_el.is_title = is_title;
Ok(Some(ParsedContent::Text(text_el)))
}
}
pub(super) fn parse_text(tx_body_node: &Node) -> Result<TextElement> {
let mut runs = Vec::new();
for p_node in tx_body_node.children().filter(|n| {
n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
let mut paragraph_runs = parse_paragraph(&p_node, true)?;
runs.append(&mut paragraph_runs);
}
Ok(TextElement { runs, is_title: false })
}
fn parse_graphic_frame(node: &Node) -> Result<Option<TableElement>> {
let graphic_data_node = node.descendants().find(|n| {
n.is_element()
&& n.tag_name().name() == "graphicData"
&& n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
&& n.attribute("uri") == Some("http://schemas.openxmlformats.org/drawingml/2006/table")
});
if let Some(graphic_data) = graphic_data_node
&& let Some(tbl_node) = graphic_data.children().find(|n| {
n.is_element() && n.tag_name().name() == "tbl" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
})
{
let table = parse_table(&tbl_node)?;
return Ok(Some(table));
}
Ok(None)
}
fn parse_table(tbl_node: &Node) -> Result<TableElement> {
let mut rows = Vec::new();
for tr_node in tbl_node.children().filter(|n| {
n.is_element() && n.tag_name().name() == "tr" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
let row = parse_table_row(&tr_node)?;
rows.push(row);
}
Ok(TableElement { rows })
}
fn parse_table_row(tr_node: &Node) -> Result<TableRow> {
let mut cells = Vec::new();
for tc_node in tr_node.children().filter(|n| {
n.is_element() && n.tag_name().name() == "tc" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
let cell = parse_table_cell(&tc_node)?;
cells.push(cell);
}
Ok(TableRow { cells })
}
fn parse_table_cell(tc_node: &Node) -> Result<TableCell> {
let mut runs = Vec::new();
if let Some(tx_body_node) = tc_node.children().find(|n| {
n.is_element() && n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
for p_node in tx_body_node.children().filter(|n| {
n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
let mut paragraph_runs = parse_paragraph(&p_node, false)?;
runs.append(&mut paragraph_runs);
}
}
Ok(TableCell { runs })
}
fn parse_pic(pic_node: &Node) -> Result<ImageReference> {
let blip_node = pic_node
.descendants()
.find(|n| {
n.is_element() && n.tag_name().name() == "blip" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
})
.ok_or_else(|| KreuzbergError::parsing("Image blip not found".to_string()))?;
let embed_attr = blip_node
.attribute((RELATIONSHIPS_NAMESPACE, "embed"))
.or_else(|| blip_node.attribute("r:embed"))
.ok_or_else(|| KreuzbergError::parsing("Image embed attribute not found".to_string()))?;
let description = pic_node
.descendants()
.find(|n| {
n.is_element()
&& n.tag_name().name() == "cNvPr"
&& n.tag_name().namespace() == Some(PRESENTATIONML_NAMESPACE)
})
.or_else(|| {
pic_node.descendants().find(|n| {
n.is_element()
&& n.tag_name().name() == "cNvPr"
&& n.parent().is_some_and(|p| p.tag_name().name() == "nvPicPr")
})
})
.and_then(|cnv| cnv.attribute("descr"))
.filter(|d| !d.is_empty())
.map(|d| d.to_string());
let image_ref = ImageReference {
id: embed_attr.to_string(),
target: String::new(),
description,
};
Ok(image_ref)
}
fn parse_list(tx_body_node: &Node) -> Result<ListElement> {
let mut items = Vec::new();
for p_node in tx_body_node.children().filter(|n| {
n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
let (level, is_ordered, has_bullet) = parse_list_properties(&p_node)?;
let runs = parse_paragraph(&p_node, true)?;
items.push(ListItem {
level,
is_ordered,
runs,
has_bullet,
});
}
Ok(ListElement { items })
}
fn parse_list_properties(p_node: &Node) -> Result<(u32, bool, bool)> {
let mut level = 1;
let mut is_ordered = false;
let mut has_bullet = false;
if let Some(p_pr_node) = p_node.children().find(|n| {
n.is_element() && n.tag_name().name() == "pPr" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
if let Some(lvl_attr) = p_pr_node.attribute("lvl") {
level = lvl_attr.parse::<u32>().unwrap_or(0) + 1;
}
is_ordered = p_pr_node.children().any(|n| {
n.is_element()
&& n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
&& n.tag_name().name() == "buAutoNum"
});
let has_bu_none = p_pr_node.children().any(|n| {
n.is_element() && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE) && n.tag_name().name() == "buNone"
});
has_bullet = !has_bu_none
&& (is_ordered
|| p_pr_node.children().any(|n| {
n.is_element()
&& n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
&& n.tag_name().name() == "buChar"
})
|| p_pr_node.attribute("lvl").is_some());
}
Ok((level, is_ordered, has_bullet))
}
fn parse_paragraph(p_node: &Node, add_new_line: bool) -> Result<Vec<Run>> {
let run_nodes: Vec<_> = p_node
.children()
.filter(|n| {
n.is_element() && n.tag_name().name() == "r" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
})
.collect();
let count = run_nodes.len();
let mut runs: Vec<Run> = Vec::new();
for (idx, r_node) in run_nodes.iter().enumerate() {
let mut run = parse_run(r_node)?;
if add_new_line && idx == count - 1 {
run.text.push('\n');
}
runs.push(run);
}
Ok(runs)
}
fn parse_run(r_node: &Node) -> Result<Run> {
let mut text = String::new();
let mut formatting = Formatting::default();
let mut hyperlink_id = None;
if let Some(r_pr_node) = r_node.children().find(|n| {
n.is_element() && n.tag_name().name() == "rPr" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
if let Some(b_attr) = r_pr_node.attribute("b") {
formatting.bold = b_attr == "1" || b_attr.eq_ignore_ascii_case("true");
}
if let Some(i_attr) = r_pr_node.attribute("i") {
formatting.italic = i_attr == "1" || i_attr.eq_ignore_ascii_case("true");
}
if let Some(u_attr) = r_pr_node.attribute("u") {
formatting.underlined = u_attr != "none";
}
if let Some(strike_attr) = r_pr_node.attribute("strike") {
formatting.strikethrough = matches!(strike_attr, "sngStrike" | "dblStrike");
}
if let Some(sz_attr) = r_pr_node.attribute("sz") {
formatting.font_size = sz_attr.parse::<u32>().ok();
}
if let Some(lang_attr) = r_pr_node.attribute("lang") {
formatting.lang = lang_attr.to_string();
}
if let Some(hlink_node) = r_pr_node.children().find(|n| {
n.is_element()
&& n.tag_name().name() == "hlinkClick"
&& n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE)
}) {
hyperlink_id = hlink_node
.attribute((RELATIONSHIPS_NAMESPACE, "id"))
.or_else(|| hlink_node.attribute("r:id"))
.map(|s| s.to_string());
}
}
if let Some(t_node) = r_node
.children()
.find(|n| n.is_element() && n.tag_name().name() == "t" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE))
&& let Some(t) = t_node.text()
{
text.push_str(t);
}
Ok(Run {
text,
formatting,
hyperlink_id,
})
}
pub(super) fn extract_position(node: &Node) -> ElementPosition {
let default = ElementPosition::default();
node.descendants()
.find(|n| n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE) && n.tag_name().name() == "xfrm")
.and_then(|xfrm| {
let x = xfrm
.children()
.find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE))
.and_then(|off| off.attribute("x")?.parse::<i64>().ok())?;
let y = xfrm
.children()
.find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE))
.and_then(|off| off.attribute("y")?.parse::<i64>().ok())?;
let (cx, cy) = xfrm
.children()
.find(|n| n.tag_name().name() == "ext" && n.tag_name().namespace() == Some(DRAWINGML_NAMESPACE))
.map(|ext| {
let cx = ext.attribute("cx").and_then(|v| v.parse::<i64>().ok()).unwrap_or(0);
let cy = ext.attribute("cy").and_then(|v| v.parse::<i64>().ok()).unwrap_or(0);
(cx, cy)
})
.unwrap_or((0, 0));
Some(ElementPosition { x, y, cx, cy })
})
.unwrap_or(default)
}
pub(super) struct SlideRels {
pub(super) images: Vec<ImageReference>,
pub(super) hyperlinks: Vec<super::elements::HyperlinkReference>,
}
pub(super) fn parse_slide_rels(rels_data: &[u8]) -> Result<SlideRels> {
let xml_str = utf8_validation::from_utf8(rels_data)
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in rels XML: {}", e)))?;
let doc =
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse rels XML: {}", e)))?;
let mut images = Vec::new();
let mut hyperlinks = Vec::new();
for node in doc.descendants() {
if node.has_tag_name("Relationship")
&& let Some(rel_type) = node.attribute("Type")
&& let (Some(id), Some(target)) = (node.attribute("Id"), node.attribute("Target"))
{
if rel_type.contains("image") {
images.push(ImageReference {
id: id.to_string(),
target: target.to_string(),
description: None,
});
} else if rel_type.contains("hyperlink") {
hyperlinks.push(super::elements::HyperlinkReference {
id: id.to_string(),
url: target.to_string(),
});
}
}
}
Ok(SlideRels { images, hyperlinks })
}
pub(super) fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
let xml_str = utf8_validation::from_utf8(rels_data)
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in presentation rels: {}", e)))?;
let doc = Document::parse(xml_str)
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse presentation rels: {}", e)))?;
let mut slide_paths = Vec::new();
for node in doc.descendants() {
if node.has_tag_name("Relationship")
&& let Some(rel_type) = node.attribute("Type")
&& rel_type.contains("slide")
&& !rel_type.contains("slideMaster")
&& let Some(target) = node.attribute("Target")
{
let normalized_target = target.strip_prefix('/').unwrap_or(target);
let final_path = if normalized_target.starts_with("ppt/") {
normalized_target.to_string()
} else {
format!("ppt/{}", normalized_target)
};
slide_paths.push(final_path);
}
}
slide_paths.sort_by(|a, b| {
fn slide_num(s: &str) -> u32 {
s.rsplit('/')
.next()
.unwrap_or("")
.strip_prefix("slide")
.unwrap_or("")
.strip_suffix(".xml")
.unwrap_or("")
.parse()
.unwrap_or(u32::MAX)
}
slide_num(a).cmp(&slide_num(b))
});
Ok(slide_paths)
}
#[cfg(test)]
mod tests {
#[test]
fn test_slide_paths_sorted_numerically() {
let mut paths = vec![
"ppt/slides/slide1.xml".to_string(),
"ppt/slides/slide10.xml".to_string(),
"ppt/slides/slide2.xml".to_string(),
"ppt/slides/slide20.xml".to_string(),
"ppt/slides/slide3.xml".to_string(),
"ppt/slides/slide11.xml".to_string(),
];
paths.sort_by(|a, b| {
fn slide_num(s: &str) -> u32 {
s.rsplit('/')
.next()
.unwrap_or("")
.strip_prefix("slide")
.unwrap_or("")
.strip_suffix(".xml")
.unwrap_or("")
.parse()
.unwrap_or(u32::MAX)
}
slide_num(a).cmp(&slide_num(b))
});
assert_eq!(
paths,
vec![
"ppt/slides/slide1.xml",
"ppt/slides/slide2.xml",
"ppt/slides/slide3.xml",
"ppt/slides/slide10.xml",
"ppt/slides/slide11.xml",
"ppt/slides/slide20.xml",
]
);
}
}