use crate::charts;
use crate::container::OoxmlContainer;
use crate::error::{Error, Result};
use crate::model::{
Block, Cell, Document, HeadingLevel, Metadata, Paragraph, Resource, ResourceType, RevisionType,
Row, Section, Table, TextRun, TextStyle,
};
use std::collections::HashMap;
use std::path::Path;
/// One `sldId` entry read from `ppt/presentation.xml`.
#[derive(Debug, Clone)]
struct SlideInfo {
// The value of the unprefixed `id` attribute. It is stored when the presentation
// part is parsed but is not read by the code visible in this file, hence the
// lint allowance.
#[allow(dead_code)]
id: String,
/// Relationship id used to look the slide's target up in the presentation relationships.
rel_id: String,
}
/// Parser for PPTX packages that produces a [`Document`] with one section per slide.
pub struct PptxParser {
/// The opened OOXML package that all parts are read from.
container: OoxmlContainer,
/// Slides listed in `ppt/presentation.xml`, in the order they appear there.
slides: Vec<SlideInfo>,
/// Relationship id -> target map read for `ppt/presentation.xml`.
relationships: HashMap<String, String>,
}
impl PptxParser {
/// Opens the PPTX package at `path` and reads its slide list.
///
/// # Errors
/// Returns an error when the container cannot be opened or when the
/// presentation part or its relationships cannot be read.
pub fn open(path: impl AsRef<Path>) -> Result<Self> {
    Self::from_container(OoxmlContainer::open(path)?)
}
/// Builds a parser from the raw bytes of a PPTX package held in memory.
///
/// # Errors
/// Returns an error when the bytes cannot be opened as a container or when
/// the presentation part or its relationships cannot be read.
pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
    Self::from_container(OoxmlContainer::from_bytes(data)?)
}
fn from_container(container: OoxmlContainer) -> Result<Self> {
let relationships = Self::parse_presentation_rels(&container)?;
let slides = Self::parse_presentation(&container)?;
Ok(Self {
container,
slides,
relationships,
})
}
/// Reads the relationships of `ppt/presentation.xml` as an id -> target map.
///
/// The relationship part is required; its absence is reported as an error
/// by the container.
fn parse_presentation_rels(container: &OoxmlContainer) -> Result<HashMap<String, String>> {
    let rels = container.read_required_relationships_for_part("ppt/presentation.xml")?;
    Ok(rels.into_targets_by_id())
}
fn parse_presentation(container: &OoxmlContainer) -> Result<Vec<SlideInfo>> {
let mut slides = Vec::new();
let xml = container.read_xml("ppt/presentation.xml")?;
let mut reader = quick_xml::Reader::from_str(&xml);
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Empty(e)) | Ok(quick_xml::events::Event::Start(e)) => {
let name = e.name();
let local_name = name.local_name();
if local_name.as_ref() == b"sldId" {
let mut id = String::new();
let mut rel_id = String::new();
for attr in e.attributes().flatten() {
match attr.key.as_ref() {
b"id" => {
id = String::from_utf8_lossy(&attr.value).to_string();
}
key if key.ends_with(b"id") && key != b"id" && key.len() > 2 => {
rel_id = String::from_utf8_lossy(&attr.value).to_string();
}
_ => {}
}
}
if !rel_id.is_empty() {
slides.push(SlideInfo { id, rel_id });
}
}
}
Ok(quick_xml::events::Event::Eof) => break,
Err(e) => return Err(Error::XmlParse(e.to_string())),
_ => {}
}
buf.clear();
}
Ok(slides)
}
/// Parses the whole presentation into a [`Document`].
///
/// The document receives the core metadata, every media resource that has a
/// file name, and one [`Section`] per slide (named `Slide N`). A slide whose
/// relationship id has no target still yields an (empty) section. Speaker
/// notes are looked up by rewriting the slide path from `slides/slideN` to
/// `notesSlides/notesSlideN`; a missing notes part is not an error.
///
/// # Errors
/// Returns the first error raised while reading metadata, resources,
/// relationships, slide parts or notes parts.
pub fn parse(&mut self) -> Result<Document> {
    let mut doc = Document::new();
    doc.metadata = self.parse_metadata()?;

    // The file name is the key under which a resource is registered, so
    // resources without one cannot be added.
    for resource in self.extract_resources()? {
        if let Some(filename) = resource.filename.clone() {
            doc.add_resource(filename, resource);
        }
    }

    // NOTE(review): the clone looks unnecessary if every helper called in
    // the loop only needs `&self` — confirm before removing it.
    for (idx, slide) in self.slides.clone().iter().enumerate() {
        let mut section = Section::new(idx);
        section.name = Some(format!("Slide {}", idx + 1));

        if let Some(target) = self.relationships.get(&slide.rel_id) {
            // Targets starting with `/` are package-absolute; everything
            // else is taken relative to the `ppt/` directory.
            let slide_path = match target.strip_prefix('/') {
                Some(absolute) => absolute.to_string(),
                None => format!("ppt/{}", target),
            };

            let slide_rels = self.parse_slide_relationships(&slide_path)?;
            if let Some(xml) = self.container.read_xml_optional(&slide_path)? {
                let blocks = self.parse_slide_content_with_rels(&xml, &slide_rels, &slide_path)?;
                for block in blocks {
                    section.add_block(block);
                }
            }

            let notes_path = slide_path
                .replace("slides/slide", "notesSlides/notesSlide")
                .replace("slides\\slide", "notesSlides\\notesSlide");
            if let Some(xml) = self.container.read_xml_optional(&notes_path)? {
                let notes_rels = self.parse_slide_relationships(&notes_path)?;
                let notes = self.parse_notes_with_rels(&xml, &notes_rels)?;
                if !notes.is_empty() {
                    section.notes = Some(notes);
                }
            }
        }

        doc.add_section(section);
    }

    Ok(doc)
}
/// Reads the relationships that belong to the part at `slide_path` as an
/// id -> target map, using the container's optional-relationships reader.
fn parse_slide_relationships(&self, slide_path: &str) -> Result<HashMap<String, String>> {
    let rels = self
        .container
        .read_optional_relationships_for_part(slide_path)?;
    Ok(rels.into_targets_by_id())
}
/// Reads the core metadata from the container and sets `page_count` to the
/// number of slides listed in the presentation.
fn parse_metadata(&self) -> Result<Metadata> {
    let slide_total = self.slides.len() as u32;
    let mut metadata = self.container.parse_core_metadata()?;
    metadata.page_count = Some(slide_total);
    Ok(metadata)
}
/// Extracts every paragraph from slide XML.
///
/// Thin wrapper over `parse_text_content`, which passes an empty relationship
/// map, so hyperlinks are never resolved. Not called by the code visible in
/// this file, hence the lint allowance.
#[allow(dead_code)]
fn parse_slide(&self, xml: &str) -> Result<Vec<Paragraph>> {
self.parse_text_content(xml)
}
/// Extracts the blocks of a slide without any relationship information.
///
/// Delegates to `parse_slide_content_with_rels` with an empty relationship
/// map and an empty slide path, so hyperlinks, images and charts (all of which
/// are resolved through relationships) cannot be found. Not called by the code
/// visible in this file, hence the lint allowance.
#[allow(dead_code)]
fn parse_slide_content(&self, xml: &str) -> Result<Vec<Block>> {
self.parse_slide_content_with_rels(xml, &HashMap::new(), "")
}
/// Collects all blocks of one slide.
///
/// Blocks are grouped by kind rather than by position on the slide: first the
/// paragraphs found outside tables, then the tables, then one table per
/// chart, and finally the images.
///
/// * `xml` - the slide part's XML.
/// * `rels` - relationship id -> target map of the slide part.
/// * `slide_path` - package path of the slide, used to resolve chart targets.
fn parse_slide_content_with_rels(
    &self,
    xml: &str,
    rels: &HashMap<String, String>,
    slide_path: &str,
) -> Result<Vec<Block>> {
    let paragraphs = self.parse_text_content_excluding_tables_with_rels(xml, rels)?;
    let mut blocks: Vec<Block> = paragraphs.into_iter().map(Block::Paragraph).collect();

    let tables = self.parse_tables_with_rels(xml, rels)?;
    blocks.extend(tables.into_iter().map(Block::Table));

    let chart_tables = self.parse_charts(rels, slide_path)?;
    blocks.extend(chart_tables.into_iter().map(Block::Table));

    let images = self.parse_images(xml, rels)?;
    blocks.extend(images);

    Ok(blocks)
}
/// Extracts one [`Block::Image`] per `pic` element whose embedded image
/// relationship can be resolved through `rels`.
///
/// For every picture the parser records:
/// * the `embed` relationship id of the `blip` inside `blipFill`,
/// * the `cx`/`cy` attributes of an `ext` element inside `spPr` (kept as the
///   raw numbers found in the XML; values that do not parse as `u32` are
///   ignored),
/// * the `descr` and `name` attributes of `cNvPr` inside `nvPicPr`.
///
/// `alt_text` is the picture's `descr` (the alternative-text attribute) when
/// it is present and not blank, and falls back to `name` otherwise. The
/// `resource_id` is the last path segment of the relationship target.
/// Pictures without a resolvable relationship are skipped.
///
/// # Errors
/// Returns `Error::XmlParse` when the XML is malformed.
fn parse_images(&self, xml: &str, rels: &HashMap<String, String>) -> Result<Vec<Block>> {
    use quick_xml::events::{BytesStart, Event};

    /// Attributes gathered for the picture that is currently open.
    #[derive(Default)]
    struct PendingPicture {
        name: Option<String>,
        description: Option<String>,
        rel_id: Option<String>,
        width: Option<u32>,
        height: Option<u32>,
    }

    /// Returns the value of the last attribute whose local name is `wanted`.
    fn attr_text(tag: &BytesStart<'_>, wanted: &[u8]) -> Option<String> {
        tag.attributes()
            .flatten()
            .filter(|attr| attr.key.local_name().as_ref() == wanted)
            .last()
            .map(|attr| String::from_utf8_lossy(&attr.value).to_string())
    }

    /// Reads `name` and a non-blank `descr` from a `cNvPr` element.
    fn read_identity(tag: &BytesStart<'_>, picture: &mut PendingPicture) {
        if let Some(name) = attr_text(tag, b"name") {
            picture.name = Some(name);
        }
        if let Some(descr) = attr_text(tag, b"descr") {
            if !descr.trim().is_empty() {
                picture.description = Some(descr);
            }
        }
    }

    /// Reads `cx`/`cy` from an `ext` element; unparsable values are ignored.
    fn read_extent(tag: &BytesStart<'_>, picture: &mut PendingPicture) {
        for attr in tag.attributes().flatten() {
            let slot = match attr.key.local_name().as_ref() {
                b"cx" => &mut picture.width,
                b"cy" => &mut picture.height,
                _ => continue,
            };
            if let Ok(value) = String::from_utf8_lossy(&attr.value).parse::<u32>() {
                *slot = Some(value);
            }
        }
    }

    /// Handles the attribute-carrying elements, which may arrive either as
    /// a start tag or as a self-closing tag.
    fn read_leaf(
        local: &[u8],
        tag: &BytesStart<'_>,
        in_nvpicpr: bool,
        in_blipfill: bool,
        in_sppr: bool,
        picture: &mut PendingPicture,
    ) {
        match local {
            b"cNvPr" if in_nvpicpr => read_identity(tag, picture),
            b"blip" if in_blipfill => {
                if let Some(rel_id) = attr_text(tag, b"embed") {
                    picture.rel_id = Some(rel_id);
                }
            }
            b"ext" if in_sppr => read_extent(tag, picture),
            _ => {}
        }
    }

    let mut images = Vec::new();
    let mut reader = quick_xml::Reader::from_str(xml);
    reader.config_mut().trim_text(true);
    let mut buf = Vec::new();

    let mut in_pic = false;
    let mut in_nvpicpr = false;
    let mut in_blipfill = false;
    let mut in_sppr = false;
    let mut pending = PendingPicture::default();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let local_name = e.name().local_name();
                match local_name.as_ref() {
                    b"pic" => {
                        in_pic = true;
                        pending = PendingPicture::default();
                    }
                    b"nvPicPr" if in_pic => in_nvpicpr = true,
                    b"blipFill" if in_pic => in_blipfill = true,
                    b"spPr" if in_pic => in_sppr = true,
                    other => read_leaf(
                        other,
                        e,
                        in_nvpicpr,
                        in_blipfill,
                        in_sppr,
                        &mut pending,
                    ),
                }
            }
            Ok(Event::Empty(ref e)) => {
                // Self-closing container elements carry nothing of interest,
                // so only the attribute-bearing leaves are inspected.
                let local_name = e.name().local_name();
                read_leaf(
                    local_name.as_ref(),
                    e,
                    in_nvpicpr,
                    in_blipfill,
                    in_sppr,
                    &mut pending,
                );
            }
            Ok(Event::End(ref e)) => {
                let local_name = e.name().local_name();
                match local_name.as_ref() {
                    b"pic" => {
                        let PendingPicture {
                            name,
                            description,
                            rel_id,
                            width,
                            height,
                        } = std::mem::take(&mut pending);
                        if let Some(target) = rel_id.and_then(|id| rels.get(&id)) {
                            let filename =
                                target.rsplit('/').next().unwrap_or(target).to_string();
                            images.push(Block::Image {
                                resource_id: filename,
                                alt_text: description.or(name),
                                width,
                                height,
                            });
                        }
                        in_pic = false;
                    }
                    b"nvPicPr" => in_nvpicpr = false,
                    b"blipFill" => in_blipfill = false,
                    b"spPr" => in_sppr = false,
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(Error::XmlParse(e.to_string())),
            _ => {}
        }
        buf.clear();
    }

    Ok(images)
}
/// Converts every chart referenced from a slide into a [`Table`].
///
/// The relationship map carries no relationship types, so chart parts are
/// recognised by their target: it must contain `chart`, must not be an
/// external URL, and must name an `.xml` part. This keeps hyperlinks or media
/// files that merely contain the word "chart" from being read as chart XML.
///
/// Charts are processed in relationship-id order (shorter ids first, then
/// lexicographically, so `rId2` precedes `rId10`), which makes the output
/// order deterministic. Charts without data are skipped. When the chart has a
/// non-empty title, the first cell of the first row becomes
/// `"<original text> (<title>)"`.
///
/// * `rels` - relationship id -> target map of the slide part.
/// * `slide_path` - package path of the slide, used to resolve `../` targets.
///
/// # Errors
/// Returns an error when a chart part cannot be read or parsed.
fn parse_charts(&self, rels: &HashMap<String, String>, slide_path: &str) -> Result<Vec<Table>> {
    /// Heuristic described in the method documentation.
    fn is_chart_part(target: &str) -> bool {
        target.contains("chart")
            && !target.contains("://")
            && target.to_ascii_lowercase().ends_with(".xml")
    }

    // `HashMap` iteration order is unspecified; sort for a stable result.
    let mut chart_rels: Vec<(&String, &String)> = rels
        .iter()
        .filter(|(_, target)| is_chart_part(target))
        .collect();
    chart_rels.sort_by(|a, b| a.0.len().cmp(&b.0.len()).then_with(|| a.0.cmp(b.0)));

    let mut tables = Vec::new();
    for (_, target) in chart_rels {
        let chart_path = if let Some(rest) = target.strip_prefix("../") {
            // `../x` is relative to the slide's directory: drop the slide's
            // file name, then one more path segment.
            let parent_dir = slide_path
                .rfind('/')
                .map(|idx| &slide_path[..idx])
                .and_then(|dir| dir.rfind('/').map(|idx| &dir[..idx]));
            match parent_dir {
                Some(dir) => format!("{}/{}", dir, rest),
                None => rest.to_string(),
            }
        } else if let Some(absolute) = target.strip_prefix('/') {
            absolute.to_string()
        } else {
            format!("ppt/{}", target)
        };

        let chart_xml = self.container.read_xml(&chart_path)?;
        let chart_data = charts::parse_chart_xml(&chart_xml)?;
        if chart_data.is_empty() {
            continue;
        }

        let mut table = chart_data.to_table();
        if let Some(ref title) = chart_data.title {
            if !title.is_empty() {
                let first_cell = table
                    .rows
                    .first_mut()
                    .and_then(|row| row.cells.first_mut());
                if let Some(cell) = first_cell {
                    let original = cell.plain_text();
                    cell.content.clear();
                    cell.content
                        .push(Paragraph::with_text(format!("{} ({})", original, title)));
                }
            }
        }
        tables.push(table);
    }

    Ok(tables)
}
/// Extracts the paragraphs of a notes part without relationship information.
///
/// Delegates to `parse_notes_with_rels` with an empty relationship map, so
/// hyperlinks are never resolved. Not called by the code visible in this file,
/// hence the lint allowance.
#[allow(dead_code)]
fn parse_notes(&self, xml: &str) -> Result<Vec<Paragraph>> {
self.parse_notes_with_rels(xml, &HashMap::new())
}
/// Extracts the paragraphs of a notes part, resolving hyperlinks through `rels`.
///
/// Notes go through the generic text extractor (`parse_text_content_with_rels`),
/// which does not skip tables and does no placeholder-based heading detection.
fn parse_notes_with_rels(
&self,
xml: &str,
rels: &HashMap<String, String>,
) -> Result<Vec<Paragraph>> {
self.parse_text_content_with_rels(xml, rels)
}
/// Extracts the tables of a slide without relationship information.
///
/// Delegates to `parse_tables_with_rels` with an empty relationship map, so
/// hyperlinks inside cells are never resolved. Not called by the code visible
/// in this file, hence the lint allowance.
#[allow(dead_code)]
fn parse_tables(&self, xml: &str) -> Result<Vec<Table>> {
self.parse_tables_with_rels(xml, &HashMap::new())
}
fn parse_tables_with_rels(
&self,
xml: &str,
rels: &HashMap<String, String>,
) -> Result<Vec<Table>> {
let mut tables = Vec::new();
let mut reader = quick_xml::Reader::from_str(xml);
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut in_table = false;
let mut in_row = false;
let mut in_cell = false;
let mut in_txbody = false;
let mut in_paragraph = false;
let mut in_run = false;
let mut in_text = false;
let mut in_rpr = false;
let mut current_table = Table::new();
let mut current_row = Row::new();
let mut current_cell = Cell::new();
let mut current_paragraphs: Vec<Paragraph> = Vec::new();
let mut current_runs: Vec<TextRun> = Vec::new();
let mut current_text = String::new();
let mut current_style = TextStyle::default();
let mut current_hyperlink: Option<String> = None;
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"tbl" => {
in_table = true;
current_table = Table::new();
}
b"tr" if in_table => {
in_row = true;
current_row = Row::new();
}
b"tc" if in_row => {
in_cell = true;
current_cell = Cell::new();
current_paragraphs.clear();
}
b"txBody" if in_cell => {
in_txbody = true;
}
b"p" if in_txbody => {
in_paragraph = true;
current_runs.clear();
}
b"r" if in_paragraph => {
in_run = true;
current_text.clear();
current_style = TextStyle::default();
current_hyperlink = None;
}
b"t" if in_run => {
in_text = true;
}
b"rPr" if in_run => {
in_rpr = true;
for attr in e.attributes().flatten() {
match attr.key.local_name().as_ref() {
b"b" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.bold = val != "0" && val != "false";
}
b"i" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.italic = val != "0" && val != "false";
}
_ => {}
}
}
}
b"hlinkClick" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(url) = rels.get(rel_id.as_ref()) {
current_hyperlink = Some(url.clone());
}
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Empty(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"rPr" if in_run => {
for attr in e.attributes().flatten() {
match attr.key.local_name().as_ref() {
b"b" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.bold = val != "0" && val != "false";
}
b"i" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.italic = val != "0" && val != "false";
}
_ => {}
}
}
}
b"hlinkClick" if in_run => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(url) = rels.get(rel_id.as_ref()) {
current_hyperlink = Some(url.clone());
}
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Text(ref e)) if in_text => {
current_text.push_str(&crate::decode::decode_text_lossy(e));
}
Ok(quick_xml::events::Event::End(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"t" => {
in_text = false;
}
b"rPr" => {
in_rpr = false;
}
b"r" => {
if !current_text.is_empty() {
current_runs.push(TextRun {
text: current_text.clone(),
style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: RevisionType::None,
});
}
in_run = false;
current_hyperlink = None;
}
b"p" if in_txbody => {
if !current_runs.is_empty() {
current_paragraphs.push(Paragraph {
runs: current_runs.clone(),
..Default::default()
});
}
in_paragraph = false;
}
b"txBody" => {
in_txbody = false;
}
b"tc" => {
current_cell.content = current_paragraphs.clone();
current_row.add_cell(current_cell.clone());
in_cell = false;
}
b"tr" => {
if !current_row.is_empty() {
if current_table.is_empty() {
current_row.is_header = true;
}
current_table.add_row(current_row.clone());
}
in_row = false;
}
b"tbl" => {
if !current_table.is_empty() {
tables.push(current_table.clone());
}
in_table = false;
}
_ => {}
}
}
Ok(quick_xml::events::Event::Eof) => break,
Err(e) => return Err(Error::XmlParse(e.to_string())),
_ => {}
}
buf.clear();
}
Ok(tables)
}
/// Extracts shape text (outside tables) without relationship information.
///
/// Delegates to `parse_text_content_excluding_tables_with_rels` with an empty
/// relationship map, so hyperlinks are never resolved. Not called by the code
/// visible in this file, hence the lint allowance.
#[allow(dead_code)]
fn parse_text_content_excluding_tables(&self, xml: &str) -> Result<Vec<Paragraph>> {
self.parse_text_content_excluding_tables_with_rels(xml, &HashMap::new())
}
fn parse_text_content_excluding_tables_with_rels(
&self,
xml: &str,
rels: &HashMap<String, String>,
) -> Result<Vec<Paragraph>> {
let mut paragraphs = Vec::new();
let mut reader = quick_xml::Reader::from_str(xml);
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut in_table = false;
let mut table_depth = 0;
let mut in_shape = false;
let mut in_txbody = false;
let mut in_paragraph = false;
let mut in_run = false;
let mut in_text = false;
let mut in_rpr = false;
let mut current_runs: Vec<TextRun> = Vec::new();
let mut current_text = String::new();
let mut current_style = TextStyle::default();
let mut current_hyperlink: Option<String> = None;
let mut current_heading: HeadingLevel = HeadingLevel::None;
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"tbl" => {
in_table = true;
table_depth += 1;
}
b"sp" if !in_table => {
in_shape = true;
current_heading = HeadingLevel::None;
}
b"txBody" if in_shape && !in_table => {
in_txbody = true;
}
b"p" if !in_table && in_txbody => {
in_paragraph = true;
current_runs.clear();
}
b"r" if in_paragraph && !in_table => {
in_run = true;
current_text.clear();
current_style = TextStyle::default();
current_hyperlink = None;
}
b"t" if in_run && !in_table => {
in_text = true;
}
b"rPr" if in_run && !in_table => {
in_rpr = true;
for attr in e.attributes().flatten() {
match attr.key.local_name().as_ref() {
b"b" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.bold = val != "0" && val != "false";
}
b"i" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.italic = val != "0" && val != "false";
}
b"u" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.underline = val != "none";
}
b"strike" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.strikethrough =
val != "noStrike" && val != "0" && val != "false";
}
_ => {}
}
}
}
b"hlinkClick" if in_rpr && !in_table => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(url) = rels.get(rel_id.as_ref()) {
current_hyperlink = Some(url.clone());
}
}
}
}
b"ph" if in_shape && !in_table => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"type" {
let ph_type = String::from_utf8_lossy(&attr.value);
current_heading = match ph_type.as_ref() {
"title" | "ctrTitle" => HeadingLevel::H1,
"subTitle" => HeadingLevel::H2,
"body" => HeadingLevel::None,
_ => HeadingLevel::None,
};
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Empty(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"ph" if in_shape && !in_table => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"type" {
let ph_type = String::from_utf8_lossy(&attr.value);
current_heading = match ph_type.as_ref() {
"title" | "ctrTitle" => HeadingLevel::H1,
"subTitle" => HeadingLevel::H2,
"body" => HeadingLevel::None,
_ => HeadingLevel::None,
};
}
}
}
b"rPr" if in_run && !in_table => {
for attr in e.attributes().flatten() {
match attr.key.local_name().as_ref() {
b"b" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.bold = val != "0" && val != "false";
}
b"i" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.italic = val != "0" && val != "false";
}
b"u" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.underline = val != "none";
}
b"strike" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.strikethrough =
val != "noStrike" && val != "0" && val != "false";
}
_ => {}
}
}
}
b"hlinkClick" if in_run && !in_table => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(url) = rels.get(rel_id.as_ref()) {
current_hyperlink = Some(url.clone());
}
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Text(ref e)) if in_text && !in_table => {
current_text.push_str(&crate::decode::decode_text_lossy(e));
}
Ok(quick_xml::events::Event::End(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"tbl" => {
table_depth -= 1;
if table_depth == 0 {
in_table = false;
}
}
b"t" if !in_table => {
in_text = false;
}
b"rPr" if !in_table => {
in_rpr = false;
}
b"r" if !in_table => {
if !current_text.is_empty() {
current_runs.push(TextRun {
text: current_text.clone(),
style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: RevisionType::None,
});
}
in_run = false;
current_hyperlink = None;
}
b"p" if !in_table => {
if !current_runs.is_empty() {
paragraphs.push(Paragraph {
runs: current_runs.clone(),
heading: current_heading,
..Default::default()
});
}
in_paragraph = false;
}
b"txBody" if !in_table => {
in_txbody = false;
}
b"sp" if !in_table => {
in_shape = false;
current_heading = HeadingLevel::None;
}
_ => {}
}
}
Ok(quick_xml::events::Event::Eof) => break,
Err(e) => return Err(Error::XmlParse(e.to_string())),
_ => {}
}
buf.clear();
}
Ok(paragraphs)
}
/// Extracts every paragraph from the XML without relationship information.
///
/// Delegates to `parse_text_content_with_rels` with an empty relationship map,
/// so hyperlinks are never resolved. The lint allowance is there because the
/// only caller visible in this file (`parse_slide`) is itself unused.
#[allow(dead_code)]
fn parse_text_content(&self, xml: &str) -> Result<Vec<Paragraph>> {
self.parse_text_content_with_rels(xml, &HashMap::new())
}
fn parse_text_content_with_rels(
&self,
xml: &str,
rels: &HashMap<String, String>,
) -> Result<Vec<Paragraph>> {
let mut paragraphs = Vec::new();
let mut reader = quick_xml::Reader::from_str(xml);
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut in_paragraph = false;
let mut in_run = false;
let mut in_text = false;
let mut in_rpr = false;
let mut current_runs: Vec<TextRun> = Vec::new();
let mut current_text = String::new();
let mut current_style = TextStyle::default();
let mut current_hyperlink: Option<String> = None;
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"p" => {
in_paragraph = true;
current_runs.clear();
}
b"r" if in_paragraph => {
in_run = true;
current_text.clear();
current_style = TextStyle::default();
current_hyperlink = None;
}
b"t" if in_run => {
in_text = true;
}
b"rPr" if in_run => {
in_rpr = true;
for attr in e.attributes().flatten() {
match attr.key.local_name().as_ref() {
b"b" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.bold = val != "0" && val != "false";
}
b"i" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.italic = val != "0" && val != "false";
}
b"u" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.underline = val != "none";
}
b"strike" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.strikethrough =
val != "noStrike" && val != "0" && val != "false";
}
_ => {}
}
}
}
b"hlinkClick" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(url) = rels.get(rel_id.as_ref()) {
current_hyperlink = Some(url.clone());
}
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Empty(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"rPr" if in_run => {
for attr in e.attributes().flatten() {
match attr.key.local_name().as_ref() {
b"b" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.bold = val != "0" && val != "false";
}
b"i" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.italic = val != "0" && val != "false";
}
b"u" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.underline = val != "none";
}
b"strike" => {
let val = String::from_utf8_lossy(&attr.value);
current_style.strikethrough =
val != "noStrike" && val != "0" && val != "false";
}
_ => {}
}
}
}
b"hlinkClick" if in_run => {
for attr in e.attributes().flatten() {
if attr.key.local_name().as_ref() == b"id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(url) = rels.get(rel_id.as_ref()) {
current_hyperlink = Some(url.clone());
}
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Text(ref e)) if in_text => {
current_text.push_str(&crate::decode::decode_text_lossy(e));
}
Ok(quick_xml::events::Event::End(ref e)) => {
let local_name = e.name().local_name();
match local_name.as_ref() {
b"t" => {
in_text = false;
}
b"rPr" => {
in_rpr = false;
}
b"r" => {
if !current_text.is_empty() {
current_runs.push(TextRun {
text: current_text.clone(),
style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: RevisionType::None,
});
}
in_run = false;
current_hyperlink = None;
}
b"p" => {
if !current_runs.is_empty() {
paragraphs.push(Paragraph {
runs: current_runs.clone(),
..Default::default()
});
}
in_paragraph = false;
}
_ => {}
}
}
Ok(quick_xml::events::Event::Eof) => break,
Err(e) => return Err(Error::XmlParse(e.to_string())),
_ => {}
}
buf.clear();
}
Ok(paragraphs)
}
/// Collects every file stored under `ppt/media/` as a [`Resource`].
///
/// The resource type is derived from the file extension and the MIME type
/// from `guess_mime_type`; dimensions and alt text are left unset. Entries
/// that cannot be read are skipped rather than failing the whole extraction.
pub fn extract_resources(&self) -> Result<Vec<Resource>> {
    let mut found = Vec::new();
    for file in self.container.list_files() {
        if !file.starts_with("ppt/media/") {
            continue;
        }
        let data = match self.container.read_binary(&file) {
            Ok(bytes) => bytes,
            Err(_) => continue,
        };

        let extension = std::path::Path::new(&file)
            .extension()
            .and_then(|e| e.to_str())
            .unwrap_or("");
        let resource_type = ResourceType::from_extension(extension);
        let filename = file.rsplit('/').next().unwrap_or(&file).to_string();
        let mime_type = guess_mime_type(&file);
        let size = data.len();

        found.push(Resource {
            resource_type,
            filename: Some(filename),
            mime_type,
            data,
            size,
            width: None,
            height: None,
            alt_text: None,
        });
    }
    Ok(found)
}
/// Returns the underlying OOXML container.
pub fn container(&self) -> &OoxmlContainer {
&self.container
}
/// Returns the number of slides listed in `ppt/presentation.xml`.
pub fn slide_count(&self) -> usize {
self.slides.len()
}
}
/// Guesses a MIME type from the file extension of `path`.
///
/// The extension is compared case-insensitively. Returns `None` when the
/// path has no extension or the extension is not a known image, audio or
/// video type. A bare file name such as `png` has no extension and therefore
/// yields `None`.
fn guess_mime_type(path: &str) -> Option<String> {
    // `Path::extension` only looks at the final path component, so a dot in
    // a directory name or an extension-less file name is not misread.
    let ext = std::path::Path::new(path)
        .extension()?
        .to_str()?
        .to_lowercase();
    let mime = match ext.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "bmp" => "image/bmp",
        "tiff" | "tif" => "image/tiff",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        "emf" => "image/x-emf",
        "wmf" => "image/x-wmf",
        "mp3" => "audio/mpeg",
        "wav" => "audio/wav",
        "mp4" => "video/mp4",
        "avi" => "video/x-msvideo",
        "wmv" => "video/x-ms-wmv",
        _ => return None,
    };
    Some(mime.to_string())
}
#[cfg(test)]
mod tests {
use super::*;
/// Opening the sample presentation succeeds (skipped when the fixture is absent).
#[test]
fn test_open_pptx() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if !std::path::Path::new(path).exists() {
        return;
    }
    assert!(PptxParser::open(path).is_ok());
}
/// Parsing the sample presentation yields sections and a page count
/// (skipped when the fixture is absent).
#[test]
fn test_parse_pptx() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if !std::path::Path::new(path).exists() {
        return;
    }
    let mut parser = PptxParser::open(path).unwrap();
    let doc = parser.parse().unwrap();
    let slide_total = doc.sections.len();
    assert!(!doc.sections.is_empty());
    println!("Parsed {} slides", slide_total);
    assert!(doc.metadata.page_count.is_some());
}
/// The sample presentation reports at least one slide
/// (skipped when the fixture is absent).
#[test]
fn test_slide_count() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if !std::path::Path::new(path).exists() {
        return;
    }
    let parser = PptxParser::open(path).unwrap();
    let slide_total = parser.slide_count();
    assert!(slide_total > 0);
    println!("Slide count: {}", slide_total);
}
/// The sample presentation contains non-blank text
/// (skipped when the fixture is absent).
#[test]
fn test_extract_text() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if std::path::Path::new(path).exists() {
        let mut parser = PptxParser::open(path).unwrap();
        let doc = parser.parse().unwrap();
        let text = doc.plain_text();
        assert!(!text.trim().is_empty());
        println!("Extracted text length: {} chars", text.len());
        // Take characters, not bytes: slicing at byte 500 panics when that
        // offset falls inside a multi-byte UTF-8 sequence.
        let preview: String = text.chars().take(500).collect();
        println!("First 500 chars:\n{}", preview);
    }
}
/// Resource extraction on the sample presentation succeeds; the resources
/// are only printed (skipped when the fixture is absent).
#[test]
fn test_extract_resources() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if !std::path::Path::new(path).exists() {
        return;
    }
    let parser = PptxParser::open(path).unwrap();
    let resources = parser.extract_resources().unwrap();
    println!("Found {} resources", resources.len());
    for resource in resources.iter() {
        let label = resource.filename.as_deref().unwrap_or("unnamed");
        println!(
            " - {:?}: {} ({} bytes)",
            resource.resource_type, label, resource.size
        );
    }
}
/// Slide XML with a plain run and a bold run yields two paragraphs with the
/// expected text and bold flags.
#[test]
fn test_parse_text_content() {
    let xml = r#"<?xml version="1.0"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Hello World</a:t>
</a:r>
</a:p>
<a:p>
<a:r>
<a:rPr b="1"/>
<a:t>Bold Text</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#;
    // A parser instance is needed to reach the method; the minimal package
    // built by the helper is enough for construction.
    let parser = PptxParser::from_bytes(create_minimal_pptx(xml))
        .expect("minimal PPTX should open");
    let paragraphs = parser
        .parse_text_content(xml)
        .expect("slide XML should parse");

    assert_eq!(paragraphs.len(), 2);
    assert_eq!(paragraphs[0].runs.len(), 1);
    assert_eq!(paragraphs[0].runs[0].text, "Hello World");
    assert!(!paragraphs[0].runs[0].style.bold);
    assert_eq!(paragraphs[1].runs.len(), 1);
    assert_eq!(paragraphs[1].runs[0].text, "Bold Text");
    assert!(paragraphs[1].runs[0].style.bold);
}
/// Parsing the sample presentation succeeds; the metadata is only printed
/// (skipped when the fixture is absent).
#[test]
fn test_metadata() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if !std::path::Path::new(path).exists() {
        return;
    }
    let mut parser = PptxParser::open(path).unwrap();
    let metadata = parser.parse().unwrap().metadata;
    println!("Title: {:?}", metadata.title);
    println!("Author: {:?}", metadata.author);
    println!("Page count: {:?}", metadata.page_count);
}
/// The sample presentation contains at least one table
/// (skipped when the fixture is absent).
#[test]
fn test_parse_tables() {
    let path = "test-files/file_example_PPT_1MB.pptx";
    if std::path::Path::new(path).exists() {
        let mut parser = PptxParser::open(path).unwrap();
        let doc = parser.parse().unwrap();
        let mut table_count = 0;
        for section in &doc.sections {
            for block in &section.content {
                if let Block::Table(table) = block {
                    table_count += 1;
                    println!(
                        "Found table in {}: {} rows, {} cols",
                        section.name.as_deref().unwrap_or("unnamed"),
                        table.row_count(),
                        table.column_count()
                    );
                    for (i, row) in table.rows.iter().enumerate() {
                        let cells: Vec<String> =
                            row.cells.iter().map(|c| c.plain_text()).collect();
                        println!(" Row {}: {:?}", i, cells);
                    }
                }
            }
        }
        println!("Total tables found: {}", table_count);
        assert!(table_count > 0, "Expected at least one table in the PPTX");
    }
}
/// The govdocs sample contains hyperlinks, one of them the expected e-mail
/// address (skipped when the fixture is absent).
#[test]
fn test_parse_hyperlinks() {
    let path = "test-files/officedissector/test/govdocs/036279.pptx";
    if std::path::Path::new(path).exists() {
        let mut parser = PptxParser::open(path).unwrap();
        let doc = parser.parse().unwrap();
        let mut hyperlink_count = 0;
        let mut found_email = false;
        for section in &doc.sections {
            for block in &section.content {
                if let Block::Paragraph(para) = block {
                    for run in &para.runs {
                        if let Some(ref link) = run.hyperlink {
                            hyperlink_count += 1;
                            println!("Found hyperlink: {} -> {}", run.text, link);
                            if link.contains("ncicb@pop.nci.nih.gov") {
                                found_email = true;
                            }
                        }
                    }
                }
            }
        }
        println!("Total hyperlinks found: {}", hyperlink_count);
        assert!(hyperlink_count > 0, "Expected at least one hyperlink");
        assert!(
            found_email,
            "Expected to find email link ncicb@pop.nci.nih.gov"
        );
    }
}
/// Builds an in-memory PPTX package containing a single slide with the given XML.
///
/// Delegates to `create_minimal_pptx_with_relationships`, supplying a
/// presentation relationship part that maps `rId1` to `slides/slide1.xml` and
/// an empty relationship part for the slide.
fn create_minimal_pptx(slide_xml: &str) -> Vec<u8> {
create_minimal_pptx_with_relationships(
slide_xml,
// Presentation relationships: one slide relationship, `rId1`.
Some(
r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
),
// Slide relationships: present but empty.
Some(
r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
),
)
}
/// Builds an in-memory PPTX package with one slide and one notes slide.
///
/// The archive holds, in this order: the content types, the package
/// relationships, the presentation relationships (`rId1` ->
/// `slides/slide1.xml`), the presentation part, an empty slide relationship
/// part, the slide (`slide_xml`) and `ppt/notesSlides/notesSlide1.xml`
/// (`notes_xml`). Entries are stored uncompressed.
fn create_minimal_pptx_with_notes(slide_xml: &str, notes_xml: &str) -> Vec<u8> {
    use std::io::{Cursor, Write};

    let mut zip = zip::ZipWriter::new(Cursor::new(Vec::new()));
    let options = zip::write::SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Stored);

    // Writes one archive entry; any failure aborts the test.
    let mut put = |name: &str, body: &[u8]| {
        zip.start_file(name, options).unwrap();
        zip.write_all(body).unwrap();
    };

    put("[Content_Types].xml", br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
<Override PartName="/ppt/notesSlides/notesSlide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml"/>
</Types>"#);
    put("_rels/.rels", br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#);
    put("ppt/_rels/presentation.xml.rels", br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#);
    put("ppt/presentation.xml", br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst>
<p:sldId id="256" r:id="rId1"/>
</p:sldIdLst>
</p:presentation>"#);
    put("ppt/slides/_rels/slide1.xml.rels", br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#);
    put("ppt/slides/slide1.xml", slide_xml.as_bytes());
    put("ppt/notesSlides/notesSlide1.xml", notes_xml.as_bytes());

    zip.finish().unwrap().into_inner()
}
/// Assembles an in-memory, uncompressed PPTX archive with one slide whose
/// relationship parts are supplied (or omitted) by the caller.
///
/// * `slide_xml` - body written to `ppt/slides/slide1.xml`.
/// * `presentation_rels_xml` - when `Some`, written to
///   `ppt/_rels/presentation.xml.rels`; when `None`, that entry is left out.
/// * `slide_rels_xml` - when `Some`, written to
///   `ppt/slides/_rels/slide1.xml.rels`; when `None`, that entry is left out.
///
/// Returns the raw bytes of the finished ZIP archive. Panics if any ZIP
/// operation fails.
fn create_minimal_pptx_with_relationships(
    slide_xml: &str,
    presentation_rels_xml: Option<&str>,
    slide_rels_xml: Option<&str>,
) -> Vec<u8> {
    use std::io::{Cursor, Write};
    let stored = zip::write::SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Stored);
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));
    // Starts a new archive entry and writes its whole payload in one step.
    let mut add_part = |name: &str, bytes: &[u8]| {
        archive.start_file(name, stored).unwrap();
        archive.write_all(bytes).unwrap();
    };
    add_part(
        "[Content_Types].xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#,
    );
    add_part(
        "_rels/.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
    );
    // Optional: callers pass `None` to simulate a package without this part.
    if let Some(rels) = presentation_rels_xml {
        add_part("ppt/_rels/presentation.xml.rels", rels.as_bytes());
    }
    add_part(
        "ppt/presentation.xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst>
<p:sldId id="256" r:id="rId1"/>
</p:sldIdLst>
</p:presentation>"#,
    );
    // Optional: callers pass `None` to simulate a slide without a rels part.
    if let Some(rels) = slide_rels_xml {
        add_part("ppt/slides/_rels/slide1.xml.rels", rels.as_bytes());
    }
    add_part("ppt/slides/slide1.xml", slide_xml.as_bytes());
    archive.finish().unwrap().into_inner()
}
/// A chart part whose numeric cache holds a non-numeric value must make
/// `parse()` fail with `Error::InvalidData` mentioning the invalid value.
#[test]
fn test_pptx_chart_invalid_numeric_value_propagates_error() {
    use std::io::{Cursor, Write};
    use zip::write::SimpleFileOptions;
    // The `<c:val>` cache below carries the text "not-a-number".
    let chart_xml = r#"<?xml version="1.0"?>
<c:chartSpace xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart">
<c:chart><c:plotArea><c:lineChart>
<c:ser>
<c:tx><c:strRef><c:strCache><c:pt idx="0"><c:v>S</c:v></c:pt></c:strCache></c:strRef></c:tx>
<c:cat><c:strRef><c:strCache><c:pt idx="0"><c:v>Q1</c:v></c:pt></c:strCache></c:strRef></c:cat>
<c:val><c:numRef><c:numCache><c:pt idx="0"><c:v>not-a-number</c:v></c:pt></c:numCache></c:numRef></c:val>
</c:ser>
</c:lineChart></c:plotArea></c:chart>
</c:chartSpace>"#;
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart">
<p:cSld><p:spTree>
<p:graphicFrame><a:graphic><a:graphicData>
<c:chart r:id="rIdChart"/>
</a:graphicData></a:graphic></p:graphicFrame>
</p:spTree></p:cSld>
</p:sld>"#;
    let slide_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rIdChart" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart" Target="../charts/chart1.xml"/>
</Relationships>"#;
    let presentation_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst><p:sldId id="256" r:id="rIdSlide"/></p:sldIdLst>
</p:presentation>"#;
    let presentation_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rIdSlide" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;
    let stored = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));
    // Starts a new archive entry and writes its whole payload in one step.
    let mut add_part = |name: &str, bytes: &[u8]| {
        archive.start_file(name, stored).unwrap();
        archive.write_all(bytes).unwrap();
    };
    add_part(
        "[Content_Types].xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
</Types>"#,
    );
    add_part(
        "_rels/.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
    );
    add_part("ppt/presentation.xml", presentation_xml.as_bytes());
    add_part("ppt/_rels/presentation.xml.rels", presentation_rels.as_bytes());
    add_part("ppt/slides/slide1.xml", slide_xml.as_bytes());
    add_part("ppt/slides/_rels/slide1.xml.rels", slide_rels.as_bytes());
    add_part("ppt/charts/chart1.xml", chart_xml.as_bytes());
    let package = archive.finish().unwrap().into_inner();
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let err = parser
        .parse()
        .expect_err("invalid chart numeric value must surface");
    if let Error::InvalidData(msg) = &err {
        assert!(
            msg.contains("invalid chart numeric value"),
            "unexpected msg: {msg}"
        );
    } else {
        panic!("expected InvalidData, got {err:?}");
    }
}
/// A slide that references a chart part absent from the archive must make
/// `parse()` fail with `Error::MissingComponent` naming the chart path.
#[test]
fn test_pptx_missing_chart_part_propagates_error() {
    use std::io::{Cursor, Write};
    use zip::write::SimpleFileOptions;
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart">
<p:cSld><p:spTree>
<p:graphicFrame><a:graphic><a:graphicData>
<c:chart r:id="rIdChart"/>
</a:graphicData></a:graphic></p:graphicFrame>
</p:spTree></p:cSld>
</p:sld>"#;
    let slide_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rIdChart" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart" Target="../charts/chart1.xml"/>
</Relationships>"#;
    let presentation_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst><p:sldId id="256" r:id="rIdSlide"/></p:sldIdLst>
</p:presentation>"#;
    let presentation_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rIdSlide" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;
    let stored = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));
    // Starts a new archive entry and writes its whole payload in one step.
    let mut add_part = |name: &str, bytes: &[u8]| {
        archive.start_file(name, stored).unwrap();
        archive.write_all(bytes).unwrap();
    };
    add_part(
        "[Content_Types].xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
</Types>"#,
    );
    add_part(
        "_rels/.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
    );
    add_part("ppt/presentation.xml", presentation_xml.as_bytes());
    add_part("ppt/_rels/presentation.xml.rels", presentation_rels.as_bytes());
    add_part("ppt/slides/slide1.xml", slide_xml.as_bytes());
    add_part("ppt/slides/_rels/slide1.xml.rels", slide_rels.as_bytes());
    // No `ppt/charts/chart1.xml` entry is written: that omission is the point.
    let package = archive.finish().unwrap().into_inner();
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let err = parser
        .parse()
        .expect_err("missing referenced chart part must surface");
    if let Error::MissingComponent(path) = &err {
        assert_eq!(*path, "ppt/charts/chart1.xml");
    } else {
        panic!("expected MissingComponent, got {err:?}");
    }
}
/// An unknown entity (`&bogus;`) inside a slide table cell must appear
/// verbatim in the document's plain text.
#[test]
fn test_pptx_slide_table_preserves_raw_malformed_entity() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree>
<p:graphicFrame><a:graphic><a:graphicData>
<a:tbl>
<a:tr><a:tc><a:txBody><a:p><a:r><a:t>Cell &bogus; text</a:t></a:r></a:p></a:txBody></a:tc></a:tr>
</a:tbl>
</a:graphicData></a:graphic></p:graphicFrame>
</p:spTree></p:cSld>
</p:sld>"#;
    let package = create_minimal_pptx(slide_xml);
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let document = parser.parse().unwrap();
    let extracted = document.plain_text();
    assert!(
        extracted.contains("Cell &bogus; text"),
        "expected raw malformed entity preserved in slide table, got: {extracted}"
    );
}
/// An unknown entity (`&bogus;`) inside a slide text run must appear
/// verbatim in the document's plain text.
#[test]
fn test_pptx_slide_text_preserves_raw_malformed_entity() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree>
<p:sp><p:txBody>
<a:p><a:r><a:t>Slide &bogus; body</a:t></a:r></a:p>
</p:txBody></p:sp>
</p:spTree></p:cSld>
</p:sld>"#;
    let package = create_minimal_pptx(slide_xml);
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let document = parser.parse().unwrap();
    let extracted = document.plain_text();
    assert!(
        extracted.contains("Slide &bogus; body"),
        "expected raw malformed entity preserved in slide text, got: {extracted}"
    );
}
/// An unknown entity (`&bogus;`) inside speaker notes must appear verbatim
/// in the plain text of the first section's notes.
#[test]
fn test_pptx_notes_preserves_raw_malformed_entity() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree/></p:cSld>
</p:sld>"#;
    let notes_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree>
<p:sp><p:txBody>
<a:p><a:r><a:t>Note &bogus; body</a:t></a:r></a:p>
</p:txBody></p:sp>
</p:spTree></p:cSld>
</p:notes>"#;
    let package = create_minimal_pptx_with_notes(slide_xml, notes_xml);
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let document = parser.parse().unwrap();
    let notes = document.sections[0]
        .notes
        .as_ref()
        .expect("notes should be parsed");
    // Flatten every notes paragraph into one newline-separated string.
    let mut lines = Vec::new();
    for paragraph in notes.iter() {
        lines.push(paragraph.plain_text());
    }
    let notes_text = lines.join("\n");
    assert!(
        notes_text.contains("Note &bogus; body"),
        "expected raw malformed entity preserved in notes, got: {notes_text}"
    );
}
/// Constructing a parser from a package with no
/// `ppt/_rels/presentation.xml.rels` entry must fail with
/// `Error::MissingComponent` naming that path.
#[test]
fn test_pptx_requires_presentation_relationships() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"/>"#;
    let package = create_minimal_pptx_with_relationships(slide_xml, None, None);
    match PptxParser::from_bytes(package) {
        Ok(_) => panic!("missing presentation relationships should fail"),
        Err(Error::MissingComponent(path)) => {
            assert_eq!(path, "ppt/_rels/presentation.xml.rels")
        }
        Err(other) => panic!("expected missing presentation rels error, got {other:?}"),
    }
}
/// A truncated presentation relationships part (`<Relationships`) must make
/// construction fail with `Error::XmlParseWithContext` located at that part.
#[test]
fn test_pptx_rejects_malformed_presentation_relationships() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"/>"#;
    let broken_rels = Some("<Relationships");
    let package = create_minimal_pptx_with_relationships(slide_xml, broken_rels, None);
    match PptxParser::from_bytes(package) {
        Ok(_) => panic!("malformed presentation relationships should fail"),
        Err(Error::XmlParseWithContext { location, .. }) => {
            assert_eq!(location, "ppt/_rels/presentation.xml.rels")
        }
        Err(other) => panic!("expected malformed presentation rels error, got {other:?}"),
    }
}
/// A `ppt/presentation.xml` containing a byte sequence that is not valid
/// UTF-8 must make construction fail with `Error::Encoding`.
#[test]
fn test_pptx_non_utf8_presentation_is_error() {
    use std::io::{Cursor, Write};
    use zip::write::SimpleFileOptions;
    let stored = SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));
    // Starts a new archive entry and writes its whole payload in one step.
    let mut add_part = |name: &str, bytes: &[u8]| {
        archive.start_file(name, stored).unwrap();
        archive.write_all(bytes).unwrap();
    };
    add_part(
        "[Content_Types].xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
</Types>"#,
    );
    add_part(
        "_rels/.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
    );
    add_part(
        "ppt/_rels/presentation.xml.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
    );
    // The lone 0xE9 byte followed by `<` is not a valid UTF-8 sequence.
    add_part(
        "ppt/presentation.xml",
        b"<?xml version=\"1.0\"?><presentation>Caf\xe9</presentation>",
    );
    let package = archive.finish().unwrap().into_inner();
    let Err(err) = PptxParser::from_bytes(package) else {
        panic!("non-UTF-8 presentation must surface Error::Encoding")
    };
    assert!(
        matches!(err, Error::Encoding(_)),
        "expected Error::Encoding, got {err:?}"
    );
}
/// Assembles an in-memory, uncompressed PPTX archive with one slide and one
/// notes slide, replacing the part named by `malformed_part_path` with a
/// payload that is not valid UTF-8.
///
/// Only `"ppt/slides/slide1.xml"` and `"ppt/notesSlides/notesSlide1.xml"` are
/// recognised; any other value yields an archive in which both parts are
/// well-formed. Returns the raw bytes of the finished ZIP archive and panics
/// if any ZIP operation fails.
fn create_minimal_pptx_with_malformed_part(malformed_part_path: &str) -> Vec<u8> {
    use std::io::{Cursor, Write};
    const SLIDE_PART: &str = "ppt/slides/slide1.xml";
    const NOTES_PART: &str = "ppt/notesSlides/notesSlide1.xml";
    let valid_slide: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree/></p:cSld>
</p:sld>"#;
    let valid_notes: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree/></p:cSld>
</p:notes>"#;
    // The lone 0xE9 byte followed by `<` is not a valid UTF-8 sequence.
    let malformed: &[u8] = b"<?xml version=\"1.0\"?><root>Caf\xe9</root>";
    let stored = zip::write::SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Stored);
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));
    // Starts a new archive entry and writes its whole payload in one step.
    let mut add_part = |name: &str, bytes: &[u8]| {
        archive.start_file(name, stored).unwrap();
        archive.write_all(bytes).unwrap();
    };
    add_part(
        "[Content_Types].xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
<Override PartName="/ppt/notesSlides/notesSlide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml"/>
</Types>"#,
    );
    add_part(
        "_rels/.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
    );
    add_part(
        "ppt/_rels/presentation.xml.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
    );
    add_part(
        "ppt/presentation.xml",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst>
<p:sldId id="256" r:id="rId1"/>
</p:sldIdLst>
</p:presentation>"#,
    );
    add_part(
        "ppt/slides/_rels/slide1.xml.rels",
        br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#,
    );
    // Swap in the broken payload only for the part the caller selected.
    let slide_bytes = if malformed_part_path == SLIDE_PART {
        malformed
    } else {
        valid_slide
    };
    add_part(SLIDE_PART, slide_bytes);
    let notes_bytes = if malformed_part_path == NOTES_PART {
        malformed
    } else {
        valid_notes
    };
    add_part(NOTES_PART, notes_bytes);
    archive.finish().unwrap().into_inner()
}
/// For each of the slide and notes parts, a non-UTF-8 payload must let the
/// constructor succeed but make `parse()` fail with `Error::Encoding`.
#[test]
fn test_pptx_non_utf8_optional_parts_surface_encoding_error() {
    let broken_parts = ["ppt/slides/slide1.xml", "ppt/notesSlides/notesSlide1.xml"];
    for part_path in broken_parts {
        let package = create_minimal_pptx_with_malformed_part(part_path);
        let mut parser = PptxParser::from_bytes(package).expect("constructor must succeed");
        let Err(err) = parser.parse() else {
            panic!("malformed {part_path} must surface Error::Encoding")
        };
        assert!(
            matches!(err, Error::Encoding(_)),
            "expected Error::Encoding for {part_path}, got {err:?}"
        );
    }
}
/// A package without `ppt/slides/_rels/slide1.xml.rels` must still parse and
/// yield exactly one section.
#[test]
fn test_pptx_allows_missing_optional_slide_relationships() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree/></p:cSld>
</p:sld>"#;
    // Note the explicit `</Relationship>` end tag rather than a self-closing element.
    let presentation_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"></Relationship>
</Relationships>"#;
    let package =
        create_minimal_pptx_with_relationships(slide_xml, Some(presentation_rels), None);
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let document = parser.parse().unwrap();
    assert_eq!(document.sections.len(), 1);
}
/// A slide relationships part that is present but truncated
/// (`<Relationships`) must make `parse()` fail with
/// `Error::XmlParseWithContext` located at that part.
#[test]
fn test_pptx_rejects_malformed_optional_slide_relationships() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Hello from slide</a:t></a:r></a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#;
    let presentation_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;
    let broken_slide_rels = "<Relationships";
    let package = create_minimal_pptx_with_relationships(
        slide_xml,
        Some(presentation_rels),
        Some(broken_slide_rels),
    );
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let err = parser
        .parse()
        .expect_err("malformed optional slide relationships should fail");
    if let Error::XmlParseWithContext { location, .. } = &err {
        assert_eq!(*location, "ppt/slides/_rels/slide1.xml.rels");
    } else {
        panic!("expected malformed optional slide rels error, got {err:?}");
    }
}
/// Text from a top-level shape and from both shapes nested in a `<p:grpSp>`
/// group must all appear in the document's plain text.
#[test]
fn test_parse_grouped_shapes() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Top level shape</a:t></a:r></a:p>
</p:txBody>
</p:sp>
<p:grpSp>
<p:nvGrpSpPr>
<p:cNvPr id="10" name="Group 1"/>
<p:cNvGrpSpPr/>
<p:nvPr/>
</p:nvGrpSpPr>
<p:grpSpPr/>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Grouped shape 1</a:t></a:r></a:p>
</p:txBody>
</p:sp>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Grouped shape 2</a:t></a:r></a:p>
</p:txBody>
</p:sp>
</p:grpSp>
</p:spTree>
</p:cSld>
</p:sld>"#;
    let package = create_minimal_pptx(slide_xml);
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let document = parser.parse().unwrap();
    let extracted = document.plain_text();
    assert!(
        extracted.contains("Top level shape"),
        "Should contain top-level shape text, got: {extracted}"
    );
    assert!(
        extracted.contains("Grouped shape 1"),
        "Should contain first grouped shape text, got: {extracted}"
    );
    assert!(
        extracted.contains("Grouped shape 2"),
        "Should contain second grouped shape text, got: {extracted}"
    );
}
/// Text from a shape in an outer group and from a shape in a group nested
/// inside it must both appear in the document's plain text.
#[test]
fn test_parse_nested_grouped_shapes() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:grpSp>
<p:nvGrpSpPr><p:cNvPr id="10" name="Outer Group"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
<p:grpSpPr/>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Outer group text</a:t></a:r></a:p>
</p:txBody>
</p:sp>
<p:grpSp>
<p:nvGrpSpPr><p:cNvPr id="20" name="Inner Group"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
<p:grpSpPr/>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Inner group text</a:t></a:r></a:p>
</p:txBody>
</p:sp>
</p:grpSp>
</p:grpSp>
</p:spTree>
</p:cSld>
</p:sld>"#;
    let package = create_minimal_pptx(slide_xml);
    let mut parser = PptxParser::from_bytes(package).unwrap();
    let document = parser.parse().unwrap();
    let extracted = document.plain_text();
    assert!(
        extracted.contains("Outer group text"),
        "Should contain outer group shape text, got: {extracted}"
    );
    assert!(
        extracted.contains("Inner group text"),
        "Should contain inner (nested) group shape text, got: {extracted}"
    );
}
/// Parses the sample deck at `test-files/file_example_PPT_1MB.pptx`, counts
/// paragraphs marked as H1 and H2, and checks that at least one H1 exists and
/// that one H1 contains "Lorem ipsum".
///
/// The test body is skipped (and the test passes) when the sample file is
/// not present on disk.
#[test]
fn test_parse_headings() {
    use crate::model::HeadingLevel;
    let path = "test-files/file_example_PPT_1MB.pptx";
    if std::path::Path::new(path).exists() {
        let mut parser = PptxParser::open(path).unwrap();
        let doc = parser.parse().unwrap();
        let mut h1_count = 0;
        let mut h2_count = 0;
        let mut found_lorem = false;
        for section in &doc.sections {
            // Restored from a mis-encoded token (`&sect` had been turned into
            // the section sign), which did not compile.
            for block in &section.content {
                if let Block::Paragraph(para) = block {
                    let text = para.plain_text();
                    match para.heading {
                        HeadingLevel::H1 => {
                            h1_count += 1;
                            println!("Found H1: {}", text);
                            if text.contains("Lorem ipsum") {
                                found_lorem = true;
                            }
                        }
                        HeadingLevel::H2 => {
                            h2_count += 1;
                            println!("Found H2: {}", text);
                        }
                        // Other heading levels are not counted by this test.
                        _ => {}
                    }
                }
            }
        }
        println!("Total H1: {}, H2: {}", h1_count, h2_count);
        assert!(h1_count > 0, "Expected at least one H1 heading (title)");
        assert!(found_lorem, "Expected to find 'Lorem ipsum' as H1 title");
    }
}
/// A text run mixing a legitimate entity (`&amp;`) with an unknown one
/// (`&bogus;`) must come out with the legitimate entity decoded to `&` and
/// the unknown entity left verbatim.
#[test]
fn test_pptx_slide_mixed_entities_preserve_legitimate_and_malformed() {
    use std::io::Write;
    let mut buf = Vec::new();
    {
        // The writer borrows `buf`; the inner scope ends that borrow before
        // `buf` is handed to the parser.
        let cursor = std::io::Cursor::new(&mut buf);
        let mut zip = zip::ZipWriter::new(cursor);
        let options = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        zip.start_file("[Content_Types].xml", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
</Types>"#).unwrap();
        zip.start_file("_rels/.rels", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#).unwrap();
        zip.start_file("ppt/_rels/presentation.xml.rels", options)
            .unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#).unwrap();
        zip.start_file("ppt/presentation.xml", options).unwrap();
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:sldIdLst><p:sldId id="256" r:id="rId1"/></p:sldIdLst>
</p:presentation>"#,
        )
        .unwrap();
        zip.start_file("ppt/slides/slide1.xml", options).unwrap();
        // The run text must carry an escaped ampersand (`&amp;`) next to the
        // unknown `&bogus;` entity; a bare `&` would not be a legitimate entity.
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld><p:spTree>
<p:sp><p:txBody><a:p><a:r><a:t>A &amp; B &bogus; C</a:t></a:r></a:p></p:txBody></p:sp>
</p:spTree></p:cSld>
</p:sld>"#,
        )
        .unwrap();
        zip.finish().unwrap();
    }
    let mut parser = PptxParser::from_bytes(buf).expect("parser opens");
    let doc = parser.parse().expect("document parses");
    let text = doc.plain_text();
    assert!(
        text.contains("A & B &bogus; C"),
        "expected legitimate entity decoded and malformed preserved; got {text:?}"
    );
    // The escaped form must be gone once the legitimate entity is decoded.
    assert!(
        !text.contains("A &amp; B"),
        "legitimate entity must not remain escaped; got {text:?}"
    );
}
/// A package that has presentation relationships but no
/// `ppt/presentation.xml` entry must make construction fail with
/// `Error::MissingComponent` naming that path.
#[test]
fn test_pptx_missing_presentation_surfaces_missing_component() {
    use std::io::Write;
    let mut package = Vec::new();
    {
        // The writer borrows `package`; the inner scope ends that borrow
        // before the bytes are handed to the parser.
        let stored = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        let mut archive = zip::ZipWriter::new(std::io::Cursor::new(&mut package));
        // Starts a new archive entry and writes its whole payload in one step.
        let mut add_part = |name: &str, bytes: &[u8]| {
            archive.start_file(name, stored).unwrap();
            archive.write_all(bytes).unwrap();
        };
        add_part(
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
</Types>"#,
        );
        add_part(
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        );
        add_part(
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>"#,
        );
        // No `ppt/presentation.xml` entry is written: that omission is the point.
        archive.finish().unwrap();
    }
    match PptxParser::from_bytes(package) {
        Ok(_) => panic!("must fail on missing presentation"),
        Err(Error::MissingComponent(path)) => {
            assert_eq!(path, "ppt/presentation.xml");
        }
        Err(other) => {
            panic!("expected MissingComponent(\"ppt/presentation.xml\"), got {other:?}")
        }
    }
}
}