//! PPTX (Office Open XML Presentation) to Markdown converter.
//!
//! Parses PPTX files directly from their OOXML ZIP structure. Extracts slide titles,
//! body text, tables, speaker notes, embedded images, and content from group shapes
//! (`<p:grpSp>`). Each slide becomes a `## Slide N: Title` section separated by
//! horizontal rules.
use std::collections::HashMap;
use std::io::Cursor;
use quick_xml::Reader;
use quick_xml::events::Event;
use zip::ZipArchive;
use crate::converter::comments::{self, Comment};
use crate::converter::ooxml_utils::{
ImageInfo, PendingImageResolution, Relationship, attr_value_unescaped, derive_rels_path,
parse_relationships, resolve_image_placeholders, resolve_relative_to_file,
};
use crate::converter::{
ConversionOptions, ConversionResult, ConversionWarning, Converter, WarningCode,
};
use crate::error::ConvertError;
use crate::markdown::{build_table, build_table_plain};
use crate::zip_utils::{read_zip_bytes, read_zip_text, read_zip_text_lossy};
/// Converts PPTX files to Markdown.
///
/// A field-less unit struct: it carries no state of its own, and the parsing
/// work is done by its `convert_inner` method on the bytes it is handed.
pub struct PptxConverter;
// ---- Data types ----
/// Information about a slide in presentation order.
#[derive(Debug, Clone)]
struct SlideInfo {
/// 1-based position of the slide's entry among the `<p:sldId>` elements.
number: usize,
/// ZIP entry path of the slide part, e.g. `ppt/slides/slide1.xml`.
path: String,
}
/// The type of placeholder in a shape.
///
/// Derived from the `type` attribute of a `<p:ph>` element.
#[derive(Debug, Clone, PartialEq)]
enum PlaceholderType {
/// `type="title"`.
Title,
/// `type="ctrTitle"`.
CenterTitle,
/// `type="subTitle"`.
SubTitle,
/// `type="body"`.
Body,
/// Any other `type` value, or a `<p:ph>` without a `type` attribute.
Other,
}
/// Content extracted from a single shape on a slide.
#[derive(Debug, Clone)]
enum ShapeContent {
/// Text of a title / centered-title placeholder; used for the slide heading.
Title(String),
/// Text of any other text-bearing shape; paragraphs are joined with `\n`.
Body(String),
/// A table found inside a `<p:graphicFrame>`.
Table {
/// Cells of the first table row.
headers: Vec<String>,
/// Cells of every row after the first.
rows: Vec<Vec<String>>,
},
/// A picture (`<p:pic>`) that references an embedded image.
Image {
/// Relationship id taken from the `<a:blip>` `embed` attribute.
rel_id: String,
/// Non-empty `descr` attribute of the picture's `<p:cNvPr>`, if any.
alt_text: Option<String>,
},
}
// ---- Slide order resolution ----
/// Parse presentation.xml and its rels to determine slide order.
///
/// Collects the relationship id of every `<p:sldId>` element in document order,
/// then maps each id to its target through `pres_rels`. Ids without a matching
/// relationship are skipped but still consume a slide number, so `number`
/// always reflects the position inside `<p:sldIdLst>`.
///
/// Targets are normally relative to the `ppt/` directory
/// (`slides/slide1.xml`). A package-absolute target (`/ppt/slides/slide1.xml`)
/// is also accepted: its leading `/` is dropped so the result is a plain ZIP
/// entry path instead of the unreadable `ppt//ppt/...`.
///
/// Parsing is best-effort: an XML error simply ends the scan and whatever was
/// collected so far is returned.
///
/// Returns slides in presentation order (as defined by `<p:sldIdLst>`).
fn resolve_slide_order(
    pres_xml: &str,
    pres_rels: &HashMap<String, Relationship>,
) -> Vec<SlideInfo> {
    let mut reader = Reader::from_str(pres_xml);
    let mut rel_ids: Vec<String> = Vec::new();
    loop {
        match reader.read_event() {
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if local_str == "sldId" {
                    for attr in e.attributes().flatten() {
                        let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
                        // Only the namespaced relationship id (`r:id`, or any
                        // other prefix) is wanted; the bare `id` attribute is
                        // the numeric slide id and does not end with `:id`.
                        if key.ends_with(":id") {
                            rel_ids.push(String::from_utf8_lossy(&attr.value).to_string());
                        }
                    }
                }
            }
            Ok(Event::Eof) | Err(_) => break,
            _ => {}
        }
    }

    rel_ids
        .iter()
        .enumerate()
        .filter_map(|(i, rid)| {
            let rel = pres_rels.get(rid)?;
            // A leading '/' marks a package-absolute part name.
            let target = rel.target.trim_start_matches('/');
            let path = if target.starts_with("ppt/") {
                target.to_string()
            } else {
                format!("ppt/{target}")
            };
            Some(SlideInfo {
                number: i + 1,
                path,
            })
        })
        .collect()
}
// ---- Slide content parsing ----
/// Parse a slide XML and extract shape contents in document order.
///
/// The event stream is walked once. A `<p:sp>`, `<p:graphicFrame>` or `<p:pic>`
/// seen outside any other such element opens a container; a depth counter per
/// container kind finds the matching End tag. While a container is open, every
/// nested element is routed to the matching `handle_*` helper. `<p:grpSp>` is
/// transparent: its children are processed as if they were top-level.
///
/// Line breaks: `<a:br>` is a child of `<a:p>` (a sibling of `<a:r>`), and it
/// may be self-closing or carry an `<a:rPr>` child. Both forms are honoured
/// anywhere inside a paragraph — as `\n` in shape text and as a space in table
/// cells.
///
/// On an XML error a `MalformedSegment` warning is recorded and parsing stops;
/// shapes completed before the error are still returned.
///
/// Returns (shapes, warnings).
fn parse_slide(xml: &str) -> (Vec<ShapeContent>, Vec<ConversionWarning>) {
    let mut reader = Reader::from_str(xml);
    let mut shapes: Vec<ShapeContent> = Vec::new();
    let mut warnings: Vec<ConversionWarning> = Vec::new();

    // Shape-level state
    let mut in_shape = false; // inside <p:sp>
    let mut in_graphic_frame = false; // inside <p:graphicFrame>
    let mut in_picture = false; // inside <p:pic>
    let mut placeholder_type: Option<PlaceholderType> = None;

    // Text body state
    let mut in_text_body = false;
    let mut in_paragraph = false;
    let mut in_run = false;
    let mut in_text = false;
    let mut current_paragraph = String::new();
    let mut shape_paragraphs: Vec<String> = Vec::new();

    // Table state
    let mut in_table = false;
    let mut in_table_row = false;
    let mut in_table_cell = false;
    let mut table_rows: Vec<Vec<String>> = Vec::new();
    let mut current_row: Vec<String> = Vec::new();
    let mut current_cell = String::new();

    // Track text state within table cells
    let mut in_cell_paragraph = false;
    let mut in_cell_run = false;
    let mut in_cell_text = false;

    // Image state
    let mut current_blip_rel_id: Option<String> = None;
    let mut current_image_alt: Option<String> = None;

    // Track depth for nested elements
    let mut shape_depth: u32 = 0;
    let mut graphic_frame_depth: u32 = 0;
    let mut picture_depth: u32 = 0;

    // Group shape depth: <p:grpSp> is a transparent container — child shapes
    // (sp, graphicFrame, pic) are processed normally. The counter tracks nesting
    // for proper End-tag matching but does not gate any logic.
    let mut group_depth: u32 = 0;

    loop {
        match reader.read_event() {
            Ok(Event::Start(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                match local_str {
                    "grpSp" if !in_shape && !in_graphic_frame && !in_picture => {
                        group_depth += 1;
                    }
                    "sp" if !in_shape && !in_graphic_frame && !in_picture => {
                        in_shape = true;
                        shape_depth = 1;
                        placeholder_type = None;
                        shape_paragraphs.clear();
                    }
                    "graphicFrame" if !in_shape && !in_graphic_frame && !in_picture => {
                        in_graphic_frame = true;
                        graphic_frame_depth = 1;
                    }
                    "pic" if !in_shape && !in_graphic_frame && !in_picture => {
                        in_picture = true;
                        picture_depth = 1;
                        current_blip_rel_id = None;
                        current_image_alt = None;
                    }
                    _ if in_shape => {
                        shape_depth += 1;
                        // `<a:br><a:rPr/></a:br>` arrives as Start, not Empty.
                        if local_str == "br" && in_paragraph {
                            current_paragraph.push('\n');
                        }
                        handle_shape_start(
                            local_str,
                            e,
                            &mut placeholder_type,
                            &mut in_text_body,
                            &mut in_paragraph,
                            &mut in_run,
                            &mut in_text,
                            &mut current_paragraph,
                        );
                    }
                    _ if in_graphic_frame => {
                        graphic_frame_depth += 1;
                        // Same non-empty `<a:br>` form, flattened to a space in cells.
                        if local_str == "br" && in_cell_paragraph {
                            current_cell.push(' ');
                        }
                        handle_graphic_frame_start(
                            local_str,
                            &mut in_table,
                            &mut in_table_row,
                            &mut in_table_cell,
                            &mut in_cell_paragraph,
                            &mut in_cell_run,
                            &mut in_cell_text,
                            &mut current_cell,
                            &mut current_row,
                            &mut table_rows,
                        );
                    }
                    _ if in_picture => {
                        picture_depth += 1;
                        handle_picture_start(
                            local_str,
                            e,
                            &mut current_blip_rel_id,
                            &mut current_image_alt,
                        );
                    }
                    _ => {}
                }
            }
            Ok(Event::Empty(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if in_shape {
                    // `<a:br/>` sits next to runs, not inside them, so a break
                    // is valid whenever a paragraph is open.
                    handle_shape_empty(
                        local_str,
                        e,
                        &mut placeholder_type,
                        in_run || in_paragraph,
                        &mut current_paragraph,
                    );
                } else if in_graphic_frame {
                    handle_graphic_frame_empty(
                        local_str,
                        in_cell_run || in_cell_paragraph,
                        &mut current_cell,
                    );
                } else if in_picture {
                    handle_picture_start(
                        local_str,
                        e,
                        &mut current_blip_rel_id,
                        &mut current_image_alt,
                    );
                }
            }
            Ok(Event::Text(ref e)) => {
                if in_shape && in_text && in_run {
                    let text = e.unescape().unwrap_or_default().to_string();
                    current_paragraph.push_str(&text);
                } else if in_graphic_frame && in_cell_text && in_cell_run {
                    let text = e.unescape().unwrap_or_default().to_string();
                    current_cell.push_str(&text);
                }
            }
            Ok(Event::End(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if in_shape {
                    shape_depth -= 1;
                    match local_str {
                        "t" => in_text = false,
                        "r" => {
                            in_run = false;
                            in_text = false;
                        }
                        "p" if in_paragraph => {
                            // Move the buffer out instead of clone + clear.
                            let para = std::mem::take(&mut current_paragraph);
                            if !para.is_empty() {
                                shape_paragraphs.push(para);
                            }
                            in_paragraph = false;
                        }
                        "txBody" => in_text_body = false,
                        _ => {}
                    }
                    if shape_depth == 0 {
                        // Finalize shape
                        if let Some(c) = finalize_shape(&placeholder_type, &shape_paragraphs) {
                            shapes.push(c);
                        }
                        in_shape = false;
                        placeholder_type = None;
                        shape_paragraphs.clear();
                        in_text_body = false;
                        in_paragraph = false;
                        in_run = false;
                        in_text = false;
                    }
                } else if in_graphic_frame {
                    graphic_frame_depth -= 1;
                    match local_str {
                        "t" if in_table_cell => in_cell_text = false,
                        "r" if in_table_cell => {
                            in_cell_run = false;
                            in_cell_text = false;
                        }
                        "p" if in_cell_paragraph => {
                            in_cell_paragraph = false;
                        }
                        "tc" if in_table_cell => {
                            current_row.push(current_cell.trim().to_string());
                            current_cell.clear();
                            in_table_cell = false;
                            in_cell_paragraph = false;
                            in_cell_run = false;
                            in_cell_text = false;
                        }
                        "tr" if in_table_row => {
                            table_rows.push(std::mem::take(&mut current_row));
                            in_table_row = false;
                        }
                        "tbl" if in_table => {
                            // Finalize table: first row is the header, the
                            // rest are data rows. An empty table emits nothing.
                            let mut rows = std::mem::take(&mut table_rows).into_iter();
                            if let Some(headers) = rows.next() {
                                shapes.push(ShapeContent::Table {
                                    headers,
                                    rows: rows.collect(),
                                });
                            }
                            in_table = false;
                        }
                        _ => {}
                    }
                    if graphic_frame_depth == 0 {
                        in_graphic_frame = false;
                        in_table = false;
                        in_table_row = false;
                        in_table_cell = false;
                        in_cell_paragraph = false;
                        in_cell_run = false;
                        in_cell_text = false;
                    }
                } else if in_picture {
                    picture_depth -= 1;
                    if picture_depth == 0 {
                        // A picture without an embedded blip yields no image.
                        if let Some(rel_id) = current_blip_rel_id.take() {
                            shapes.push(ShapeContent::Image {
                                rel_id,
                                alt_text: current_image_alt.take(),
                            });
                        }
                        in_picture = false;
                        current_image_alt = None;
                    }
                } else if local_str == "grpSp" && group_depth > 0 {
                    group_depth -= 1;
                }
            }
            Ok(Event::Eof) => break,
            Err(err) => {
                warnings.push(ConversionWarning {
                    code: WarningCode::MalformedSegment,
                    message: format!("XML parse error in slide: {err}"),
                    location: None,
                });
                break;
            }
            _ => {}
        }
    }
    (shapes, warnings)
}
/// React to an element opened (Start event) inside a `<p:sp>` shape.
///
/// * `ph` — records the placeholder kind from its `type` attribute
///   (`Other` when the attribute is absent or unrecognised).
/// * `txBody` / `p` / `r` / `t` — switch on the matching nesting flag; each
///   inner flag is only set while its parent flag is already set. Opening a
///   paragraph also empties `current_paragraph`.
///
/// All other element names are ignored.
#[allow(clippy::too_many_arguments)]
fn handle_shape_start(
    local_str: &str,
    e: &quick_xml::events::BytesStart,
    placeholder_type: &mut Option<PlaceholderType>,
    in_text_body: &mut bool,
    in_paragraph: &mut bool,
    in_run: &mut bool,
    in_text: &mut bool,
    current_paragraph: &mut String,
) {
    if local_str == "ph" {
        // <p:ph type="title"/> or <p:ph type="ctrTitle"/> etc.
        let mut kind = PlaceholderType::Other;
        for attr in e.attributes().flatten() {
            let attr_name = attr.key.local_name();
            if std::str::from_utf8(attr_name.as_ref()).unwrap_or("") != "type" {
                continue;
            }
            let raw = String::from_utf8_lossy(&attr.value);
            kind = match raw.as_ref() {
                "title" => PlaceholderType::Title,
                "ctrTitle" => PlaceholderType::CenterTitle,
                "subTitle" => PlaceholderType::SubTitle,
                "body" => PlaceholderType::Body,
                _ => PlaceholderType::Other,
            };
        }
        *placeholder_type = Some(kind);
    } else if local_str == "txBody" {
        *in_text_body = true;
    } else if local_str == "p" && *in_text_body {
        current_paragraph.clear();
        *in_paragraph = true;
    } else if local_str == "r" && *in_paragraph {
        *in_run = true;
    } else if local_str == "t" && *in_run {
        *in_text = true;
    }
}
/// React to a self-closing element (Empty event) inside a `<p:sp>` shape.
///
/// * `ph` — records the placeholder kind from its `type` attribute
///   (`Other` when the attribute is absent or unrecognised).
/// * `br` — appends a newline to `current_paragraph`, but only when `in_run`
///   is true.
///
/// All other element names are ignored.
fn handle_shape_empty(
    local_str: &str,
    e: &quick_xml::events::BytesStart,
    placeholder_type: &mut Option<PlaceholderType>,
    in_run: bool,
    current_paragraph: &mut String,
) {
    if local_str == "br" {
        if in_run {
            current_paragraph.push('\n');
        }
        return;
    }
    if local_str != "ph" {
        return;
    }

    // The last `type` attribute wins; no `type` at all means `Other`.
    let kind = e
        .attributes()
        .flatten()
        .filter(|attr| {
            std::str::from_utf8(attr.key.local_name().as_ref()).unwrap_or("") == "type"
        })
        .map(|attr| {
            let raw = String::from_utf8_lossy(&attr.value);
            match raw.as_ref() {
                "title" => PlaceholderType::Title,
                "ctrTitle" => PlaceholderType::CenterTitle,
                "subTitle" => PlaceholderType::SubTitle,
                "body" => PlaceholderType::Body,
                _ => PlaceholderType::Other,
            }
        })
        .last()
        .unwrap_or(PlaceholderType::Other);
    *placeholder_type = Some(kind);
}
/// React to an element opened (Start event) inside a `<p:graphicFrame>`.
///
/// Tracks the `tbl` → `tr` → `tc` → `p` → `r` → `t` nesting: each level's flag
/// is only raised while its parent level's flag is set. Opening a table, row or
/// cell also empties the corresponding accumulator. A second (or later)
/// paragraph in the same cell is separated from the preceding text by a single
/// space.
#[allow(clippy::too_many_arguments)]
fn handle_graphic_frame_start(
    local_str: &str,
    in_table: &mut bool,
    in_table_row: &mut bool,
    in_table_cell: &mut bool,
    in_cell_paragraph: &mut bool,
    in_cell_run: &mut bool,
    in_cell_text: &mut bool,
    current_cell: &mut String,
    current_row: &mut Vec<String>,
    table_rows: &mut Vec<Vec<String>>,
) {
    if local_str == "tbl" {
        table_rows.clear();
        *in_table = true;
    } else if local_str == "tr" && *in_table {
        current_row.clear();
        *in_table_row = true;
    } else if local_str == "tc" && *in_table_row {
        current_cell.clear();
        *in_table_cell = true;
    } else if local_str == "p" && *in_table_cell {
        *in_cell_paragraph = true;
        // Keep text of consecutive paragraphs from running together.
        if !current_cell.is_empty() {
            current_cell.push(' ');
        }
    } else if local_str == "r" && *in_cell_paragraph {
        *in_cell_run = true;
    } else if local_str == "t" && *in_cell_run {
        *in_cell_text = true;
    }
}
/// React to a self-closing element (Empty event) inside a `<p:graphicFrame>`.
///
/// Only `br` is of interest: when `in_cell_run` is true it is flattened to a
/// single space in `current_cell`. Everything else is ignored.
fn handle_graphic_frame_empty(local_str: &str, in_cell_run: bool, current_cell: &mut String) {
    match (local_str, in_cell_run) {
        ("br", true) => current_cell.push(' '),
        _ => {}
    }
}
/// Handle a Start/Empty event inside a <p:pic>.
///
/// * `blip` — stores the value of its namespaced `embed` attribute (`r:embed`
///   or any other prefix) in `current_blip_rel_id`.
/// * `cNvPr` — stores a non-empty `descr` attribute in `current_image_alt`.
///   The value is XML-unescaped, so alt text such as `Q&amp;A` is returned as
///   `Q&A`; if unescaping fails the raw (lossily decoded) bytes are used.
///
/// All other element names are ignored.
fn handle_picture_start(
    local_str: &str,
    e: &quick_xml::events::BytesStart,
    current_blip_rel_id: &mut Option<String>,
    current_image_alt: &mut Option<String>,
) {
    match local_str {
        "blip" => {
            for attr in e.attributes().flatten() {
                let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
                // Matches "r:embed" as well as any other namespace prefix.
                if key.ends_with(":embed") {
                    let val = String::from_utf8_lossy(&attr.value).to_string();
                    *current_blip_rel_id = Some(val);
                }
            }
        }
        "cNvPr" => {
            for attr in e.attributes().flatten() {
                let local_name = attr.key.local_name();
                let key = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
                if key == "descr" {
                    // Attribute values are stored escaped in the XML; resolve
                    // entities so they do not leak into the Markdown alt text.
                    let val = attr
                        .unescape_value()
                        .map(|v| v.into_owned())
                        .unwrap_or_else(|_| String::from_utf8_lossy(&attr.value).into_owned());
                    if !val.is_empty() {
                        *current_image_alt = Some(val);
                    }
                }
            }
        }
        _ => {}
    }
}
/// Turn the paragraphs collected for one shape into a [`ShapeContent`].
///
/// Paragraphs are joined with `\n` and the result is trimmed; a shape with no
/// remaining text yields `None`. Title and centered-title placeholders become
/// [`ShapeContent::Title`]; every other shape — subtitle, body, any other
/// placeholder, or no placeholder at all — becomes [`ShapeContent::Body`].
fn finalize_shape(
    placeholder_type: &Option<PlaceholderType>,
    paragraphs: &[String],
) -> Option<ShapeContent> {
    // Joining an empty slice gives "", which the emptiness check below rejects.
    let joined = paragraphs.join("\n");
    let trimmed = joined.trim();
    if trimmed.is_empty() {
        return None;
    }
    let text = trimmed.to_string();

    let content = match placeholder_type {
        Some(PlaceholderType::Title | PlaceholderType::CenterTitle) => ShapeContent::Title(text),
        Some(PlaceholderType::SubTitle | PlaceholderType::Body | PlaceholderType::Other)
        | None => ShapeContent::Body(text),
    };
    Some(content)
}
// ---- Notes parsing ----
/// Parse a notes slide XML and extract the body text.
///
/// Only extracts text from the body placeholder (ignores slide number
/// placeholders): the first `<p:sp>` whose `<p:ph>` has `type="body"` and whose
/// text is non-empty after trimming wins. Its paragraphs are joined with `\n`.
///
/// Line breaks: `<a:br>` is a child of `<a:p>` (a sibling of `<a:r>`) and may
/// be self-closing or carry an `<a:rPr>` child; both forms add a `\n` whenever
/// a paragraph is open.
///
/// Parsing is best-effort: an XML error ends the scan and `None` is returned
/// unless a body had already been found.
fn parse_notes(xml: &str) -> Option<String> {
    let mut reader = Reader::from_str(xml);
    let mut in_shape = false;
    let mut shape_depth: u32 = 0;
    let mut is_body_placeholder = false;
    let mut in_text_body = false;
    let mut in_paragraph = false;
    let mut in_run = false;
    let mut in_text = false;
    let mut current_paragraph = String::new();
    let mut paragraphs: Vec<String> = Vec::new();
    loop {
        match reader.read_event() {
            Ok(Event::Start(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if local_str == "sp" && !in_shape {
                    in_shape = true;
                    shape_depth = 1;
                    is_body_placeholder = false;
                    paragraphs.clear();
                } else if in_shape {
                    shape_depth += 1;
                    match local_str {
                        "ph" => {
                            for attr in e.attributes().flatten() {
                                let local_name = attr.key.local_name();
                                let key = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
                                if key == "type" {
                                    let val = String::from_utf8_lossy(&attr.value);
                                    if val.as_ref() == "body" {
                                        is_body_placeholder = true;
                                    }
                                }
                            }
                        }
                        "txBody" => in_text_body = true,
                        "p" if in_text_body => {
                            in_paragraph = true;
                            current_paragraph.clear();
                        }
                        "r" if in_paragraph => in_run = true,
                        "t" if in_run => in_text = true,
                        // `<a:br><a:rPr/></a:br>` arrives as Start, not Empty.
                        "br" if in_paragraph => current_paragraph.push('\n'),
                        _ => {}
                    }
                }
            }
            Ok(Event::Empty(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if in_shape {
                    if local_str == "ph" {
                        for attr in e.attributes().flatten() {
                            let local_name = attr.key.local_name();
                            let key = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
                            if key == "type" {
                                let val = String::from_utf8_lossy(&attr.value);
                                if val.as_ref() == "body" {
                                    is_body_placeholder = true;
                                }
                            }
                        }
                    } else if local_str == "br" && (in_run || in_paragraph) {
                        // `<a:br/>` sits next to runs, not inside them, so it
                        // must be accepted anywhere within an open paragraph.
                        current_paragraph.push('\n');
                    }
                }
            }
            Ok(Event::Text(ref e)) if in_shape && in_text && in_run => {
                let text = e.unescape().unwrap_or_default().to_string();
                current_paragraph.push_str(&text);
            }
            Ok(Event::End(ref e)) => {
                let local = e.local_name();
                let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if in_shape {
                    shape_depth -= 1;
                    match local_str {
                        "t" => in_text = false,
                        "r" => {
                            in_run = false;
                            in_text = false;
                        }
                        "p" if in_paragraph => {
                            // Move the buffer out instead of clone + clear.
                            let para = std::mem::take(&mut current_paragraph);
                            if !para.is_empty() {
                                paragraphs.push(para);
                            }
                            in_paragraph = false;
                        }
                        "txBody" => in_text_body = false,
                        _ => {}
                    }
                    if shape_depth == 0 {
                        // Shape closed: return its text if it was the body
                        // placeholder, otherwise reset and keep scanning.
                        if is_body_placeholder && !paragraphs.is_empty() {
                            let text = paragraphs.join("\n").trim().to_string();
                            if !text.is_empty() {
                                return Some(text);
                            }
                        }
                        in_shape = false;
                        is_body_placeholder = false;
                        paragraphs.clear();
                        in_text_body = false;
                        in_paragraph = false;
                        in_run = false;
                        in_text = false;
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(_) => break,
            _ => {}
        }
    }
    None
}
/// Look up the notes-slide target among a slide's relationships.
///
/// Returns the (unresolved) `target` of the first relationship whose type
/// contains `notesSlide`, or `None` when the slide has no such relationship.
fn resolve_notes_path(slide_rels: &HashMap<String, Relationship>) -> Option<String> {
    slide_rels
        .values()
        .find(|rel| rel.rel_type.contains("notesSlide"))
        .map(|rel| rel.target.clone())
}
// ---- Markdown rendering ----
/// Render a single slide's content as Markdown and as plain text.
///
/// * The heading is `## Slide N: Title` (or `## Slide N` without a title). A
///   title spanning several lines is flattened onto one line for the heading so
///   the Markdown heading is never broken; the plain-text output keeps the
///   title as extracted.
/// * Body text, tables and images follow in shape order. The first title shape
///   is used for the heading; title shapes are not repeated in the body.
/// * Images are emitted with unique placeholder alt text `__img_N__`;
///   `image_counter` is incremented for each image to ensure uniqueness across
///   slides. Images whose `rel_id` is missing from `image_filenames` are
///   skipped.
/// * Speaker notes are appended as a `> Note: ...` blockquote (Markdown) or
///   verbatim (plain text).
///
/// Returns `(markdown, plain_text, image_infos)`, with trailing whitespace
/// trimmed from both texts.
fn render_slide(
    number: usize,
    shapes: &[ShapeContent],
    notes: &Option<String>,
    image_filenames: &HashMap<String, String>,
    slide_key: &str,
    image_counter: &mut usize,
) -> (String, String, Vec<ImageInfo>) {
    let mut out = String::new();
    let mut plain = String::new();
    let mut image_infos: Vec<ImageInfo> = Vec::new();

    // Find the title
    let title = shapes.iter().find_map(|s| match s {
        ShapeContent::Title(t) => Some(t.as_str()),
        _ => None,
    });

    // Slide heading
    if let Some(title_text) = title {
        // Multi-paragraph titles are joined with '\n' upstream; a heading must
        // stay on a single line, so join the lines with spaces.
        let heading_title = title_text
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .collect::<Vec<_>>()
            .join(" ");
        out.push_str(&format!("## Slide {number}: {heading_title}\n\n"));
        plain.push_str(&format!("{title_text}\n\n"));
    } else {
        out.push_str(&format!("## Slide {number}\n\n"));
        plain.push('\n');
    }

    // Body content, tables, and images (skip title since it's already in heading)
    for shape in shapes {
        match shape {
            ShapeContent::Title(_) => {} // Already rendered as heading
            ShapeContent::Body(text) => {
                out.push_str(text);
                out.push_str("\n\n");
                plain.push_str(text);
                plain.push_str("\n\n");
            }
            ShapeContent::Table { headers, rows } => {
                let header_refs: Vec<&str> = headers.iter().map(|s| s.as_str()).collect();
                let row_refs: Vec<Vec<&str>> = rows
                    .iter()
                    .map(|r| r.iter().map(|s| s.as_str()).collect())
                    .collect();
                out.push_str(&build_table(&header_refs, &row_refs));
                out.push('\n');
                plain.push_str(&build_table_plain(&header_refs, &row_refs));
                plain.push('\n');
            }
            ShapeContent::Image { rel_id, alt_text } => {
                if let Some(filename) = image_filenames.get(rel_id) {
                    let placeholder = format!("__img_{n}__", n = *image_counter);
                    *image_counter += 1;
                    out.push_str(&format!("![{placeholder}]({filename})\n\n"));
                    // Plain text: image description placeholder (resolved later)
                    plain.push_str(&format!("{placeholder}\n\n"));
                    image_infos.push(ImageInfo {
                        placeholder,
                        original_alt: alt_text.clone().unwrap_or_default(),
                        filename: filename.clone(),
                        bytes_key: format!("{slide_key}::{rel_id}"),
                    });
                }
            }
        }
    }

    // Notes
    if let Some(notes_text) = notes {
        let mut lines = notes_text.lines();
        if let Some(first) = lines.next() {
            out.push_str(&format!("> Note: {first}"));
            for line in lines {
                out.push_str(&format!("\n> {line}"));
            }
            out.push_str("\n\n");
            // Plain text: notes without blockquote prefix
            plain.push_str(notes_text);
            plain.push_str("\n\n");
        }
    }

    // Trim trailing whitespace
    (
        out.trim_end().to_string(),
        plain.trim_end().to_string(),
        image_infos,
    )
}
// ---- Comment extraction ----
/// Build an `id` → `name` map from a comment-author registry part.
///
/// `elem` selects which element (by local name) describes an author:
/// `cmAuthor` for the legacy `ppt/commentAuthors.xml`
/// (`<p:cmAuthor id="0" name="Julie Lee"/>`) or `author` for the modern
/// `ppt/authors.xml` (`<p188:author id="{GUID}" name="Julie Lee"/>`).
/// Elements without an `id` are skipped; a missing `name` becomes the empty
/// string. Both values are XML-unescaped. Scanning stops at the first XML
/// error, keeping the authors read so far.
fn parse_author_registry(xml: &str, elem: &str) -> HashMap<String, String> {
    let mut registry = HashMap::new();
    let mut reader = Reader::from_str(xml);
    loop {
        match reader.read_event() {
            Ok(Event::Eof) | Err(_) => break,
            Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
                let is_author = std::str::from_utf8(e.local_name().as_ref()).unwrap_or("") == elem;
                if !is_author {
                    continue;
                }
                let Some(id) = attr_value_unescaped(e, "id") else {
                    continue;
                };
                let display_name = attr_value_unescaped(e, "name").unwrap_or_default();
                registry.insert(id, display_name);
            }
            _ => {}
        }
    }
    registry
}
/// A raw PPTX comment before it is turned into a rendered [`Comment`].
#[derive(Debug, Clone)]
struct RawPptxComment {
/// Author display name looked up from the author registry by `authorId`;
/// empty when the id is missing or unknown.
author: String,
/// Timestamp attribute as written in the file (`dt` for legacy comments,
/// `created` for modern ones); empty when absent.
date: String,
/// Comment text, concatenated from the comment's text nodes.
body: String,
/// `true` for a modern `<reply>` element, `false` for a top-level comment.
is_reply: bool,
}
/// Read the comments of a legacy comment part (`ppt/comments/commentN.xml`).
///
/// Each `<p:cm authorId="0" dt="..." idx="1">` becomes one [`RawPptxComment`]:
/// the author name is looked up in `authors` by `authorId`, the date is the
/// `dt` attribute, and the body is the text of its `<p:text>` child. A
/// self-closing `<p:cm .../>` yields a comment with an empty body. Legacy
/// comments are never replies. Scanning stops at the first XML error, keeping
/// the comments read so far.
fn parse_legacy_comments(xml: &str, authors: &HashMap<String, String>) -> Vec<RawPptxComment> {
    let mut reader = Reader::from_str(xml);
    let mut collected = Vec::new();

    // Fields of the `<p:cm>` currently being read.
    let mut pending_author = String::new();
    let mut pending_date = String::new();
    let mut pending_body = String::new();
    let mut inside_cm = false;
    let mut inside_text = false;

    loop {
        let event = match reader.read_event() {
            Ok(Event::Eof) | Err(_) => break,
            Ok(ev) => ev,
        };
        match event {
            // Only a Start event raises `inside_text`: a self-closing
            // `<p:text/>` has no End event, so raising the flag for it would
            // leave it stuck and leak later text into the body.
            Event::Start(ref e) => {
                let local = e.local_name();
                let tag = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if tag == "cm" {
                    inside_cm = true;
                    pending_body = String::new();
                    pending_author = attr_value_unescaped(e, "authorId")
                        .and_then(|id| authors.get(&id).cloned())
                        .unwrap_or_default();
                    pending_date = attr_value_unescaped(e, "dt").unwrap_or_default();
                } else if tag == "text" && inside_cm {
                    inside_text = true;
                }
            }
            // A childless `<p:cm .../>` is still a complete (empty) comment.
            Event::Empty(ref e) => {
                let local = e.local_name();
                if std::str::from_utf8(local.as_ref()).unwrap_or("") == "cm" {
                    let author = attr_value_unescaped(e, "authorId")
                        .and_then(|id| authors.get(&id).cloned())
                        .unwrap_or_default();
                    let date = attr_value_unescaped(e, "dt").unwrap_or_default();
                    collected.push(RawPptxComment {
                        author,
                        date,
                        body: String::new(),
                        is_reply: false,
                    });
                }
            }
            Event::Text(ref e) if inside_text => {
                pending_body.push_str(&e.unescape().unwrap_or_default());
            }
            Event::End(ref e) => {
                let local = e.local_name();
                let tag = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if tag == "text" {
                    inside_text = false;
                } else if tag == "cm" && inside_cm {
                    collected.push(RawPptxComment {
                        author: std::mem::take(&mut pending_author),
                        date: std::mem::take(&mut pending_date),
                        body: std::mem::take(&mut pending_body),
                        is_reply: false,
                    });
                    inside_cm = false;
                }
            }
            _ => {}
        }
    }
    collected
}
/// Read the comments of a modern comment part
/// (`ppt/comments/modernComment_*.xml`).
///
/// Each `<p188:cm authorId="{GUID}" created="...">` and each nested `reply`
/// element becomes one [`RawPptxComment`]; replies are flagged with
/// `is_reply = true`. The body is the concatenated text of the `<a:t>` elements
/// seen while that comment is the innermost open one. Results are returned in
/// the order the elements start in the document, so a parent always precedes
/// its replies. Scanning stops at the first XML error, keeping what was
/// completed so far.
fn parse_modern_comments(xml: &str, authors: &HashMap<String, String>) -> Vec<RawPptxComment> {
    /// A comment or reply whose element has been opened.
    struct Frame {
        /// Start order in the document; used to restore document order, since
        /// a parent closes after the replies nested inside it.
        seq: usize,
        author: String,
        date: String,
        body: String,
        is_reply: bool,
    }

    // Capture a cm/reply element's attributes into a fresh frame.
    let new_frame = |e: &quick_xml::events::BytesStart, is_reply: bool, seq: usize| Frame {
        seq,
        author: attr_value_unescaped(e, "authorId")
            .and_then(|id| authors.get(&id).cloned())
            .unwrap_or_default(),
        date: attr_value_unescaped(e, "created").unwrap_or_default(),
        body: String::new(),
        is_reply,
    };

    let mut reader = Reader::from_str(xml);
    // Comments nest (a `cm` contains its `replyLst`), so open comments live on
    // a stack and text always goes to the innermost one. Closed comments are
    // gathered separately and ordered by `seq` at the end.
    let mut open: Vec<Frame> = Vec::new();
    let mut closed: Vec<Frame> = Vec::new();
    let mut seq_counter: usize = 0;
    let mut capturing = false;

    loop {
        match reader.read_event() {
            Ok(Event::Eof) | Err(_) => break,
            Ok(Event::Start(ref e)) => {
                let local = e.local_name();
                let tag = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if tag == "cm" || tag == "reply" {
                    open.push(new_frame(e, tag == "reply", seq_counter));
                    seq_counter += 1;
                } else if tag == "t" && !open.is_empty() {
                    // Only a Start event raises the flag: a self-closing
                    // `<a:t/>` has no End event to lower it again.
                    capturing = true;
                }
            }
            Ok(Event::Empty(ref e)) => {
                // A self-closing cm/reply has no body and is complete at once.
                let local = e.local_name();
                let tag = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if tag == "cm" || tag == "reply" {
                    closed.push(new_frame(e, tag == "reply", seq_counter));
                    seq_counter += 1;
                }
            }
            Ok(Event::Text(ref e)) if capturing => {
                if let Some(innermost) = open.last_mut() {
                    innermost.body.push_str(&e.unescape().unwrap_or_default());
                }
            }
            Ok(Event::End(ref e)) => {
                let local = e.local_name();
                let tag = std::str::from_utf8(local.as_ref()).unwrap_or("");
                if tag == "t" {
                    capturing = false;
                } else if tag == "cm" || tag == "reply" {
                    if let Some(done) = open.pop() {
                        closed.push(done);
                    }
                }
            }
            _ => {}
        }
    }

    closed.sort_by_key(|frame| frame.seq);
    closed
        .into_iter()
        .map(|frame| RawPptxComment {
            author: frame.author,
            date: frame.date,
            body: frame.body,
            is_reply: frame.is_reply,
        })
        .collect()
}
/// Produce the `source` label attached to a slide's comments.
///
/// The label is `Slide N: Title` when a title is given and is non-empty after
/// its whitespace has been collapsed, otherwise just `Slide N`. Collapsing the
/// whitespace keeps a multi-line title on a single line, and the finished label
/// is capped at `comments::SOURCE_CAP`.
fn slide_label(number: usize, title: Option<&str>) -> String {
    let flat_title = title
        .map(comments::collapse_ws)
        .filter(|t| !t.is_empty());
    let label = if let Some(t) = flat_title {
        format!("Slide {number}: {t}")
    } else {
        format!("Slide {number}")
    };
    comments::cap_text(&label, comments::SOURCE_CAP)
}
/// Convert raw PPTX comments for one slide into rendered [`Comment`]s.
fn build_pptx_comments(raw: Vec<RawPptxComment>, source: &str) -> Vec<Comment> {
raw.into_iter()
.map(|rc| Comment {
author: comments::format_author(&rc.author, &rc.date),
body: comments::collapse_ws(&rc.body),
source: source.to_string(),
is_reply: rc.is_reply,
})
.collect()
}
// ---- Internal conversion (parse + image extraction, no resolution) ----
impl PptxConverter {
    /// Parse the presentation and extract images without resolving placeholders.
    ///
    /// Steps:
    /// 1. Open the ZIP and validate it against `options.max_uncompressed_zip_bytes`.
    /// 2. Read `ppt/_rels/presentation.xml.rels` (optional) and
    ///    `ppt/presentation.xml` (required) and resolve the slide order.
    /// 3. For every slide: parse shapes, read its rels, load speaker notes,
    ///    optionally collect comments, resolve image filenames (and bytes, while
    ///    the `options.max_total_image_bytes` budget lasts), then render it.
    /// 4. Join slides with `---` rules (Markdown) or blank lines (plain text).
    ///
    /// Returns the conversion result (with unresolved placeholders in markdown),
    /// pending image data for later resolution (sync or async), and any extracted
    /// comments (empty unless `options.extract_comments` is set). Comments are
    /// appended to the output by the caller, after image placeholders resolve.
    ///
    /// # Errors
    ///
    /// Fails when the archive cannot be opened or fails the budget check, when a
    /// ZIP entry read reports an error, or with
    /// [`ConvertError::MalformedDocument`] when `ppt/presentation.xml` is
    /// missing. A missing slide part is not an error: it is reported as a
    /// `SkippedElement` warning and the slide is skipped.
    pub(crate) fn convert_inner(
        &self,
        data: &[u8],
        options: &ConversionOptions,
    ) -> Result<(ConversionResult, PendingImageResolution, Vec<Comment>), ConvertError> {
        let cursor = Cursor::new(data);
        let mut archive = ZipArchive::new(cursor)?;
        crate::zip_utils::validate_zip_budget(&mut archive, options.max_uncompressed_zip_bytes)?;
        let mut warnings: Vec<ConversionWarning> = Vec::new();
        let mut images: Vec<(String, Vec<u8>)> = Vec::new();

        // 1. Parse presentation.xml.rels (optional but needed for slide resolution)
        let pres_rels = match read_zip_text(&mut archive, "ppt/_rels/presentation.xml.rels")? {
            Some(xml) => parse_relationships(&xml),
            None => HashMap::new(),
        };

        // 2. Parse presentation.xml (required)
        let pres_xml = read_zip_text(&mut archive, "ppt/presentation.xml")?.ok_or_else(|| {
            ConvertError::MalformedDocument {
                reason: "missing ppt/presentation.xml".to_string(),
            }
        })?;

        // 3. Resolve slide order
        let slides = resolve_slide_order(&pres_xml, &pres_rels);
        if slides.is_empty() {
            return Ok((
                ConversionResult {
                    markdown: String::new(),
                    ..Default::default()
                },
                PendingImageResolution::default(),
                Vec::new(),
            ));
        }

        // Load comment author registries (legacy + modern) once, if requested.
        let comment_authors = if options.extract_comments {
            let mut map = HashMap::new();
            if let Some(xml) = read_zip_text_lossy(&mut archive, "ppt/commentAuthors.xml")? {
                map.extend(parse_author_registry(&xml, "cmAuthor"));
            }
            if let Some(xml) = read_zip_text_lossy(&mut archive, "ppt/authors.xml")? {
                map.extend(parse_author_registry(&xml, "author"));
            }
            map
        } else {
            HashMap::new()
        };

        // 4. Process each slide — collect all image infos and bytes across slides
        let mut slide_markdowns: Vec<String> = Vec::new();
        let mut slide_plains: Vec<String> = Vec::new();
        let mut document_title: Option<String> = None;
        let mut total_image_bytes: usize = 0;
        let mut image_counter: usize = 0;
        let mut all_image_infos: Vec<ImageInfo> = Vec::new();
        let mut all_image_bytes: HashMap<String, Vec<u8>> = HashMap::new();
        let mut all_comments: Vec<Comment> = Vec::new();

        for slide_info in &slides {
            // Read slide XML
            let slide_xml = match read_zip_text(&mut archive, &slide_info.path)? {
                Some(xml) => xml,
                None => {
                    warnings.push(ConversionWarning {
                        code: WarningCode::SkippedElement,
                        message: format!("slide file not found: {}", slide_info.path),
                        location: Some(slide_info.path.clone()),
                    });
                    continue;
                }
            };

            // Parse slide content
            let (shapes, mut slide_warnings) = parse_slide(&slide_xml);
            warnings.append(&mut slide_warnings);

            // Read slide rels for notes and images
            let slide_rels_path = derive_rels_path(&slide_info.path);
            let slide_rels = match read_zip_text(&mut archive, &slide_rels_path)? {
                Some(xml) => parse_relationships(&xml),
                None => HashMap::new(),
            };

            // Parse notes
            let notes = if let Some(notes_target) = resolve_notes_path(&slide_rels) {
                let notes_path = resolve_relative_to_file(&slide_info.path, &notes_target);
                match read_zip_text(&mut archive, &notes_path)? {
                    Some(xml) => parse_notes(&xml),
                    None => None,
                }
            } else {
                None
            };

            // Extract comments anchored to this slide (legacy + modern schemes).
            if options.extract_comments {
                let slide_title = shapes.iter().find_map(|s| match s {
                    ShapeContent::Title(t) => Some(t.as_str()),
                    _ => None,
                });
                let source = slide_label(slide_info.number, slide_title);

                // Collect comment-part targets, sorted by path for deterministic
                // ordering when a slide references more than one comment file.
                // The modern scheme uses the office/2018/10 namespace; legacy
                // uses the 2006 one.
                let mut comment_targets: Vec<(String, bool)> = slide_rels
                    .values()
                    .filter(|rel| rel.rel_type.contains("comments"))
                    .map(|rel| {
                        let path = resolve_relative_to_file(&slide_info.path, &rel.target);
                        (path, rel.rel_type.contains("2018"))
                    })
                    .collect();
                comment_targets.sort();

                // A slide can carry BOTH a modern and a legacy comment part for
                // back-compat, describing the same threads. Prefer modern and
                // skip legacy in that case to avoid double-reporting.
                let has_modern = comment_targets.iter().any(|(_, m)| *m);
                for (path, is_modern) in comment_targets {
                    if has_modern && !is_modern {
                        continue;
                    }
                    let Some(xml) = read_zip_text_lossy(&mut archive, &path)? else {
                        continue;
                    };
                    let raw = if is_modern {
                        parse_modern_comments(&xml, &comment_authors)
                    } else {
                        parse_legacy_comments(&xml, &comment_authors)
                    };
                    all_comments.extend(build_pptx_comments(raw, &source));
                }
            }

            // Resolve image filenames and optionally extract image data
            let need_image_bytes = options.extract_images || options.image_describer.is_some();
            let mut image_filenames: HashMap<String, String> = HashMap::new();
            for shape in &shapes {
                if let ShapeContent::Image { rel_id, .. } = shape
                    && let Some(rel) = slide_rels.get(rel_id)
                {
                    let image_path = resolve_relative_to_file(&slide_info.path, &rel.target);
                    let filename = image_path.rsplit('/').next().unwrap_or(&image_path);
                    image_filenames.insert(rel_id.clone(), filename.to_string());

                    // Bytes are only loaded while the running total is still
                    // under the budget; an unreadable image is silently skipped.
                    if need_image_bytes
                        && total_image_bytes < options.max_total_image_bytes
                        && let Ok(Some(img_data)) = read_zip_bytes(&mut archive, &image_path)
                    {
                        total_image_bytes += img_data.len();
                        if total_image_bytes <= options.max_total_image_bytes {
                            if options.extract_images {
                                images.push((filename.to_string(), img_data.clone()));
                            }
                            let bytes_key = format!("{}::{}", slide_info.path, rel_id);
                            all_image_bytes.insert(bytes_key, img_data);
                        } else {
                            warnings.push(ConversionWarning {
                                code: WarningCode::ResourceLimitReached,
                                message: format!(
                                    "total image bytes exceeded limit ({})",
                                    options.max_total_image_bytes
                                ),
                                location: Some(image_path),
                            });
                        }
                    }
                }
            }

            // Set document title from first slide's title
            if document_title.is_none() {
                document_title = shapes.iter().find_map(|s| {
                    if let ShapeContent::Title(t) = s {
                        Some(t.clone())
                    } else {
                        None
                    }
                });
            }

            let (slide_md, slide_plain, slide_image_infos) = render_slide(
                slide_info.number,
                &shapes,
                &notes,
                &image_filenames,
                &slide_info.path,
                &mut image_counter,
            );
            all_image_infos.extend(slide_image_infos);
            slide_markdowns.push(slide_md);
            slide_plains.push(slide_plain);
        }

        // Join slides with horizontal rule separator (markdown) or blank line (plain text)
        let markdown = slide_markdowns.join("\n\n---\n\n");
        let markdown = if markdown.is_empty() {
            markdown
        } else {
            format!("{markdown}\n")
        };
        let plain_text = slide_plains.join("\n\n");
        let plain_text = if plain_text.is_empty() {
            plain_text
        } else {
            format!("{plain_text}\n")
        };

        let result = ConversionResult {
            markdown,
            plain_text,
            title: document_title,
            images,
            warnings,
        };
        let pending = PendingImageResolution {
            infos: all_image_infos,
            bytes: all_image_bytes,
        };
        Ok((result, pending, all_comments))
    }
}
// ---- Converter trait impl ----
impl Converter for PptxConverter {
    /// File extensions handled by this converter: only `pptx`.
    fn supported_extensions(&self) -> &[&str] {
        &["pptx"]
    }

    /// Convert raw PPTX bytes into a [`ConversionResult`].
    ///
    /// Runs `convert_inner`, then hands the pending image infos/bytes and the
    /// optional image describer from `options` to `resolve_image_placeholders`,
    /// and finally passes the collected comments to `comments::append_comments`
    /// for both the Markdown and the plain-text output.
    ///
    /// # Errors
    ///
    /// Propagates any [`ConvertError`] returned by `convert_inner`.
    fn convert(
        &self,
        data: &[u8],
        options: &ConversionOptions,
    ) -> Result<ConversionResult, ConvertError> {
        let (mut converted, deferred_images, slide_comments) =
            self.convert_inner(data, options)?;

        let describer = options.image_describer.as_deref();
        resolve_image_placeholders(
            &mut converted.markdown,
            &mut converted.plain_text,
            &deferred_images.infos,
            &deferred_images.bytes,
            describer,
            &mut converted.warnings,
        );

        comments::append_comments(
            &mut converted.markdown,
            &mut converted.plain_text,
            &slide_comments,
        );
        Ok(converted)
    }
}
#[cfg(test)]
mod tests {
use super::*;
// ---- Helper: build minimal PPTX ZIP in memory ----
/// Declarative description of one slide used by the in-memory PPTX fixture builders.
struct TestSlide<'a> {
/// Text of the title placeholder shape; `None` omits the shape entirely.
title: Option<&'a str>,
/// One body placeholder shape is emitted per entry.
body_texts: Vec<&'a str>,
/// Speaker notes; when `Some`, a notes slide part and its relationship are written.
/// Each line of the text becomes its own paragraph.
notes: Option<&'a str>,
/// Optional table emitted as a graphic frame on the slide.
table: Option<TestTable<'a>>,
/// Rel IDs used as `r:embed` on each picture. The fixture builders always write the
/// matching relationships as `rIdImg1`, `rIdImg2`, ... so entries should follow that scheme.
images: Vec<&'a str>,
/// Alt text per image (parallel to `images`); a missing or `None` entry emits no
/// `descr` attribute.
image_alt_texts: Vec<Option<&'a str>>,
}
/// Table fixture for a [`TestSlide`].
struct TestTable<'a> {
/// Cell texts of the first table row.
headers: Vec<&'a str>,
/// Cell texts of the remaining rows, one inner `Vec` per row.
rows: Vec<Vec<&'a str>>,
}
/// Build a minimal PPTX ZIP in memory.
fn build_test_pptx(slides: &[TestSlide]) -> Vec<u8> {
use std::io::Write;
use zip::ZipWriter;
use zip::write::SimpleFileOptions;
let buf = Vec::new();
let mut zip = ZipWriter::new(Cursor::new(buf));
let opts = SimpleFileOptions::default();
// [Content_Types].xml
let mut ct = String::from(r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#);
ct.push_str(
r#"<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">"#,
);
ct.push_str(
r#"<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>"#,
);
ct.push_str(r#"<Default Extension="xml" ContentType="application/xml"/>"#);
ct.push_str("</Types>");
zip.start_file("[Content_Types].xml", opts).unwrap();
zip.write_all(ct.as_bytes()).unwrap();
// Build presentation.xml with slide references
let mut pres_xml = String::from(
r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:sldIdLst>"#,
);
let mut pres_rels_xml = String::from(
r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
);
for (i, slide) in slides.iter().enumerate() {
let slide_num = i + 1;
let rid = format!("rId{slide_num}");
let slide_id = 256 + i;
pres_xml.push_str(&format!(r#"<p:sldId id="{slide_id}" r:id="{rid}"/>"#));
pres_rels_xml.push_str(&format!(
r#"<Relationship Id="{rid}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{slide_num}.xml"/>"#
));
// Build slide XML
let slide_xml = build_slide_xml(slide);
zip.start_file(format!("ppt/slides/slide{slide_num}.xml"), opts)
.unwrap();
zip.write_all(slide_xml.as_bytes()).unwrap();
// Build slide rels if notes or images exist
if slide.notes.is_some() || !slide.images.is_empty() {
let mut slide_rels = String::from(
r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
);
if slide.notes.is_some() {
slide_rels.push_str(&format!(
r#"<Relationship Id="rIdNotes" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide{slide_num}.xml"/>"#
));
}
for (img_idx, _) in slide.images.iter().enumerate() {
let img_rid = format!("rIdImg{}", img_idx + 1);
slide_rels.push_str(&format!(
r#"<Relationship Id="{img_rid}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image{}.png"/>"#,
img_idx + 1
));
}
slide_rels.push_str("</Relationships>");
zip.start_file(format!("ppt/slides/_rels/slide{slide_num}.xml.rels"), opts)
.unwrap();
zip.write_all(slide_rels.as_bytes()).unwrap();
}
// Build notes slide if present
if let Some(notes_text) = slide.notes {
let notes_xml = build_notes_xml(notes_text);
zip.start_file(format!("ppt/notesSlides/notesSlide{slide_num}.xml"), opts)
.unwrap();
zip.write_all(notes_xml.as_bytes()).unwrap();
}
}
pres_xml.push_str("</p:sldIdLst></p:presentation>");
pres_rels_xml.push_str("</Relationships>");
zip.start_file("ppt/presentation.xml", opts).unwrap();
zip.write_all(pres_xml.as_bytes()).unwrap();
zip.start_file("ppt/_rels/presentation.xml.rels", opts)
.unwrap();
zip.write_all(pres_rels_xml.as_bytes()).unwrap();
let cursor = zip.finish().unwrap();
cursor.into_inner()
}
/// Build the XML for a single slide.
///
/// Shapes are emitted in a fixed order: title, body texts, table, pictures.
/// Text is interpolated verbatim (no XML escaping), so fixture strings must
/// already be valid XML character data.
fn build_slide_xml(slide: &TestSlide) -> String {
    let mut xml = String::from(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:cSld><p:spTree>"#,
    );

    // Title placeholder, only when the fixture defines one.
    if let Some(title) = slide.title {
        let title_shape = format!(
            r#"<p:sp><p:nvSpPr><p:cNvPr id="1" name="Title"/><p:cNvSpPr/><p:nvPr><p:ph type="title"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>{title}</a:t></a:r></a:p></p:txBody></p:sp>"#
        );
        xml.push_str(&title_shape);
    }

    // One body placeholder per body text.
    slide.body_texts.iter().for_each(|text| {
        xml.push_str(&format!(
            r#"<p:sp><p:nvSpPr><p:cNvPr id="2" name="Content"/><p:cNvSpPr/><p:nvPr><p:ph type="body"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>{text}</a:t></a:r></a:p></p:txBody></p:sp>"#
        ));
    });

    // Table as a graphic frame: header row first, then the data rows.
    if let Some(table) = slide.table.as_ref() {
        xml.push_str(r#"<p:graphicFrame><p:nvGraphicFramePr><p:cNvPr id="3" name="Table"/><p:cNvGraphicFramePr/><p:nvPr/></p:nvGraphicFramePr><a:graphic><a:graphicData><a:tbl>"#);

        let header_cells: String = table
            .headers
            .iter()
            .map(|header| {
                format!(
                    r#"<a:tc><a:txBody><a:p><a:r><a:t>{header}</a:t></a:r></a:p></a:txBody></a:tc>"#
                )
            })
            .collect();
        xml.push_str("<a:tr>");
        xml.push_str(&header_cells);
        xml.push_str("</a:tr>");

        for row in &table.rows {
            let row_cells: String = row
                .iter()
                .map(|cell| {
                    format!(
                        r#"<a:tc><a:txBody><a:p><a:r><a:t>{cell}</a:t></a:r></a:p></a:txBody></a:tc>"#
                    )
                })
                .collect();
            xml.push_str("<a:tr>");
            xml.push_str(&row_cells);
            xml.push_str("</a:tr>");
        }

        xml.push_str("</a:tbl></a:graphicData></a:graphic></p:graphicFrame>");
    }

    // Pictures; the `descr` attribute is present only when alt text is given
    // at the same index in `image_alt_texts`.
    for (idx, rel_id) in slide.images.iter().enumerate() {
        let descr_attr = match slide.image_alt_texts.get(idx) {
            Some(Some(alt)) => format!(r#" descr="{alt}""#),
            _ => String::new(),
        };
        xml.push_str(&format!(
            r#"<p:pic><p:nvPicPr><p:cNvPr id="{}"{descr_attr} name="Picture"/><p:cNvPicPr/><p:nvPr/></p:nvPicPr><p:blipFill><a:blip r:embed="{rel_id}"/></p:blipFill></p:pic>"#,
            10 + idx
        ));
    }

    xml.push_str("</p:spTree></p:cSld></p:sld>");
    xml
}
/// Build the XML for a notes slide.
///
/// The notes slide contains a slide-number placeholder (fixed text `1`)
/// followed by a body placeholder. Every line of `text` becomes its own
/// `<a:p>` paragraph inside the body placeholder; an empty `text` yields an
/// empty `<p:txBody>`. Text is interpolated verbatim (no XML escaping).
fn build_notes_xml(text: &str) -> String {
    // One paragraph per input line, concatenated without separators.
    let para_xml: String = text
        .lines()
        .map(|p| format!(r#"<a:p><a:r><a:t>{p}</a:t></a:r></a:p>"#))
        .collect();
    format!(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:notes xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:sp><p:nvSpPr><p:cNvPr id="1" name="Slide Number"/><p:cNvSpPr/><p:nvPr><p:ph type="sldNum"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>1</a:t></a:r></a:p></p:txBody></p:sp><p:sp><p:nvSpPr><p:cNvPr id="2" name="Notes"/><p:cNvSpPr/><p:nvPr><p:ph type="body"/></p:nvPr></p:nvSpPr><p:txBody>{para_xml}</p:txBody></p:sp></p:spTree></p:cSld></p:notes>"#
    )
}
// ---- Tests ----
/// The converter advertises exactly one extension: `pptx`.
#[test]
fn test_pptx_supported_extensions() {
    let extensions = PptxConverter.supported_extensions();
    assert_eq!(extensions, &["pptx"]);
}
/// `can_convert` accepts `pptx` and rejects other office/document extensions.
#[test]
fn test_pptx_can_convert() {
    let converter = PptxConverter;
    assert!(converter.can_convert("pptx", &[]));
    for rejected in ["docx", "xlsx", "pdf"] {
        assert!(!converter.can_convert(rejected, &[]));
    }
}
/// Bytes that are not a ZIP archive must produce an error, not a panic.
#[test]
fn test_pptx_invalid_data_returns_error() {
    let garbage: &[u8] = b"not a valid pptx file";
    let outcome = PptxConverter.convert(garbage, &ConversionOptions::default());
    assert!(outcome.is_err());
}
/// A presentation with no slides converts to empty Markdown and no title.
#[test]
fn test_pptx_empty_presentation() {
    let pptx = build_test_pptx(&[]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    assert_eq!(converted.markdown, "");
    assert!(converted.title.is_none());
}
/// A slide with a title and one body text renders the `## Slide 1: <title>`
/// heading and the body text.
#[test]
fn test_pptx_single_slide_title_and_body() {
    let slide = TestSlide {
        title: Some("Hello World"),
        body_texts: vec!["This is the body text."],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    for expected in ["## Slide 1: Hello World", "This is the body text."] {
        assert!(converted.markdown.contains(expected));
    }
}
/// Two slides are rendered as numbered sections joined by a horizontal rule.
#[test]
fn test_pptx_multiple_slides_with_separator() {
    let first = TestSlide {
        title: Some("First"),
        body_texts: vec!["Body one."],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let second = TestSlide {
        title: Some("Second"),
        body_texts: vec!["Body two."],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[first, second]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    for expected in ["## Slide 1: First", "## Slide 2: Second", "\n\n---\n\n"] {
        assert!(converted.markdown.contains(expected));
    }
}
/// Without a title shape the heading is just `## Slide 1` (no colon suffix).
#[test]
fn test_pptx_slide_without_title() {
    let untitled = TestSlide {
        title: None,
        body_texts: vec!["Just body text."],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[untitled]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    let markdown = &converted.markdown;
    assert!(markdown.contains("## Slide 1\n"));
    // Should NOT have a colon when no title
    assert!(!markdown.contains("## Slide 1:"));
}
/// A `ctrTitle` placeholder is parsed as a title shape.
#[test]
fn test_pptx_title_center_title() {
    // Build a PPTX with ctrTitle placeholder type
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:sp><p:nvSpPr><p:cNvPr id="1" name="Title"/><p:cNvSpPr/><p:nvPr><p:ph type="ctrTitle"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>Center Title</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    assert_eq!(shapes.len(), 1);
    let ShapeContent::Title(text) = &shapes[0] else {
        panic!("expected Title, got {:?}", &shapes[0]);
    };
    assert_eq!(text, "Center Title");
}
/// The document title is taken from the first slide that has a title.
#[test]
fn test_pptx_document_title_from_first_slide() {
    let opening = TestSlide {
        title: Some("Presentation Title"),
        body_texts: vec![],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let follow_up = TestSlide {
        title: Some("Second Slide"),
        body_texts: vec![],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[opening, follow_up]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    assert_eq!(converted.title.as_deref(), Some("Presentation Title"));
}
/// Multiple `<a:p>` paragraphs in one body shape end up in a single body
/// text that contains both paragraphs and a newline.
#[test]
fn test_pptx_body_text_multiple_paragraphs() {
    // Build slide XML with multiple paragraphs in body
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:sp><p:nvSpPr><p:cNvPr id="1" name="Content"/><p:cNvSpPr/><p:nvPr><p:ph type="body"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>First paragraph</a:t></a:r></a:p><a:p><a:r><a:t>Second paragraph</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    assert_eq!(shapes.len(), 1);
    let ShapeContent::Body(text) = &shapes[0] else {
        panic!("expected Body, got {:?}", &shapes[0]);
    };
    assert!(text.contains("First paragraph"));
    assert!(text.contains("Second paragraph"));
    assert!(text.contains('\n'));
}
/// Adjacent runs within one paragraph are concatenated without a separator.
#[test]
fn test_pptx_body_text_multiple_runs_joined() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:sp><p:nvSpPr><p:cNvPr id="1" name="Content"/><p:cNvSpPr/><p:nvPr><p:ph type="body"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>Hello </a:t></a:r><a:r><a:t>World</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    let ShapeContent::Body(text) = &shapes[0] else {
        panic!("expected Body, got {:?}", &shapes[0]);
    };
    assert_eq!(text, "Hello World");
}
/// A `subTitle` placeholder following a title is parsed as body content.
#[test]
fn test_pptx_subtitle_treated_as_body() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:sp><p:nvSpPr><p:cNvPr id="1" name="Title"/><p:cNvSpPr/><p:nvPr><p:ph type="title"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>Main Title</a:t></a:r></a:p></p:txBody></p:sp><p:sp><p:nvSpPr><p:cNvPr id="2" name="Subtitle"/><p:cNvSpPr/><p:nvPr><p:ph type="subTitle"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>The subtitle</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    assert_eq!(shapes.len(), 2);

    let ShapeContent::Title(title) = &shapes[0] else {
        panic!("expected Title, got {:?}", &shapes[0]);
    };
    assert_eq!(title, "Main Title");

    let ShapeContent::Body(subtitle) = &shapes[1] else {
        panic!("expected Body, got {:?}", &shapes[1]);
    };
    assert_eq!(subtitle, "The subtitle");
}
/// A slide table is rendered as a Markdown pipe table with a separator row.
#[test]
fn test_pptx_table_basic() {
    let table = TestTable {
        headers: vec!["Name", "Value"],
        rows: vec![vec!["Alpha", "100"], vec!["Beta", "200"]],
    };
    let slide = TestSlide {
        title: Some("Data"),
        body_texts: vec![],
        notes: None,
        table: Some(table),
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    let expected_lines = [
        "| Name | Value |",
        "|---|---|",
        "| Alpha | 100 |",
        "| Beta | 200 |",
    ];
    for line in expected_lines {
        assert!(converted.markdown.contains(line));
    }
}
/// Empty table cells are preserved as empty columns in the Markdown row.
#[test]
fn test_pptx_table_empty_cells() {
    let table = TestTable {
        headers: vec!["A", "B", "C"],
        rows: vec![vec!["1", "", "3"]],
    };
    let slide = TestSlide {
        title: None,
        body_texts: vec![],
        notes: None,
        table: Some(table),
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    let markdown = &converted.markdown;
    assert!(markdown.contains("| A | B | C |"));
    assert!(markdown.contains("| 1 | | 3 |"));
}
/// Speaker notes are rendered as a `> Note: ...` blockquote.
#[test]
fn test_pptx_notes_basic() {
    let slide = TestSlide {
        title: Some("Slide"),
        body_texts: vec!["Content."],
        notes: Some("This is a speaker note."),
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("> Note: This is a speaker note."));
}
/// Multi-paragraph notes: the first line gets the `Note:` prefix and every
/// following line is its own blockquote line.
#[test]
fn test_pptx_notes_multiline() {
    let slide = TestSlide {
        title: Some("Slide"),
        body_texts: vec![],
        notes: Some("First line\nSecond line\nThird line"),
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    for expected in ["> Note: First line", "> Second line", "> Third line"] {
        assert!(converted.markdown.contains(expected));
    }
}
/// A slide without notes produces no `> Note:` blockquote.
#[test]
fn test_pptx_notes_missing() {
    let slide = TestSlide {
        title: Some("Slide"),
        body_texts: vec!["Text."],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    let has_note = converted.markdown.contains("> Note:");
    assert!(!has_note);
}
/// Korean, Chinese and Japanese text survives conversion unchanged.
#[test]
fn test_pptx_unicode_cjk() {
    let slide = TestSlide {
        title: Some("다국어"),
        body_texts: vec!["한국어 테스트", "中文测试", "日本語テスト"],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    for expected in ["한국어 테스트", "中文测试", "日本語テスト", "다국어"] {
        assert!(converted.markdown.contains(expected));
    }
}
/// Emoji in body text are preserved in the Markdown output.
#[test]
fn test_pptx_emoji() {
    let slide = TestSlide {
        title: Some("Emoji Test"),
        body_texts: vec!["Rocket: 🚀 Stars: ✨ Earth: 🌍"],
        notes: None,
        table: None,
        images: vec![],
        image_alt_texts: vec![],
    };
    let pptx = build_test_pptx(&[slide]);
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();
    for emoji in ["🚀", "✨", "🌍"] {
        assert!(converted.markdown.contains(emoji));
    }
}
/// A ZIP without `ppt/presentation.xml` is rejected with a descriptive error.
#[test]
fn test_pptx_missing_presentation_xml() {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    let mut archive = ZipWriter::new(Cursor::new(Vec::new()));
    let file_opts = SimpleFileOptions::default();
    // Just a content types file, no presentation.xml
    archive.start_file("[Content_Types].xml", file_opts).unwrap();
    archive.write_all(b"<?xml version=\"1.0\"?><Types xmlns=\"http://schemas.openxmlformats.org/package/2006/content-types\"></Types>").unwrap();
    let pptx = archive.finish().unwrap().into_inner();

    let outcome = PptxConverter.convert(&pptx, &ConversionOptions::default());
    assert!(outcome.is_err());
    let err = outcome.unwrap_err();
    assert!(
        err.to_string().contains("missing ppt/presentation.xml"),
        "error was: {err}"
    );
}
/// A slide referenced by `presentation.xml` but absent from the archive is
/// skipped with a `SkippedElement` warning instead of failing the conversion.
#[test]
fn test_pptx_missing_slide_file_graceful() {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    let mut archive = ZipWriter::new(Cursor::new(Vec::new()));
    let file_opts = SimpleFileOptions::default();

    archive.start_file("[Content_Types].xml", file_opts).unwrap();
    archive.write_all(b"<?xml version=\"1.0\"?><Types xmlns=\"http://schemas.openxmlformats.org/package/2006/content-types\"></Types>").unwrap();

    // presentation.xml references a slide that doesn't exist
    let pres_xml = r#"<?xml version="1.0"?><p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:sldIdLst><p:sldId id="256" r:id="rId1"/></p:sldIdLst></p:presentation>"#;
    archive.start_file("ppt/presentation.xml", file_opts).unwrap();
    archive.write_all(pres_xml.as_bytes()).unwrap();

    let pres_rels = r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/></Relationships>"#;
    archive
        .start_file("ppt/_rels/presentation.xml.rels", file_opts)
        .unwrap();
    archive.write_all(pres_rels.as_bytes()).unwrap();

    let pptx = archive.finish().unwrap().into_inner();
    let converted = PptxConverter
        .convert(&pptx, &ConversionOptions::default())
        .unwrap();

    assert!(!converted.warnings.is_empty());
    let first_warning = &converted.warnings[0];
    assert_eq!(first_warning.code, WarningCode::SkippedElement);
    assert!(first_warning.message.contains("slide file not found"));
}
/// `derive_rels_path` inserts `_rels/` before the file name and appends `.rels`.
#[test]
fn test_pptx_derive_rels_path() {
    let cases = [
        ("ppt/slides/slide1.xml", "ppt/slides/_rels/slide1.xml.rels"),
        ("ppt/presentation.xml", "ppt/_rels/presentation.xml.rels"),
        ("file.xml", "_rels/file.xml.rels"),
    ];
    for (part_path, expected_rels) in cases {
        assert_eq!(derive_rels_path(part_path), expected_rels);
    }
}
/// Relationship targets resolve against the slide's directory: `..` segments
/// are collapsed, a leading `/` is treated as archive-rooted, and bare names
/// stay next to the slide.
#[test]
fn test_pptx_resolve_relative_to_file() {
    let base = "ppt/slides/slide1.xml";
    let cases = [
        ("../media/image1.png", "ppt/media/image1.png"),
        ("/ppt/media/image1.png", "ppt/media/image1.png"),
        (
            "../notesSlides/notesSlide1.xml",
            "ppt/notesSlides/notesSlide1.xml",
        ),
        ("chart1.xml", "ppt/slides/chart1.xml"),
    ];
    for (target, expected) in cases {
        assert_eq!(resolve_relative_to_file(base, target), expected);
    }
}
/// A `<p:pic>` with `<a:blip r:embed=...>` is parsed as an image carrying
/// that relationship ID.
#[test]
fn test_pptx_image_reference_detected() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:cSld><p:spTree><p:pic><p:nvPicPr><p:cNvPr id="1" name="Picture"/><p:cNvPicPr/><p:nvPr/></p:nvPicPr><p:blipFill><a:blip r:embed="rId2"/></p:blipFill></p:pic></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    assert_eq!(shapes.len(), 1);
    let ShapeContent::Image { rel_id, .. } = &shapes[0] else {
        panic!("expected Image, got {:?}", &shapes[0]);
    };
    assert_eq!(rel_id, "rId2");
}
/// The `descr` attribute on `<p:cNvPr>` becomes the image's alt text.
#[test]
fn test_pptx_image_alt_text_extracted() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:cSld><p:spTree><p:pic><p:nvPicPr><p:cNvPr id="10" descr="A beautiful chart" name="Picture"/><p:cNvPicPr/><p:nvPr/></p:nvPicPr><p:blipFill><a:blip r:embed="rId2"/></p:blipFill></p:pic></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    assert_eq!(shapes.len(), 1);
    let ShapeContent::Image { rel_id, alt_text } = &shapes[0] else {
        panic!("expected Image, got {:?}", &shapes[0]);
    };
    assert_eq!(rel_id, "rId2");
    assert_eq!(alt_text.as_deref(), Some("A beautiful chart"));
}
/// A picture without a `descr` attribute yields no alt text.
#[test]
fn test_pptx_image_alt_text_missing() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:cSld><p:spTree><p:pic><p:nvPicPr><p:cNvPr id="10" name="Picture"/><p:cNvPicPr/><p:nvPr/></p:nvPicPr><p:blipFill><a:blip r:embed="rId3"/></p:blipFill></p:pic></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    assert_eq!(shapes.len(), 1);
    let ShapeContent::Image { rel_id, alt_text } = &shapes[0] else {
        panic!("expected Image, got {:?}", &shapes[0]);
    };
    assert_eq!(rel_id, "rId3");
    assert!(alt_text.is_none());
}
/// The alt text of a picture and the file name of its relationship target
/// both appear in the rendered Markdown.
#[test]
fn test_pptx_image_alt_text_in_markdown() {
    let data = build_test_pptx(&[TestSlide {
        title: Some("Images"),
        body_texts: vec![],
        notes: None,
        table: None,
        images: vec!["rIdImg1"],
        image_alt_texts: vec![Some("A diagram of the architecture")],
    }]);
    let converter = PptxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    // The needle here used to be the empty string, which every string
    // contains, so the assertion could never fail. Check the two pieces of
    // the image reference that the fixture determines: the alt text and the
    // target file name (`../media/image1.png` in the slide rels).
    assert!(
        result.markdown.contains("A diagram of the architecture"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("image1.png"),
        "markdown was: {}",
        result.markdown
    );
}
/// An `<a:br/>` inside a paragraph becomes a newline in the body text.
#[test]
fn test_pptx_line_break() {
    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:sp><p:nvSpPr><p:cNvPr id="1" name="Content"/><p:cNvSpPr/><p:nvPr><p:ph type="body"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>Line one</a:t><a:br/><a:t>Line two</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#;
    let (shapes, _) = parse_slide(slide_xml);
    let ShapeContent::Body(text) = &shapes[0] else {
        panic!("expected Body, got {:?}", &shapes[0]);
    };
    assert!(text.contains("Line one\nLine two"));
}
// ---- Image describer tests ----
use crate::converter::ImageDescriber;
use std::sync::Arc;
/// Image describer stub that always succeeds with a fixed description.
struct MockDescriber {
    /// Text returned for every image, regardless of its bytes or MIME type.
    description: String,
}

impl ImageDescriber for MockDescriber {
    /// Ignores all inputs and returns a copy of the configured description.
    fn describe(
        &self,
        _image_bytes: &[u8],
        _mime_type: &str,
        _prompt: &str,
    ) -> Result<String, ConvertError> {
        let canned = self.description.to_owned();
        Ok(canned)
    }
}
/// Image describer stub that always fails.
struct FailingDescriber;

impl ImageDescriber for FailingDescriber {
    /// Ignores all inputs and returns an `ImageDescriptionError` with the
    /// reason `"API error"`.
    fn describe(
        &self,
        _image_bytes: &[u8],
        _mime_type: &str,
        _prompt: &str,
    ) -> Result<String, ConvertError> {
        let reason = String::from("API error");
        Err(ConvertError::ImageDescriptionError { reason })
    }
}
/// Build a PPTX with actual image data embedded in the ZIP for describer tests.
///
/// Parts are written in this order: `[Content_Types].xml`, then per slide the
/// slide part, its rels (only when the slide has notes or images) and its
/// notes slide, then `ppt/presentation.xml` with its rels, and finally every
/// `(path, bytes)` entry of `image_data` verbatim.
fn build_test_pptx_with_image_data(
    slides: &[TestSlide],
    image_data: &[(&str, &[u8])], // (path in zip, data)
) -> Vec<u8> {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    let mut zip = ZipWriter::new(Cursor::new(Vec::new()));
    let opts = SimpleFileOptions::default();

    // [Content_Types].xml
    let content_types = [
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#,
        r#"<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">"#,
        r#"<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>"#,
        r#"<Default Extension="xml" ContentType="application/xml"/>"#,
        r#"<Default Extension="png" ContentType="image/png"/>"#,
        "</Types>",
    ]
    .concat();
    zip.start_file("[Content_Types].xml", opts).unwrap();
    zip.write_all(content_types.as_bytes()).unwrap();

    // presentation.xml and its rels are accumulated while the slides are written.
    let mut pres_xml = String::from(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:sldIdLst>"#,
    );
    let mut pres_rels_xml = String::from(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
    );

    for (i, slide) in slides.iter().enumerate() {
        let slide_num = i + 1;
        let slide_id = 256 + i;
        let rid = format!("rId{slide_num}");

        pres_xml.push_str(&format!(r#"<p:sldId id="{slide_id}" r:id="{rid}"/>"#));
        pres_rels_xml.push_str(&format!(
            r#"<Relationship Id="{rid}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{slide_num}.xml"/>"#
        ));

        // Slide part
        zip.start_file(format!("ppt/slides/slide{slide_num}.xml"), opts)
            .unwrap();
        zip.write_all(build_slide_xml(slide).as_bytes()).unwrap();

        // Slide rels, only when there is something to reference
        let has_notes = slide.notes.is_some();
        if has_notes || !slide.images.is_empty() {
            let mut slide_rels = String::from(
                r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
            );
            if has_notes {
                slide_rels.push_str(&format!(
                    r#"<Relationship Id="rIdNotes" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide{slide_num}.xml"/>"#
                ));
            }
            // Image relationships are numbered from 1, independent of the
            // rel-ID strings stored in `slide.images`.
            for img_no in 1..=slide.images.len() {
                let img_rid = format!("rIdImg{}", img_no);
                slide_rels.push_str(&format!(
                    r#"<Relationship Id="{img_rid}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image{}.png"/>"#,
                    img_no
                ));
            }
            slide_rels.push_str("</Relationships>");
            zip.start_file(format!("ppt/slides/_rels/slide{slide_num}.xml.rels"), opts)
                .unwrap();
            zip.write_all(slide_rels.as_bytes()).unwrap();
        }

        // Notes slide part
        if let Some(notes_text) = slide.notes {
            zip.start_file(format!("ppt/notesSlides/notesSlide{slide_num}.xml"), opts)
                .unwrap();
            zip.write_all(build_notes_xml(notes_text).as_bytes())
                .unwrap();
        }
    }

    pres_xml.push_str("</p:sldIdLst></p:presentation>");
    pres_rels_xml.push_str("</Relationships>");
    zip.start_file("ppt/presentation.xml", opts).unwrap();
    zip.write_all(pres_xml.as_bytes()).unwrap();
    zip.start_file("ppt/_rels/presentation.xml.rels", opts)
        .unwrap();
    zip.write_all(pres_rels_xml.as_bytes()).unwrap();

    // Add image files
    for (path, data) in image_data {
        zip.start_file(path.to_string(), opts).unwrap();
        zip.write_all(data).unwrap();
    }

    zip.finish().unwrap().into_inner()
}
/// Two pictures on one slide are both rendered, with their alt texts, in the
/// order they appear on the slide.
#[test]
fn test_pptx_multiple_images_on_one_slide() {
    let data = build_test_pptx_with_image_data(
        &[TestSlide {
            title: Some("Gallery"),
            body_texts: vec![],
            notes: None,
            table: None,
            images: vec!["rIdImg1", "rIdImg2"],
            image_alt_texts: vec![Some("First image"), Some("Second image")],
        }],
        &[
            ("ppt/media/image1.png", b"fake-png-1"),
            ("ppt/media/image2.png", b"fake-png-2"),
        ],
    );
    let converter = PptxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    // The needles here used to be empty strings, which every string
    // contains, so these assertions could never fail. Check each image's alt
    // text instead; the file names are checked by the ordering block below.
    assert!(
        result.markdown.contains("First image"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("Second image"),
        "markdown was: {}",
        result.markdown
    );
    // Verify ordering: image1 appears before image2
    let pos1 = result.markdown.find("image1.png").unwrap();
    let pos2 = result.markdown.find("image2.png").unwrap();
    assert!(pos1 < pos2, "image1 should appear before image2");
}
/// With an image describer configured, its description shows up in the
/// Markdown for a picture that has no alt text, and no image bytes are
/// returned because `extract_images` is left at its default.
#[test]
fn test_pptx_image_describer_replaces_alt_text() {
    let data = build_test_pptx_with_image_data(
        &[TestSlide {
            title: Some("Slide with Image"),
            body_texts: vec![],
            notes: None,
            table: None,
            images: vec!["rIdImg1"],
            image_alt_texts: vec![None],
        }],
        &[("ppt/media/image1.png", b"fake-png-data")],
    );
    let converter = PptxConverter;
    let options = ConversionOptions {
        image_describer: Some(Arc::new(MockDescriber {
            description: "A diagram showing data flow".to_string(),
        })),
        ..Default::default()
    };
    let result = converter.convert(&data, &options).unwrap();
    // The needle here used to be the empty string, which every string
    // contains, so the assertion could never fail. Check that the describer's
    // text and the image file name are present.
    assert!(
        result.markdown.contains("A diagram showing data flow"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("image1.png"),
        "markdown was: {}",
        result.markdown
    );
    assert!(result.images.is_empty());
}
/// When the image describer fails, the picture keeps its original alt text
/// and a `SkippedElement` warning mentioning the failure is recorded.
#[test]
fn test_pptx_image_describer_error_keeps_original_alt() {
    let data = build_test_pptx_with_image_data(
        &[TestSlide {
            title: Some("Slide"),
            body_texts: vec![],
            notes: None,
            table: None,
            images: vec!["rIdImg1"],
            image_alt_texts: vec![Some("Original description")],
        }],
        &[("ppt/media/image1.png", b"fake-png-data")],
    );
    let converter = PptxConverter;
    let options = ConversionOptions {
        image_describer: Some(Arc::new(FailingDescriber)),
        ..Default::default()
    };
    let result = converter.convert(&data, &options).unwrap();
    // The needle here used to be the empty string, which every string
    // contains, so the assertion could never fail. Check that the original
    // alt text and the image file name survive the describer failure.
    assert!(
        result.markdown.contains("Original description"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("image1.png"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::SkippedElement
                && w.message.contains("image description failed")),
    );
}
// ---- Group shape tests ----
#[test]
fn test_pptx_group_shape_text_extracted() {
    // A lone <p:sp> wrapped in a <p:grpSp> should yield exactly one Body
    // shape carrying the text box's text.
    let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:grpSp><p:nvGrpSpPr><p:cNvPr id="10" name="Group 1"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/><p:sp><p:nvSpPr><p:cNvPr id="11" name="TextBox"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr><p:txBody><a:p><a:r><a:t>Group text</a:t></a:r></a:p></p:txBody></p:sp></p:grpSp></p:spTree></p:cSld></p:sld>"#;
    let (parsed, issues) = parse_slide(xml);
    assert!(issues.is_empty());
    assert_eq!(parsed.len(), 1);
    if let ShapeContent::Body(text) = &parsed[0] {
        assert_eq!(text, "Group text");
    } else {
        panic!("expected Body, got {:?}", parsed[0]);
    }
}
#[test]
fn test_pptx_group_shape_multiple_children() {
    // A group holding two text shapes should produce two Body entries, in
    // the same order as the children appear in the XML.
    let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:grpSp><p:nvGrpSpPr><p:cNvPr id="10" name="Group"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/><p:sp><p:nvSpPr><p:cNvPr id="11" name="Shape1"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr><p:txBody><a:p><a:r><a:t>First shape</a:t></a:r></a:p></p:txBody></p:sp><p:sp><p:nvSpPr><p:cNvPr id="12" name="Shape2"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr><p:txBody><a:p><a:r><a:t>Second shape</a:t></a:r></a:p></p:txBody></p:sp></p:grpSp></p:spTree></p:cSld></p:sld>"#;
    let (parsed, issues) = parse_slide(xml);
    assert!(issues.is_empty());
    let expected = ["First shape", "Second shape"];
    assert_eq!(parsed.len(), expected.len());
    for (shape, want) in parsed.iter().zip(expected) {
        match shape {
            ShapeContent::Body(text) => assert_eq!(text, want),
            other => panic!("expected Body, got {:?}", other),
        }
    }
}
#[test]
fn test_pptx_group_shape_nested() {
    // A text shape two group levels deep (<p:grpSp> within <p:grpSp>) should
    // still come out as a single Body shape.
    let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:grpSp><p:nvGrpSpPr><p:cNvPr id="10" name="Outer"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/><p:grpSp><p:nvGrpSpPr><p:cNvPr id="11" name="Inner"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/><p:sp><p:nvSpPr><p:cNvPr id="12" name="Deep"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr><p:txBody><a:p><a:r><a:t>Nested group text</a:t></a:r></a:p></p:txBody></p:sp></p:grpSp></p:grpSp></p:spTree></p:cSld></p:sld>"#;
    let (parsed, issues) = parse_slide(xml);
    assert!(issues.is_empty());
    assert_eq!(parsed.len(), 1);
    if let ShapeContent::Body(text) = &parsed[0] {
        assert_eq!(text, "Nested group text");
    } else {
        panic!("expected Body, got {:?}", parsed[0]);
    }
}
#[test]
fn test_pptx_group_shape_with_table() {
    // A table-bearing <p:graphicFrame> nested in a group should be parsed as
    // a Table whose first row becomes the headers.
    let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:grpSp><p:nvGrpSpPr><p:cNvPr id="10" name="Group"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/><p:graphicFrame><p:nvGraphicFramePr><p:cNvPr id="11" name="Table"/><p:cNvGraphicFramePr/><p:nvPr/></p:nvGraphicFramePr><a:graphic><a:graphicData><a:tbl><a:tr><a:tc><a:txBody><a:p><a:r><a:t>H1</a:t></a:r></a:p></a:txBody></a:tc><a:tc><a:txBody><a:p><a:r><a:t>H2</a:t></a:r></a:p></a:txBody></a:tc></a:tr><a:tr><a:tc><a:txBody><a:p><a:r><a:t>A</a:t></a:r></a:p></a:txBody></a:tc><a:tc><a:txBody><a:p><a:r><a:t>B</a:t></a:r></a:p></a:txBody></a:tc></a:tr></a:tbl></a:graphicData></a:graphic></p:graphicFrame></p:grpSp></p:spTree></p:cSld></p:sld>"#;
    let (parsed, issues) = parse_slide(xml);
    assert!(issues.is_empty());
    assert_eq!(parsed.len(), 1);
    if let ShapeContent::Table { headers, rows } = &parsed[0] {
        assert_eq!(headers, &["H1", "H2"]);
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0], &["A", "B"]);
    } else {
        panic!("expected Table, got {:?}", parsed[0]);
    }
}
#[test]
fn test_pptx_group_shape_with_image() {
    // A <p:pic> nested in a group should be parsed as an Image carrying the
    // blip's relationship id and the picture's `descr` as alt text.
    let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:cSld><p:spTree><p:grpSp><p:nvGrpSpPr><p:cNvPr id="10" name="Group"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/><p:pic><p:nvPicPr><p:cNvPr id="11" descr="Alt text" name="Picture"/><p:cNvPicPr/><p:nvPr/></p:nvPicPr><p:blipFill><a:blip r:embed="rIdImg1"/></p:blipFill></p:pic></p:grpSp></p:spTree></p:cSld></p:sld>"#;
    let (parsed, issues) = parse_slide(xml);
    assert!(issues.is_empty());
    assert_eq!(parsed.len(), 1);
    if let ShapeContent::Image { rel_id, alt_text } = &parsed[0] {
        assert_eq!(rel_id, "rIdImg1");
        assert_eq!(alt_text.as_deref(), Some("Alt text"));
    } else {
        panic!("expected Image, got {:?}", parsed[0]);
    }
}
#[test]
fn test_pptx_group_shape_empty() {
    // A group with no child shapes contributes nothing and raises no warnings.
    let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree><p:grpSp><p:nvGrpSpPr><p:cNvPr id="10" name="Empty Group"/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr><p:grpSpPr/></p:grpSp></p:spTree></p:cSld></p:sld>"#;
    let (parsed, issues) = parse_slide(xml);
    assert!(issues.is_empty());
    assert!(parsed.is_empty());
}
// ---- Comment extraction tests ----
// -- unit: author registries --
#[test]
fn test_parse_comment_authors_legacy() {
    // Legacy registry: each <p:cmAuthor> maps its `id` to its `name`.
    let registry_xml = r#"<?xml version="1.0"?><p:cmAuthorLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cmAuthor id="0" name="Julie Lee" initials="JL"/><p:cmAuthor id="1" name="Sam Park" initials="SP"/></p:cmAuthorLst>"#;
    let authors = parse_author_registry(registry_xml, "cmAuthor");
    for (id, expected) in [("0", "Julie Lee"), ("1", "Sam Park")] {
        assert_eq!(authors.get(id).map(|name| name.as_str()), Some(expected));
    }
}
#[test]
fn test_parse_authors_modern() {
    // Modern registry: <p188:author> entries are keyed by their GUID-style id.
    let registry_xml = r#"<?xml version="1.0"?><p188:authorLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main"><p188:author id="{GUID-1}" name="Julie Lee" initials="JL" userId="u1" providerId="AD"/></p188:authorLst>"#;
    let authors = parse_author_registry(registry_xml, "author");
    let resolved = authors.get("{GUID-1}").map(|name| name.as_str());
    assert_eq!(resolved, Some("Julie Lee"));
}
// -- unit: comment files --
#[test]
fn test_parse_legacy_comments() {
    // A single legacy <p:cm> resolves its author through the registry and
    // exposes the `dt` attribute and <p:text> content as date and body.
    let authors = HashMap::from([("0".to_string(), "Julie Lee".to_string())]);
    let comments_xml = r#"<?xml version="1.0"?><p:cmLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cm authorId="0" dt="2024-01-15T10:30:00.000" idx="1"><p:pos x="10" y="10"/><p:text>Add a diagram here.</p:text></p:cm></p:cmLst>"#;
    let raw = parse_legacy_comments(comments_xml, &authors);
    assert_eq!(raw.len(), 1);
    let comment = &raw[0];
    assert_eq!(comment.author, "Julie Lee");
    assert_eq!(comment.date, "2024-01-15T10:30:00.000");
    assert_eq!(comment.body, "Add a diagram here.");
    assert!(!comment.is_reply);
}
#[test]
fn test_parse_legacy_comments_unknown_author_empty() {
    // An authorId missing from the registry leaves the author empty at this
    // layer (it is resolved to Unknown downstream); the body is unaffected.
    let comments_xml = r#"<?xml version="1.0"?><p:cmLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cm authorId="7" dt="2024-01-15T10:30:00.000" idx="1"><p:text>Orphan author.</p:text></p:cm></p:cmLst>"#;
    let no_authors = HashMap::new();
    let raw = parse_legacy_comments(comments_xml, &no_authors);
    let orphan = &raw[0];
    assert_eq!(orphan.author, "");
    assert_eq!(orphan.body, "Orphan author.");
}
#[test]
fn test_parse_modern_comments_with_reply() {
    // A threaded comment with one nested reply yields two entries; the test
    // locates each by its `is_reply` flag rather than by position.
    let authors = HashMap::from([
        ("{A1}".to_string(), "Alice".to_string()),
        ("{A2}".to_string(), "Bob".to_string()),
    ]);
    let comments_xml = r#"<?xml version="1.0"?><p188:cmLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p188:cm id="{C1}" authorId="{A1}" created="2024-01-15T10:30:00.000" status="active"><p188:replyLst><p188:reply id="{C2}" authorId="{A2}" created="2024-01-15T11:00:00.000" status="active"><p188:txBody><a:bodyPr/><a:p><a:r><a:t>Agreed, will fix.</a:t></a:r></a:p></p188:txBody></p188:reply></p188:replyLst><p188:txBody><a:bodyPr/><a:p><a:r><a:t>Needs a clearer title.</a:t></a:r></a:p></p188:txBody></p188:cm></p188:cmLst>"#;
    let raw = parse_modern_comments(comments_xml, &authors);
    assert_eq!(raw.len(), 2);
    let (replies, top_level): (Vec<_>, Vec<_>) = raw.iter().partition(|c| c.is_reply);
    let reply = replies.first().expect("a reply");
    let top = top_level.first().expect("a top-level");
    assert_eq!(reply.author, "Bob");
    assert_eq!(reply.body, "Agreed, will fix.");
    assert_eq!(top.author, "Alice");
    assert_eq!(top.body, "Needs a clearer title.");
}
#[test]
fn test_slide_label() {
    // (slide number, optional title, expected label). A missing or
    // whitespace-only title falls back to the bare "Slide N" form.
    let cases = [
        (2, Some("Quarterly Results"), "Slide 2: Quarterly Results"),
        (2, None, "Slide 2"),
        (3, Some(" "), "Slide 3"),
    ];
    for (number, title, expected) in cases {
        assert_eq!(slide_label(number, title), expected);
    }
}
#[test]
fn test_slide_label_collapses_newline_in_title() {
    // The label is emitted inside a single-line `- **source**:` list item, so
    // a line break inside the title has to be flattened to a space.
    let label = slide_label(1, Some("Line1\nLine2"));
    let single_line = label.chars().all(|c| c != '\n');
    assert!(single_line, "label has a newline: {label:?}");
    assert_eq!(label, "Slide 1: Line1 Line2");
}
#[test]
fn test_parse_legacy_comments_self_closing_text_no_leak() {
    // A self-closing <p:text/> still produces a comment, with an empty body.
    let comments_xml = r#"<?xml version="1.0"?><p:cmLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cm authorId="0" dt="d"><p:text/></p:cm></p:cmLst>"#;
    let no_authors = HashMap::new();
    let raw = parse_legacy_comments(comments_xml, &no_authors);
    assert_eq!(raw.len(), 1);
    let only = &raw[0];
    assert_eq!(only.body, "");
}
#[test]
fn test_parse_modern_comments_self_closing_text_no_leak() {
    // An empty, self-closing <a:t/> run followed by a real run: only the real
    // run's text may end up in the comment body.
    let comments_xml = r#"<?xml version="1.0"?><p188:cmLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p188:cm authorId="a" created="c"><p188:txBody><a:p><a:r><a:t/></a:r><a:r><a:t>real</a:t></a:r></a:p></p188:txBody></p188:cm></p188:cmLst>"#;
    let no_authors = HashMap::new();
    let raw = parse_modern_comments(comments_xml, &no_authors);
    assert_eq!(raw.len(), 1);
    let only = &raw[0];
    assert_eq!(only.body, "real");
}
#[test]
fn test_parse_author_registry_unescapes_name() {
    // Author names must be XML-unescaped. The fixture therefore has to carry
    // the ampersand as the `&amp;` entity: a bare `&` inside an attribute
    // value is not well-formed XML and would never exercise the unescaping
    // step this test is meant to guard.
    let xml = r#"<?xml version="1.0"?><p:cmAuthorLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cmAuthor id="0" name="Ben &amp; Jerry"/></p:cmAuthorLst>"#;
    let authors = parse_author_registry(xml, "cmAuthor");
    assert_eq!(authors.get("0").map(|s| s.as_str()), Some("Ben & Jerry"));
}
// -- integration: build a PPTX with comment parts --
/// Assemble a minimal single-slide PPTX archive carrying one comment part.
///
/// With `modern` set, the slide relationship uses the 2018 comments type and
/// the archive gets `ppt/authors.xml` plus `ppt/comments/modernComment_x.xml`
/// (a comment with one reply). Otherwise it uses the legacy relationship type
/// with `ppt/commentAuthors.xml` and `ppt/comments/comment1.xml`.
/// `slide_title`, when present, becomes a title placeholder on the slide; the
/// text is interpolated into the XML verbatim.
fn build_pptx_with_comment(modern: bool, slide_title: Option<&str>) -> Vec<u8> {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    // Render the dynamic parts up front; the archive entries below are still
    // written in the same order as before.
    let title_shape = slide_title
        .map(|t| {
            format!(
                r#"<p:sp><p:nvSpPr><p:cNvPr id="1" name="Title"/><p:cNvSpPr/><p:nvPr><p:ph type="title"/></p:nvPr></p:nvSpPr><p:txBody><a:p><a:r><a:t>{t}</a:t></a:r></a:p></p:txBody></p:sp>"#
            )
        })
        .unwrap_or_default();
    let slide = format!(
        r#"<?xml version="1.0"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree>{title_shape}</p:spTree></p:cSld></p:sld>"#
    );
    let (rel_type, comment_path) = if modern {
        (
            "http://schemas.microsoft.com/office/2018/10/relationships/comments",
            "../comments/modernComment_x.xml",
        )
    } else {
        (
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
            "../comments/comment1.xml",
        )
    };
    let slide_rels = format!(
        r#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rIdC" Type="{rel_type}" Target="{comment_path}"/></Relationships>"#
    );

    let opts = SimpleFileOptions::default();
    let mut zip = ZipWriter::new(Cursor::new(Vec::new()));
    // Start a new archive entry and write its whole payload in one go.
    let mut add = |path: &str, payload: &[u8]| {
        zip.start_file(path, opts).unwrap();
        zip.write_all(payload).unwrap();
    };

    add(
        "[Content_Types].xml",
        br#"<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/></Types>"#,
    );
    // Presentation part and its relationships.
    add(
        "ppt/presentation.xml",
        br#"<?xml version="1.0"?><p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:sldIdLst><p:sldId id="256" r:id="rId1"/></p:sldIdLst></p:presentation>"#,
    );
    add(
        "ppt/_rels/presentation.xml.rels",
        br#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/></Relationships>"#,
    );
    // The slide, then the relationship that points at its comment part.
    add("ppt/slides/slide1.xml", slide.as_bytes());
    add("ppt/slides/_rels/slide1.xml.rels", slide_rels.as_bytes());
    // Author registry and comment file for the selected scheme.
    if modern {
        add(
            "ppt/authors.xml",
            br#"<?xml version="1.0"?><p188:authorLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main"><p188:author id="{A1}" name="Alice" userId="u" providerId="AD"/><p188:author id="{A2}" name="Bob" userId="u" providerId="AD"/></p188:authorLst>"#,
        );
        add(
            "ppt/comments/modernComment_x.xml",
            br#"<?xml version="1.0"?><p188:cmLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p188:cm id="{C1}" authorId="{A1}" created="2024-01-15T10:30:00.000"><p188:replyLst><p188:reply id="{C2}" authorId="{A2}" created="2024-01-15T11:00:00.000"><p188:txBody><a:bodyPr/><a:p><a:r><a:t>Agreed.</a:t></a:r></a:p></p188:txBody></p188:reply></p188:replyLst><p188:txBody><a:bodyPr/><a:p><a:r><a:t>Please clarify.</a:t></a:r></a:p></p188:txBody></p188:cm></p188:cmLst>"#,
        );
    } else {
        add(
            "ppt/commentAuthors.xml",
            br#"<?xml version="1.0"?><p:cmAuthorLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cmAuthor id="0" name="Julie Lee" initials="JL"/></p:cmAuthorLst>"#,
        );
        add(
            "ppt/comments/comment1.xml",
            br#"<?xml version="1.0"?><p:cmLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cm authorId="0" dt="2024-01-15T10:30:00.000" idx="1"><p:pos x="10" y="10"/><p:text>Add a diagram here.</p:text></p:cm></p:cmLst>"#,
        );
    }

    zip.finish().unwrap().into_inner()
}
#[test]
fn test_pptx_legacy_comments_end_to_end() {
    // Full conversion of a deck with one legacy comment: the Comments section
    // must list the author with date, the comment text and the slide source.
    let pptx = build_pptx_with_comment(false, Some("Overview"));
    let options = ConversionOptions {
        extract_comments: true,
        ..Default::default()
    };
    let result = PptxConverter.convert(&pptx, &options).unwrap();
    let md = &result.markdown;
    assert!(md.contains("# Comments"), "md: {}", md);
    let expected_lines = [
        "## 1",
        "- **author**: Julie Lee (2024-01-15T10:30:00.000)",
        "- **comment**: Add a diagram here.",
        "- **source**: Slide 1: Overview",
    ];
    for line in expected_lines {
        assert!(md.contains(line));
    }
}
#[test]
fn test_pptx_modern_comments_with_reply_end_to_end() {
    // Full conversion of a deck with a modern threaded comment and one reply.
    let pptx = build_pptx_with_comment(true, None);
    let options = ConversionOptions {
        extract_comments: true,
        ..Default::default()
    };
    let result = PptxConverter.convert(&pptx, &options).unwrap();
    let md = &result.markdown;
    assert!(md.contains("# Comments"), "md: {}", md);
    assert!(md.contains("- **comment**: Please clarify."));
    assert!(
        md.contains("- **comment**: (reply) Agreed."),
        "reply not marked, md: {}",
        md
    );
    // The parent comment has to come out ahead of its reply.
    let parent_at = md.find("Please clarify.").unwrap();
    let reply_at = md.find("(reply) Agreed.").unwrap();
    assert!(
        parent_at < reply_at,
        "parent must precede reply, md: {}",
        md
    );
    // Without a slide title the source line is just "Slide 1".
    assert!(md.contains("- **source**: Slide 1\n"));
}
#[test]
fn test_pptx_comments_absent_when_flag_off() {
    // With default options no Comments section may be emitted, even though
    // the deck contains a comment part.
    let pptx = build_pptx_with_comment(false, Some("Overview"));
    let defaults = ConversionOptions::default();
    let result = PptxConverter.convert(&pptx, &defaults).unwrap();
    let has_section = result.markdown.contains("# Comments");
    assert!(!has_section);
}
#[test]
fn test_pptx_comments_plain_text_stripped() {
    // The plain-text rendering keeps the comment content but drops the
    // Markdown heading and bold markers.
    let pptx = build_pptx_with_comment(false, Some("Overview"));
    let options = ConversionOptions {
        extract_comments: true,
        ..Default::default()
    };
    let result = PptxConverter.convert(&pptx, &options).unwrap();
    let plain = &result.plain_text;
    let must_have = [
        "Comments\n",
        "author: Julie Lee (2024-01-15T10:30:00.000)",
        "source: Slide 1: Overview",
    ];
    for fragment in must_have {
        assert!(plain.contains(fragment));
    }
    for markup in ["# Comments", "**"] {
        assert!(!plain.contains(markup));
    }
}
#[test]
fn test_pptx_dual_scheme_no_double_report() {
    // A slide that references BOTH a legacy and a modern comment part for the
    // same thread (kept side by side for back-compat) must report the comment
    // only once.
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    let opts = SimpleFileOptions::default();
    let mut zip = ZipWriter::new(Cursor::new(Vec::new()));
    // Start a new archive entry and write its whole payload in one go.
    let mut add = |path: &str, payload: &[u8]| {
        zip.start_file(path, opts).unwrap();
        zip.write_all(payload).unwrap();
    };
    add(
        "[Content_Types].xml",
        br#"<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/></Types>"#,
    );
    add(
        "ppt/presentation.xml",
        br#"<?xml version="1.0"?><p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"><p:sldIdLst><p:sldId id="256" r:id="rId1"/></p:sldIdLst></p:presentation>"#,
    );
    add(
        "ppt/_rels/presentation.xml.rels",
        br#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/></Relationships>"#,
    );
    add(
        "ppt/slides/slide1.xml",
        br#"<?xml version="1.0"?><p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p:cSld><p:spTree/></p:cSld></p:sld>"#,
    );
    // The slide relationships list a legacy AND a modern comments part.
    add(
        "ppt/slides/_rels/slide1.xml.rels",
        br#"<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rIdL" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments" Target="../comments/comment1.xml"/><Relationship Id="rIdM" Type="http://schemas.microsoft.com/office/2018/10/relationships/comments" Target="../comments/modernComment_x.xml"/></Relationships>"#,
    );
    add(
        "ppt/commentAuthors.xml",
        br#"<?xml version="1.0"?><p:cmAuthorLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cmAuthor id="0" name="Dana"/></p:cmAuthorLst>"#,
    );
    add(
        "ppt/comments/comment1.xml",
        br#"<?xml version="1.0"?><p:cmLst xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"><p:cm authorId="0" dt="d" idx="1"><p:text>Shared thread.</p:text></p:cm></p:cmLst>"#,
    );
    add(
        "ppt/authors.xml",
        br#"<?xml version="1.0"?><p188:authorLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main"><p188:author id="{A}" name="Dana" userId="u" providerId="AD"/></p188:authorLst>"#,
    );
    add(
        "ppt/comments/modernComment_x.xml",
        br#"<?xml version="1.0"?><p188:cmLst xmlns:p188="http://schemas.microsoft.com/office/powerpoint/2018/8/main" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"><p188:cm authorId="{A}" created="c"><p188:txBody><a:p><a:r><a:t>Shared thread.</a:t></a:r></a:p></p188:txBody></p188:cm></p188:cmLst>"#,
    );
    let pptx = zip.finish().unwrap().into_inner();

    let options = ConversionOptions {
        extract_comments: true,
        ..Default::default()
    };
    let result = PptxConverter.convert(&pptx, &options).unwrap();
    let md = &result.markdown;
    // The shared thread shows up exactly once, under a single numbered entry.
    let occurrences = md.matches("- **comment**: Shared thread.").count();
    assert_eq!(occurrences, 1, "comment double-reported, md: {}", md);
    assert!(md.contains("## 1"));
    assert!(!md.contains("## 2"));
}
}