mod container;
mod content_builder;
mod elements;
mod image_handling;
mod metadata;
mod parser;
use ahash::AHashMap;
use bytes::Bytes;
use crate::error::Result;
use crate::types::builder::{self, DocumentStructureBuilder};
use crate::types::document_structure::TextAnnotation;
use crate::types::extraction::BoundingBox;
use crate::types::{ExtractedImage, PptxExtractionResult};
use container::{PptxContainer, SlideIterator};
use content_builder::ContentBuilder;
use elements::{ParserConfig, Run, SlideElement};
use image_handling::detect_image_format;
use metadata::{extract_all_notes, extract_metadata};
fn join_runs_with_spacing(runs: &[Run], extract: impl Fn(&Run) -> String) -> String {
let mut result = String::new();
for run in runs {
let text = extract(run);
if !result.is_empty() && !text.is_empty() {
let ends_ws = result.ends_with(|c: char| c.is_whitespace());
let starts_ws = text.starts_with(|c: char| c.is_whitespace());
if !ends_ws && !starts_ws {
result.push(' ');
}
}
result.push_str(&text);
}
result
}
pub fn extract_pptx_from_path(
path: &str,
extract_images: bool,
page_config: Option<&crate::core::config::PageConfig>,
plain: bool,
include_structure: bool,
) -> Result<PptxExtractionResult> {
let container = PptxContainer::open(path)?;
extract_pptx_from_container(container, extract_images, page_config, plain, include_structure)
}
pub fn extract_pptx_from_bytes(
data: &[u8],
extract_images: bool,
page_config: Option<&crate::core::config::PageConfig>,
plain: bool,
include_structure: bool,
) -> Result<PptxExtractionResult> {
let container = PptxContainer::from_bytes(data)?;
extract_pptx_from_container(container, extract_images, page_config, plain, include_structure)
}
fn extract_pptx_from_container<R: std::io::Read + std::io::Seek>(
mut container: PptxContainer<R>,
extract_images: bool,
page_config: Option<&crate::core::config::PageConfig>,
plain: bool,
include_structure: bool,
) -> Result<PptxExtractionResult> {
let config = ParserConfig {
extract_images,
plain,
..Default::default()
};
let metadata = extract_metadata(&mut container.archive);
let notes = extract_all_notes(&mut container)?;
let mut iterator = SlideIterator::new(container);
let slide_count = iterator.slide_count();
let estimated_capacity = slide_count.saturating_mul(1000).max(8192);
let mut content_builder = ContentBuilder::with_page_config(estimated_capacity, page_config.cloned(), plain);
let mut total_image_count = 0;
let mut total_table_count = 0;
let mut extracted_images = Vec::new();
let mut doc_builder = if include_structure {
Some(DocumentStructureBuilder::new().source_format("pptx"))
} else {
None
};
let mut image_index_counter: u32 = 0;
while let Some(slide) = iterator.next_slide()? {
let byte_start = if page_config.is_some() {
content_builder.start_slide(slide.slide_number)
} else {
0
};
let slide_content = slide.to_markdown(&config);
content_builder.add_text(&slide_content);
if let Some(slide_notes) = notes.get(&slide.slide_number) {
content_builder.add_notes(slide_notes);
}
if page_config.is_some() {
content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
}
if let Some(ref mut builder) = doc_builder {
build_slide_structure(&slide, builder, &mut image_index_counter);
}
if config.extract_images
&& let Ok(image_data) = iterator.get_slide_images(&slide)
{
let image_elements: Vec<_> = slide
.elements
.iter()
.filter_map(|e| {
if let SlideElement::Image(img_ref, pos) = e {
Some((img_ref, pos))
} else {
None
}
})
.collect();
for (img_idx_in_slide, (_, data)) in image_data.iter().enumerate() {
let format = detect_image_format(data);
let image_index = extracted_images.len();
let (width, height, description, bbox) =
if let Some((img_ref, pos)) = image_elements.get(img_idx_in_slide) {
let w = if pos.cx > 0 { Some((pos.cx / 9525) as u32) } else { None };
let h = if pos.cy > 0 { Some((pos.cy / 9525) as u32) } else { None };
(w, h, img_ref.description.clone(), position_to_bbox(pos))
} else {
(None, None, None, None)
};
extracted_images.push(ExtractedImage {
data: Bytes::from(data.clone()),
format,
image_index,
page_number: Some(slide.slide_number as usize),
width,
height,
colorspace: None,
bits_per_component: None,
is_mask: false,
description,
ocr_result: None,
bounding_box: bbox,
});
}
}
total_image_count += slide.image_count();
total_table_count += slide.table_count();
}
let (content, boundaries, mut page_contents) = content_builder.build();
if let Some(ref mut pcs) = page_contents {
for pc in pcs.iter_mut() {
if extracted_images
.iter()
.any(|img| img.page_number == Some(pc.page_number))
{
pc.is_blank = Some(false);
}
}
}
let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
total_count: slide_count,
unit_type: crate::types::PageUnitType::Slide,
boundaries: Some(bounds.clone()),
pages: page_contents.as_ref().map(|pcs| {
pcs.iter()
.map(|pc| crate::types::PageInfo {
number: pc.page_number,
title: None,
dimensions: None,
image_count: None,
table_count: None,
hidden: None,
is_blank: pc.is_blank,
})
.collect()
}),
});
let document = doc_builder
.map(|b| b.build())
.and_then(|d| if d.is_empty() { None } else { Some(d) });
Ok(PptxExtractionResult {
content,
metadata,
slide_count,
image_count: total_image_count,
table_count: total_table_count,
images: extracted_images,
page_structure,
page_contents,
document,
})
}
fn runs_to_text_and_annotations(runs: &[Run]) -> (String, Vec<TextAnnotation>) {
let mut text = String::new();
let mut annotations = Vec::new();
for run in runs {
let run_text = &run.text;
if run_text.is_empty() {
continue;
}
if !text.is_empty() {
let ends_ws = text.ends_with(|c: char| c.is_whitespace());
let starts_ws = run_text.starts_with(|c: char| c.is_whitespace());
if !ends_ws && !starts_ws {
text.push(' ');
}
}
let start = text.len() as u32;
text.push_str(run_text);
let end = text.len() as u32;
if run.formatting.bold {
annotations.push(builder::bold(start, end));
}
if run.formatting.italic {
annotations.push(builder::italic(start, end));
}
if run.formatting.underlined {
annotations.push(builder::underline(start, end));
}
if run.formatting.strikethrough {
annotations.push(builder::strikethrough(start, end));
}
if let Some(sz) = run.formatting.font_size {
let pts = sz as f64 / 100.0;
let value = if pts.fract() == 0.0 {
format!("{}pt", pts as u32)
} else {
format!("{:.1}pt", pts)
};
annotations.push(builder::font_size(start, end, &value));
}
}
(text, annotations)
}
fn position_to_bbox(pos: &elements::ElementPosition) -> Option<BoundingBox> {
if pos.x == 0 && pos.y == 0 && pos.cx == 0 && pos.cy == 0 {
return None;
}
const EMU_PER_PT: f64 = 914_400.0 / 72.0;
Some(BoundingBox {
x0: pos.x as f64 / EMU_PER_PT,
y0: pos.y as f64 / EMU_PER_PT,
x1: (pos.x + pos.cx) as f64 / EMU_PER_PT,
y1: (pos.y + pos.cy) as f64 / EMU_PER_PT,
})
}
fn build_slide_structure(
slide: &elements::Slide,
doc_builder: &mut DocumentStructureBuilder,
image_index_counter: &mut u32,
) {
let mut sorted_indices: Vec<usize> = (0..slide.elements.len()).collect();
sorted_indices.sort_by_key(|&i| {
let pos = slide.elements[i].position();
(pos.y, pos.x)
});
let slide_title = sorted_indices.iter().find_map(|&idx| {
if let SlideElement::Text(text, _) = &slide.elements[idx] {
let plain = join_runs_with_spacing(&text.runs, Run::extract);
let normalized = plain.replace('\n', " ");
if normalized.len() < 100 && !normalized.trim().is_empty() {
return Some(normalized.trim().to_string());
}
}
None
});
doc_builder.push_slide(slide.slide_number, slide_title.as_deref());
let mut first_title_seen = false;
for &idx in &sorted_indices {
let elem = &slide.elements[idx];
let pos = elem.position();
let bbox = position_to_bbox(&pos);
match elem {
SlideElement::Text(text, _) => {
let (plain_text, annotations) = runs_to_text_and_annotations(&text.runs);
let normalized = plain_text.replace('\n', " ");
let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
if is_title && !first_title_seen {
first_title_seen = true;
doc_builder.push_heading(1, normalized.trim(), None, bbox);
} else if !plain_text.trim().is_empty() {
let node_idx = doc_builder.push_paragraph(&plain_text, annotations, None, bbox);
if let Some(lang) = text.runs.iter().find_map(|r| {
let l = &r.formatting.lang;
if l.is_empty() { None } else { Some(l.clone()) }
}) {
let mut attrs = AHashMap::new();
attrs.insert("lang".to_string(), lang);
doc_builder.set_attributes(node_idx, attrs);
}
}
}
SlideElement::Table(table, _) => {
let cells: Vec<Vec<String>> = table
.rows
.iter()
.map(|row| {
row.cells
.iter()
.map(|cell| join_runs_with_spacing(&cell.runs, Run::extract))
.collect()
})
.collect();
if !cells.is_empty() {
doc_builder.push_table_from_cells(&cells, None);
}
}
SlideElement::List(list, _) => {
if !list.items.is_empty() {
let is_ordered = list.items.first().is_some_and(|item| item.is_ordered);
let list_node = doc_builder.push_list(is_ordered, None);
for item in &list.items {
let item_text = join_runs_with_spacing(&item.runs, Run::extract);
if !item_text.trim().is_empty() {
doc_builder.push_list_item(list_node, item_text.trim(), None);
}
}
}
}
SlideElement::Image(img_ref, _) => {
let desc = img_ref.description.as_deref().or({
if img_ref.target.is_empty() {
None
} else {
Some(img_ref.target.as_str())
}
});
doc_builder.push_image(desc, Some(*image_index_counter), None, bbox);
*image_index_counter += 1;
}
SlideElement::Unknown => {}
}
}
doc_builder.exit_container();
}
impl elements::Slide {
fn from_xml(slide_number: u32, xml_data: &[u8], rels_data: Option<&[u8]>) -> Result<Self> {
let elements = parser::parse_slide_xml(xml_data)?;
let images = if let Some(rels) = rels_data {
parser::parse_slide_rels(rels)?
} else {
Vec::new()
};
Ok(Self {
slide_number,
elements,
images,
})
}
fn to_markdown(&self, config: &ParserConfig) -> String {
let mut builder = ContentBuilder::new(config.plain);
if config.include_slide_comment {
builder.add_slide_header(self.slide_number);
}
let mut element_indices: Vec<usize> = (0..self.elements.len()).collect();
element_indices.sort_by_key(|&i| {
let pos = self.elements[i].position();
(pos.y, pos.x)
});
for &idx in &element_indices {
match &self.elements[idx] {
SlideElement::Text(text, _) => {
let text_content: String = if config.plain {
join_runs_with_spacing(&text.runs, Run::extract)
} else {
join_runs_with_spacing(&text.runs, Run::render_as_md)
};
let normalized = text_content.replace('\n', " ");
let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
if is_title {
builder.add_title(normalized.trim());
} else {
builder.add_text(&text_content);
}
}
SlideElement::Table(table, _) => {
let table_rows: Vec<Vec<String>> = table
.rows
.iter()
.map(|row| {
row.cells
.iter()
.map(|cell| join_runs_with_spacing(&cell.runs, Run::extract))
.collect()
})
.collect();
builder.add_table(&table_rows);
}
SlideElement::List(list, _) => {
for item in &list.items {
let item_text = join_runs_with_spacing(&item.runs, Run::extract);
builder.add_list_item(item.level, item.is_ordered, &item_text);
}
}
SlideElement::Image(img_ref, _) => {
builder.add_image(&img_ref.id, self.slide_number);
}
SlideElement::Unknown => {}
}
}
builder.build().0
}
fn image_count(&self) -> usize {
self.elements
.iter()
.filter(|e| matches!(e, SlideElement::Image(_, _)))
.count()
}
fn table_count(&self) -> usize {
self.elements
.iter()
.filter(|e| matches!(e, SlideElement::Table(_, _)))
.count()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn create_test_pptx_bytes(slides: Vec<&str>) -> Vec<u8> {
use std::io::Write;
use zip::write::{SimpleFileOptions, ZipWriter};
let mut buffer = Vec::new();
{
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
let options = SimpleFileOptions::default();
zip.start_file("[Content_Types].xml", options).unwrap();
zip.write_all(
br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
</Types>"#,
)
.unwrap();
zip.start_file("ppt/presentation.xml", options).unwrap();
zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
zip.start_file("_rels/.rels", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#).unwrap();
let mut rels_xml = String::from(
r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
);
for (i, _) in slides.iter().enumerate() {
use std::fmt::Write;
let _ = write!(
rels_xml,
r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
i + 1,
i + 1
);
}
rels_xml.push_str("</Relationships>");
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
zip.write_all(rels_xml.as_bytes()).unwrap();
for (i, text) in slides.iter().enumerate() {
let slide_xml = format!(
r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>{}</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#,
text
);
zip.start_file(format!("ppt/slides/slide{}.xml", i + 1), options)
.unwrap();
zip.write_all(slide_xml.as_bytes()).unwrap();
}
zip.start_file("docProps/core.xml", options).unwrap();
zip.write_all(
br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/">
<dc:title>Test Presentation</dc:title>
<dc:creator>Test Author</dc:creator>
<dc:description>Test Description</dc:description>
<dc:subject>Test Subject</dc:subject>
</cp:coreProperties>"#,
)
.unwrap();
let app_xml = format!(
r#"<?xml version="1.0" encoding="UTF-8"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">
<Slides>{}</Slides>
<Application>Microsoft Office PowerPoint</Application>
</Properties>"#,
slides.len()
);
zip.start_file("docProps/app.xml", options).unwrap();
zip.write_all(app_xml.as_bytes()).unwrap();
let _ = zip.finish().unwrap();
}
buffer
}
#[test]
fn test_extract_pptx_from_bytes_single_slide() {
let pptx_bytes = create_test_pptx_bytes(vec!["Hello World"]);
let result = extract_pptx_from_bytes(&pptx_bytes, false, None, false, false).unwrap();
assert_eq!(result.slide_count, 1);
assert!(
result.content.contains("Hello World"),
"Content was: {}",
result.content
);
assert_eq!(result.image_count, 0);
assert_eq!(result.table_count, 0);
}
#[test]
fn test_extract_pptx_from_bytes_multiple_slides() {
let pptx_bytes = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
let result = extract_pptx_from_bytes(&pptx_bytes, false, None, false, false).unwrap();
assert_eq!(result.slide_count, 3);
assert!(result.content.contains("Slide 1"));
assert!(result.content.contains("Slide 2"));
assert!(result.content.contains("Slide 3"));
}
#[test]
fn test_extract_pptx_metadata() {
let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
let result = extract_pptx_from_bytes(&pptx_bytes, false, None, false, false).unwrap();
assert_eq!(result.metadata.slide_count, 1);
}
#[test]
fn test_extract_pptx_empty_slides() {
let pptx_bytes = create_test_pptx_bytes(vec!["", "", ""]);
let result = extract_pptx_from_bytes(&pptx_bytes, false, None, false, false).unwrap();
assert_eq!(result.slide_count, 3);
}
#[test]
fn test_extract_pptx_from_bytes_invalid_data() {
use crate::error::KreuzbergError;
let invalid_bytes = b"not a valid pptx file";
let result = extract_pptx_from_bytes(invalid_bytes, false, None, false, false);
assert!(result.is_err());
if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
assert!(msg.contains("Failed to read PPTX archive") || msg.contains("Failed to write temp PPTX file"));
} else {
panic!("Expected ParsingError");
}
}
#[test]
fn test_extract_pptx_from_bytes_empty_data() {
let empty_bytes: &[u8] = &[];
let result = extract_pptx_from_bytes(empty_bytes, false, None, false, false);
assert!(result.is_err());
}
#[test]
fn test_detect_image_format_jpeg() {
let jpeg_header = vec![0xFF, 0xD8, 0xFF, 0xE0];
assert_eq!(detect_image_format(&jpeg_header), "jpeg");
}
#[test]
fn test_detect_image_format_png() {
let png_header = vec![0x89, 0x50, 0x4E, 0x47];
assert_eq!(detect_image_format(&png_header), "png");
}
#[test]
fn test_detect_image_format_gif() {
let gif_header = b"GIF89a";
assert_eq!(detect_image_format(gif_header), "gif");
}
#[test]
fn test_detect_image_format_bmp() {
let bmp_header = b"BM";
assert_eq!(detect_image_format(bmp_header), "bmp");
}
#[test]
fn test_detect_image_format_svg() {
let svg_header = b"<svg xmlns=\"http://www.w3.org/2000/svg\">";
assert_eq!(detect_image_format(svg_header), "svg");
}
#[test]
fn test_detect_image_format_tiff_little_endian() {
let tiff_header = vec![0x49, 0x49, 0x2A, 0x00];
assert_eq!(detect_image_format(&tiff_header), "tiff");
}
#[test]
fn test_detect_image_format_tiff_big_endian() {
let tiff_header = vec![0x4D, 0x4D, 0x00, 0x2A];
assert_eq!(detect_image_format(&tiff_header), "tiff");
}
#[test]
fn test_detect_image_format_unknown() {
let unknown_data = b"unknown format";
assert_eq!(detect_image_format(unknown_data), "unknown");
}
#[test]
fn test_get_slide_rels_path() {
assert_eq!(
image_handling::get_slide_rels_path("ppt/slides/slide1.xml"),
"ppt/slides/_rels/slide1.xml.rels"
);
assert_eq!(
image_handling::get_slide_rels_path("ppt/slides/slide10.xml"),
"ppt/slides/_rels/slide10.xml.rels"
);
}
#[test]
fn test_get_full_image_path_relative() {
assert_eq!(
image_handling::get_full_image_path("ppt/slides/slide1.xml", "../media/image1.png"),
"ppt/media/image1.png"
);
}
#[test]
fn test_get_full_image_path_direct() {
assert_eq!(
image_handling::get_full_image_path("ppt/slides/slide1.xml", "image1.png"),
"ppt/slides/image1.png"
);
}
}