use std::io::{BufRead, BufReader, Cursor, Read, Seek};
use quick_xml::Reader;
use quick_xml::events::{BytesStart, Event};
use crate::models::{Extraction, OutputWarning};
use crate::parsers::{
decode_xml_reference, decode_xml_text, merge_warnings, name_eq, parent_dir,
parse_relationship_map, rels_path_for, resolve_relationship_target,
};
use crate::vfs::OoxmlPackage;
use crate::{OxdocError, Result};
/// Extracts the text of every slide in a presentation package, each slide followed by the
/// text of its notes parts.
///
/// The presentation part is located with `find_office_document_path` (the literal
/// `"ppt/presentation.xml"` is handed to that helper — presumably as the fallback location;
/// confirm against the helper). Slide relationship ids are read from that part and visited
/// in the order they were parsed; each id is looked up in the presentation's relationship
/// map and resolved relative to the presentation part's directory.
///
/// # Errors
///
/// Returns [`OxdocError::MissingPart`] carrying the relationship id when a slide id has no
/// entry in the relationship map. Errors from reading parts, parsing relationships, or
/// resolving targets are propagated unchanged.
///
/// Warnings gathered while parsing the presentation, the slides, and the notes are merged
/// onto the returned [`Extraction`].
pub(crate) fn extract_text<R: Read + Seek>(
    package: &mut OoxmlPackage<R>,
) -> Result<Extraction<String>> {
    let presentation_path =
        crate::parsers::find_office_document_path(package, "ppt/presentation.xml")?;
    let presentation_xml = package.read_to_string(&presentation_path)?;
    let slide_ids = parse_slide_relation_ids(&presentation_xml, &presentation_path)?;

    let presentation_rels_path = rels_path_for(&presentation_path);
    let presentation_rels_xml = package.read_to_string(&presentation_rels_path)?;
    let presentation_rels =
        parse_relationship_map(&presentation_rels_xml, &presentation_rels_path)?;

    let mut text = String::new();
    let mut warnings = slide_ids.warnings;
    for slide_id in slide_ids.value {
        let relationship = presentation_rels
            .get(&slide_id)
            .ok_or_else(|| OxdocError::MissingPart(slide_id.clone()))?;
        let slide_path = resolve_relationship_target(
            parent_dir(&presentation_path),
            relationship,
            &presentation_rels_path,
        )?;

        // Slide body first, then whatever notes hang off that slide.
        let slide = read_text_part(package, &slide_path)?;
        append_part_text(&mut text, &slide.value);
        warnings = merge_warnings(warnings, slide.warnings);

        let notes = read_notes_for_slide(package, &slide_path)?;
        append_part_text(&mut text, &notes.value);
        warnings = merge_warnings(warnings, notes.warnings);
    }
    Ok(Extraction::with_warnings(text, warnings))
}
fn parse_slide_relation_ids(xml: &str, path: &str) -> Result<Extraction<Vec<String>>> {
let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
reader.config_mut().trim_text(true);
let mut buf = Vec::new();
let mut slide_ids = Vec::new();
let mut warnings = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(element)) | Ok(Event::Empty(element))
if name_eq(element.name().as_ref(), b"sldId") =>
{
if let Some(relation_id) = relationship_id_value(&element) {
slide_ids.push(relation_id);
} else {
warnings.push(OutputWarning::new(
path,
"ignored presentation slide without relationship id",
));
}
}
Ok(Event::Eof) => break,
Err(source) => {
warnings.push(OutputWarning::malformed_xml(path, source));
break;
}
_ => {}
}
buf.clear();
}
Ok(Extraction::with_warnings(slide_ids, warnings))
}
/// Returns the decoded value of the first attribute whose key contains a `:` and whose
/// local name (per `crate::parsers::local_name`) is `id`, or `None` when there is none.
///
/// Attribute checks are disabled and attributes that fail to parse are skipped.
fn relationship_id_value(element: &BytesStart<'_>) -> Option<String> {
    let mut attributes = element.attributes();
    attributes.with_checks(false);

    for attribute in attributes.flatten() {
        let key = attribute.key.as_ref();
        // An unprefixed `id` is not accepted; the key must carry a prefix.
        let is_prefixed_id = key.contains(&b':') && crate::parsers::local_name(key) == b"id";
        if is_prefixed_id {
            return Some(decode_xml_text(attribute.value.as_ref()));
        }
    }
    None
}
/// Reads the text of every notes part related to the slide at `slide_path`.
///
/// The slide's relationship part is loaded; when that part is missing
/// ([`OxdocError::MissingPart`]) the slide simply has no notes and an empty extraction is
/// returned. Relationships whose type ends with `/notesSlide` are selected, sorted by their
/// relationship key, and read in that order.
///
/// # Errors
///
/// Propagates any other error from reading the relationship part, parsing it, resolving a
/// notes target, or reading a notes part.
fn read_notes_for_slide<R: Read + Seek>(
    package: &mut OoxmlPackage<R>,
    slide_path: &str,
) -> Result<Extraction<String>> {
    let slide_rels_path = rels_path_for(slide_path);
    let slide_rels_xml = match package.read_to_string(&slide_rels_path) {
        Ok(xml) => xml,
        // No relationship part means there is nothing to follow, which is not an error.
        Err(OxdocError::MissingPart(_)) => return Ok(Extraction::new(String::new())),
        Err(err) => return Err(err),
    };
    let slide_rels = parse_relationship_map(&slide_rels_xml, &slide_rels_path)?;

    let mut notes_relationships = slide_rels
        .iter()
        .filter(|(_, relationship)| {
            relationship
                .relationship_type
                .as_deref()
                .is_some_and(|kind| kind.ends_with("/notesSlide"))
        })
        .collect::<Vec<_>>();
    // Sort by key so the output order does not depend on the map's iteration order.
    notes_relationships.sort_by(|left, right| left.0.cmp(right.0));

    let mut text = String::new();
    let mut warnings = Vec::new();
    for (_, relationship) in notes_relationships {
        let notes_path =
            resolve_relationship_target(parent_dir(slide_path), relationship, &slide_rels_path)?;
        let notes = read_text_part(package, &notes_path)?;
        append_part_text(&mut text, &notes.value);
        warnings = merge_warnings(warnings, notes.warnings);
    }
    Ok(Extraction::with_warnings(text, warnings))
}
/// Opens the package entry at `path` and runs [`extract_text_part`] over it through a
/// buffered reader, returning whatever that extraction yields.
fn read_text_part<R: Read + Seek>(
    package: &mut OoxmlPackage<R>,
    path: &str,
) -> Result<Extraction<String>> {
    package.with_entry(path, |entry| extract_text_part(BufReader::new(entry), path))
}
fn extract_text_part<R: BufRead>(source: R, path: &str) -> Result<Extraction<String>> {
let mut reader = Reader::from_reader(source);
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut text = String::new();
let mut warnings = Vec::new();
let mut in_text_node = false;
let mut paragraph_has_content = false;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(element)) => {
if name_eq(element.name().as_ref(), b"t") {
in_text_node = true;
} else if name_eq(element.name().as_ref(), b"tab") {
text.push('\t');
paragraph_has_content = true;
} else if name_eq(element.name().as_ref(), b"br")
|| name_eq(element.name().as_ref(), b"cr")
{
push_newline(&mut text);
paragraph_has_content = true;
}
}
Ok(Event::Empty(element)) => {
if name_eq(element.name().as_ref(), b"tab") {
text.push('\t');
paragraph_has_content = true;
} else if name_eq(element.name().as_ref(), b"br")
|| name_eq(element.name().as_ref(), b"cr")
{
push_newline(&mut text);
paragraph_has_content = true;
}
}
Ok(Event::Text(value)) if in_text_node => {
paragraph_has_content |= push_text(&mut text, &decode_xml_text(value.as_ref()));
}
Ok(Event::CData(value)) if in_text_node => {
paragraph_has_content |= push_text(&mut text, &decode_xml_text(value.as_ref()));
}
Ok(Event::GeneralRef(value)) if in_text_node => {
paragraph_has_content |=
push_text(&mut text, &decode_xml_reference(value.as_ref()));
}
Ok(Event::End(element)) => {
if name_eq(element.name().as_ref(), b"t") {
in_text_node = false;
} else if name_eq(element.name().as_ref(), b"p") {
if paragraph_has_content {
push_newline(&mut text);
}
paragraph_has_content = false;
}
}
Ok(Event::Eof) => break,
Err(source) => {
warnings.push(OutputWarning::malformed_xml(path, source));
break;
}
_ => {}
}
buf.clear();
}
Ok(Extraction::with_warnings(text, warnings))
}
/// Appends `part` to `text`, inserting a single `'\n'` first when `text` is non-empty and
/// does not already end with one. An empty `part` leaves `text` untouched.
fn append_part_text(text: &mut String, part: &str) {
    if !part.is_empty() {
        let needs_separator = !(text.is_empty() || text.ends_with('\n'));
        if needs_separator {
            text.push('\n');
        }
        text.push_str(part);
    }
}
/// Appends `value` to `text` and reports whether anything was added, i.e. whether `value`
/// was non-empty.
fn push_text(text: &mut String, value: &str) -> bool {
    // Appending an empty slice is a no-op, so no separate guard is needed.
    text.push_str(value);
    !value.is_empty()
}
/// Appends `'\n'` to `text` unless it already ends with one; an empty `text` receives a
/// newline.
fn push_newline(text: &mut String) {
    // '\n' is a single ASCII byte, so inspecting the final byte is sufficient.
    if text.as_bytes().last() != Some(&b'\n') {
        text.push('\n');
    }
}
/// Fuzzing entry point: runs [`extract_text_part`] over arbitrary bytes under a fixed slide
/// path and discards the extracted value, surfacing only an error if one is returned.
#[doc(hidden)]
pub fn fuzz_extract_text(xml: &[u8]) -> Result<()> {
    extract_text_part(Cursor::new(xml), "ppt/slides/slide1.xml").map(|_| ())
}
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::{extract_text_part, parse_slide_relation_ids};

    /// Paragraph ends, tabs, line breaks, an entity reference and a CDATA section all land in
    /// the output. The ampersand is written as `&amp;` because a bare `&` in element text is
    /// not well-formed XML.
    #[test]
    fn extracts_drawing_text_with_breaks_tabs_and_cdata() {
        let xml = r#"
<p:sld xmlns:p="p" xmlns:a="a">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p><a:r><a:t>Title</a:t></a:r></a:p>
<a:p><a:r><a:t>A</a:t></a:r><a:tab/><a:r><a:t>B &amp; C</a:t></a:r><a:br/><a:r><a:t><![CDATA[D < E]]></a:t></a:r></a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>
"#;
        let result =
            extract_text_part(Cursor::new(xml.as_bytes()), "ppt/slides/slide1.xml").unwrap();
        assert_eq!(result.value, "Title\nA\tB & C\nD < E\n");
        assert!(result.warnings.is_empty());
    }

    /// A truncated document still yields the text read before the error, plus one warning.
    #[test]
    fn keeps_partial_text_after_malformed_xml() {
        let result = extract_text_part(
            Cursor::new(br#"<p:sld><a:p><a:r><a:t>partial</a:t></a:r><"#.as_slice()),
            "ppt/slides/slide1.xml",
        )
        .unwrap();
        assert_eq!(result.value, "partial");
        assert_eq!(result.warnings.len(), 1);
    }

    /// A paragraph whose only run is empty contributes no blank line.
    #[test]
    fn omits_empty_text_paragraphs() {
        let xml = r#"
<p:sld xmlns:p="p" xmlns:a="a">
<a:p><a:r><a:t></a:t></a:r></a:p>
<a:p><a:r><a:t>visible</a:t></a:r></a:p>
</p:sld>
"#;
        let result =
            extract_text_part(Cursor::new(xml.as_bytes()), "ppt/slides/slide1.xml").unwrap();
        assert_eq!(result.value, "visible\n");
    }

    /// Ids come back in document order, not sorted by id or relationship id.
    #[test]
    fn parses_slide_relationship_ids_in_presentation_order() {
        let xml = r#"
<p:presentation xmlns:p="p" xmlns:r="r">
<p:sldIdLst>
<p:sldId id="256" r:id="rId2"/>
<p:sldId id="257" r:id="rId1"/>
</p:sldIdLst>
</p:presentation>
"#;
        let result = parse_slide_relation_ids(xml, "ppt/presentation.xml").unwrap();
        assert_eq!(result.value, ["rId2", "rId1"]);
        assert!(result.warnings.is_empty());
    }
}