Skip to main content

ppt_rs/oxml/
slide.rs

1//! Slide XML parsing and content extraction
2//!
3//! Parses slide XML to extract text, shapes, tables, and other content.
4
5use super::xmlchemy::{XmlElement, XmlParser};
6use crate::exc::PptxError;
7
/// A run of text inside a paragraph together with its character formatting.
#[derive(Debug, Clone)]
pub struct TextRun {
    /// Literal text carried by the run.
    pub text: String,
    /// Bold flag.
    pub bold: bool,
    /// Italic flag.
    pub italic: bool,
    /// Underline flag.
    pub underline: bool,
    /// Font size as the raw numeric value of the run's `sz` attribute; the
    /// parser applies no unit conversion. `None` when no size was read.
    pub font_size: Option<u32>,
    /// Colour string taken from an `srgbClr` element's `val` attribute, if any.
    pub color: Option<String>,
}

impl TextRun {
    /// Creates a run that owns a copy of `text` and carries no formatting:
    /// every flag is `false` and both optional properties are `None`.
    pub fn new(text: &str) -> Self {
        let (font_size, color) = (None, None);
        Self {
            text: String::from(text),
            bold: false,
            italic: false,
            underline: false,
            font_size,
            color,
        }
    }
}
31
32/// Parsed paragraph with text runs
33#[derive(Debug, Clone)]
34pub struct Paragraph {
35    pub runs: Vec<TextRun>,
36    pub level: u32,
37}
38
39impl Paragraph {
40    pub fn new() -> Self {
41        Paragraph {
42            runs: Vec::new(),
43            level: 0,
44        }
45    }
46
47    /// Get full text content
48    pub fn text(&self) -> String {
49        self.runs.iter().map(|r| r.text.as_str()).collect()
50    }
51}
52
53impl Default for Paragraph {
54    fn default() -> Self {
55        Self::new()
56    }
57}
58
59/// Parsed shape from slide
60#[derive(Debug, Clone)]
61pub struct ParsedShape {
62    pub name: String,
63    pub shape_type: Option<String>,
64    pub paragraphs: Vec<Paragraph>,
65    pub x: i64,
66    pub y: i64,
67    pub width: i64,
68    pub height: i64,
69    pub is_title: bool,
70    pub is_body: bool,
71}
72
73impl ParsedShape {
74    pub fn new(name: &str) -> Self {
75        ParsedShape {
76            name: name.to_string(),
77            shape_type: None,
78            paragraphs: Vec::new(),
79            x: 0,
80            y: 0,
81            width: 0,
82            height: 0,
83            is_title: false,
84            is_body: false,
85        }
86    }
87
88    /// Get all text from shape
89    pub fn text(&self) -> String {
90        self.paragraphs
91            .iter()
92            .map(|p| p.text())
93            .collect::<Vec<_>>()
94            .join("\n")
95    }
96}
97
/// A single cell of a parsed table.
#[derive(Debug, Clone)]
pub struct ParsedTableCell {
    /// Text content of the cell as extracted by the parser.
    pub text: String,
    /// Number of rows the cell spans. The parser reads the cell's `rowSpan`
    /// attribute and uses 1 when it is missing or not a valid number.
    pub row_span: u32,
    /// Number of columns the cell spans. The parser reads the cell's
    /// `gridSpan` attribute and uses 1 when it is missing or not a valid number.
    pub col_span: u32,
}
105
106/// Parsed table
107#[derive(Debug, Clone)]
108pub struct ParsedTable {
109    pub rows: Vec<Vec<ParsedTableCell>>,
110}
111
112impl ParsedTable {
113    pub fn new() -> Self {
114        ParsedTable { rows: Vec::new() }
115    }
116
117    pub fn row_count(&self) -> usize {
118        self.rows.len()
119    }
120
121    pub fn col_count(&self) -> usize {
122        self.rows.first().map(|r| r.len()).unwrap_or(0)
123    }
124}
125
126impl Default for ParsedTable {
127    fn default() -> Self {
128        Self::new()
129    }
130}
131
132/// Parsed slide content
133#[derive(Debug, Clone)]
134pub struct ParsedSlide {
135    pub shapes: Vec<ParsedShape>,
136    pub tables: Vec<ParsedTable>,
137    pub title: Option<String>,
138    pub body_text: Vec<String>,
139}
140
141impl ParsedSlide {
142    pub fn new() -> Self {
143        ParsedSlide {
144            shapes: Vec::new(),
145            tables: Vec::new(),
146            title: None,
147            body_text: Vec::new(),
148        }
149    }
150
151    /// Get all text from slide
152    pub fn all_text(&self) -> Vec<String> {
153        let mut texts = Vec::new();
154        if let Some(ref title) = self.title {
155            texts.push(title.clone());
156        }
157        texts.extend(self.body_text.clone());
158        for shape in &self.shapes {
159            let text = shape.text();
160            if !text.is_empty() {
161                texts.push(text);
162            }
163        }
164        texts
165    }
166}
167
168impl Default for ParsedSlide {
169    fn default() -> Self {
170        Self::new()
171    }
172}
173
/// Slide parser.
///
/// A field-less type whose associated functions turn slide XML into a
/// `ParsedSlide`; `SlideParser::parse` is the public entry point.
pub struct SlideParser;
176
177impl SlideParser {
178    /// Parse slide XML content
179    pub fn parse(xml: &str) -> Result<ParsedSlide, PptxError> {
180        let root = XmlParser::parse_str(xml)?;
181        let mut slide = ParsedSlide::new();
182
183        // Find shape tree (spTree)
184        if let Some(sp_tree) = root.find_descendant("spTree") {
185            // Parse shapes
186            for sp in sp_tree.find_all("sp") {
187                if let Some(mut shape) = Self::parse_shape(sp) {
188                    // Check if this is title or body
189                    if Self::is_title_shape(sp) {
190                        shape.is_title = true;
191                        slide.title = Some(shape.text());
192                    } else if Self::is_body_shape(sp) {
193                        shape.is_body = true;
194                        for para in &shape.paragraphs {
195                            let text = para.text();
196                            if !text.is_empty() {
197                                slide.body_text.push(text);
198                            }
199                        }
200                    }
201                    slide.shapes.push(shape);
202                }
203            }
204
205            // Parse graphic frames (tables, charts)
206            for gf in sp_tree.find_all("graphicFrame") {
207                if let Some(table) = Self::parse_table_from_graphic_frame(gf) {
208                    slide.tables.push(table);
209                }
210            }
211        }
212
213        Ok(slide)
214    }
215
216    fn parse_shape(sp: &XmlElement) -> Option<ParsedShape> {
217        // Get shape name from nvSpPr/cNvPr
218        let name = sp
219            .find_descendant("cNvPr")
220            .and_then(|e| e.attr("name"))
221            .unwrap_or("Shape");
222
223        let mut shape = ParsedShape::new(name);
224
225        // Get position and size from spPr/xfrm
226        if let Some(xfrm) = sp.find_descendant("xfrm") {
227            if let Some(off) = xfrm.find("off") {
228                shape.x = off.attr("x").and_then(|v| v.parse().ok()).unwrap_or(0);
229                shape.y = off.attr("y").and_then(|v| v.parse().ok()).unwrap_or(0);
230            }
231            if let Some(ext) = xfrm.find("ext") {
232                shape.width = ext.attr("cx").and_then(|v| v.parse().ok()).unwrap_or(0);
233                shape.height = ext.attr("cy").and_then(|v| v.parse().ok()).unwrap_or(0);
234            }
235        }
236
237        // Get shape type from prstGeom
238        if let Some(prst_geom) = sp.find_descendant("prstGeom") {
239            shape.shape_type = prst_geom.attr("prst").map(|s| s.to_string());
240        }
241
242        // Parse text body
243        if let Some(tx_body) = sp.find_descendant("txBody") {
244            shape.paragraphs = Self::parse_text_body(tx_body);
245        }
246
247        Some(shape)
248    }
249
250    fn parse_text_body(tx_body: &XmlElement) -> Vec<Paragraph> {
251        let mut paragraphs = Vec::new();
252
253        for p in tx_body.find_all("p") {
254            let mut para = Paragraph::new();
255
256            // Get paragraph level
257            if let Some(ppr) = p.find("pPr") {
258                para.level = ppr.attr("lvl").and_then(|v| v.parse().ok()).unwrap_or(0);
259            }
260
261            // Parse text runs
262            for r in p.find_all("r") {
263                let text = r.find("t").map(|t| t.text_content()).unwrap_or_default();
264                if text.is_empty() {
265                    continue;
266                }
267
268                let mut run = TextRun::new(&text);
269
270                // Parse run properties
271                if let Some(rpr) = r.find("rPr") {
272                    run.bold = rpr
273                        .attr("b")
274                        .map(|v| v == "1" || v == "true")
275                        .unwrap_or(false);
276                    run.italic = rpr
277                        .attr("i")
278                        .map(|v| v == "1" || v == "true")
279                        .unwrap_or(false);
280                    run.underline = rpr.attr("u").is_some();
281                    run.font_size = rpr.attr("sz").and_then(|v| v.parse().ok());
282
283                    // Get color from solidFill/srgbClr
284                    if let Some(solid_fill) = rpr.find_descendant("solidFill") {
285                        if let Some(srgb) = solid_fill.find("srgbClr") {
286                            run.color = srgb.attr("val").map(|s| s.to_string());
287                        }
288                    }
289                }
290
291                para.runs.push(run);
292            }
293
294            if !para.runs.is_empty() {
295                paragraphs.push(para);
296            }
297        }
298
299        paragraphs
300    }
301
302    fn is_title_shape(sp: &XmlElement) -> bool {
303        // Check placeholder type first
304        if let Some(nv_pr) = sp.find_descendant("nvPr") {
305            if let Some(ph) = nv_pr.find("ph") {
306                let ph_type = ph.attr("type").unwrap_or("");
307                if ph_type == "title" || ph_type == "ctrTitle" {
308                    return true;
309                }
310            }
311        }
312        // Also check shape name for textbox-based titles
313        if let Some(cnv_pr) = sp.find_descendant("cNvPr") {
314            if let Some(name) = cnv_pr.attr("name") {
315                let name_lower = name.to_lowercase();
316                if name_lower == "title" || name_lower.contains("title") {
317                    return true;
318                }
319            }
320        }
321        false
322    }
323
324    fn is_body_shape(sp: &XmlElement) -> bool {
325        // Check placeholder type first
326        if let Some(nv_pr) = sp.find_descendant("nvPr") {
327            if let Some(ph) = nv_pr.find("ph") {
328                let ph_type = ph.attr("type").unwrap_or("body");
329                if ph_type == "body" || ph_type.is_empty() {
330                    return true;
331                }
332            }
333        }
334        // Also check shape name for textbox-based content
335        if let Some(cnv_pr) = sp.find_descendant("cNvPr") {
336            if let Some(name) = cnv_pr.attr("name") {
337                let name_lower = name.to_lowercase();
338                if name_lower == "content" || name_lower.contains("content") {
339                    return true;
340                }
341            }
342        }
343        false
344    }
345
346    fn parse_table_from_graphic_frame(gf: &XmlElement) -> Option<ParsedTable> {
347        // Find table element (a:tbl)
348        let tbl = gf.find_descendant("tbl")?;
349        let mut table = ParsedTable::new();
350
351        for tr in tbl.find_all("tr") {
352            let mut row = Vec::new();
353            for tc in tr.find_all("tc") {
354                let text = tc
355                    .find_descendant("t")
356                    .map(|t| t.text_content())
357                    .unwrap_or_default();
358
359                let row_span = tc.attr("rowSpan").and_then(|v| v.parse().ok()).unwrap_or(1);
360                let col_span = tc
361                    .attr("gridSpan")
362                    .and_then(|v| v.parse().ok())
363                    .unwrap_or(1);
364
365                row.push(ParsedTableCell {
366                    text,
367                    row_span,
368                    col_span,
369                });
370            }
371            if !row.is_empty() {
372                table.rows.push(row);
373            }
374        }
375
376        if table.rows.is_empty() {
377            None
378        } else {
379            Some(table)
380        }
381    }
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    /// A title placeholder plus a body placeholder with two paragraphs:
    /// the title text and both bullets must be extracted.
    #[test]
    fn test_parse_simple_slide() {
        let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
               xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
            <p:cSld>
                <p:spTree>
                    <p:sp>
                        <p:nvSpPr>
                            <p:cNvPr id="2" name="Title"/>
                            <p:nvPr><p:ph type="title"/></p:nvPr>
                        </p:nvSpPr>
                        <p:txBody>
                            <a:p>
                                <a:r><a:t>Test Title</a:t></a:r>
                            </a:p>
                        </p:txBody>
                    </p:sp>
                    <p:sp>
                        <p:nvSpPr>
                            <p:cNvPr id="3" name="Content"/>
                            <p:nvPr><p:ph type="body"/></p:nvPr>
                        </p:nvSpPr>
                        <p:txBody>
                            <a:p>
                                <a:r><a:t>Bullet 1</a:t></a:r>
                            </a:p>
                            <a:p>
                                <a:r><a:t>Bullet 2</a:t></a:r>
                            </a:p>
                        </p:txBody>
                    </p:sp>
                </p:spTree>
            </p:cSld>
        </p:sld>"#;

        let parsed = SlideParser::parse(slide_xml).unwrap();
        assert_eq!(parsed.title.as_deref(), Some("Test Title"));
        assert_eq!(parsed.body_text, vec!["Bullet 1", "Bullet 2"]);
    }

    /// Run properties (`b`, `i`, `sz`) must end up on the parsed run.
    #[test]
    fn test_parse_formatted_text() {
        let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
               xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
            <p:cSld>
                <p:spTree>
                    <p:sp>
                        <p:nvSpPr>
                            <p:cNvPr id="2" name="Title"/>
                            <p:nvPr><p:ph type="title"/></p:nvPr>
                        </p:nvSpPr>
                        <p:txBody>
                            <a:p>
                                <a:r>
                                    <a:rPr b="1" i="1" sz="4400"/>
                                    <a:t>Bold Italic</a:t>
                                </a:r>
                            </a:p>
                        </p:txBody>
                    </p:sp>
                </p:spTree>
            </p:cSld>
        </p:sld>"#;

        let parsed = SlideParser::parse(slide_xml).unwrap();
        assert!(!parsed.shapes.is_empty());
        let first_shape = &parsed.shapes[0];
        assert!(!first_shape.paragraphs.is_empty());
        let first_run = &first_shape.paragraphs[0].runs[0];
        assert!(first_run.bold);
        assert!(first_run.italic);
        assert_eq!(first_run.font_size, Some(4400));
    }
}
465}