Skip to main content

ppt_rs/oxml/
slide.rs

//! Slide XML parsing and content extraction
//!
//! Parses slide XML to extract text, shapes, tables, and other content.
4
5use super::xmlchemy::{XmlElement, XmlParser};
6use crate::exc::PptxError;
7
/// Parsed text run with formatting
///
/// The formatting fields mirror what the slide parser reads from the run's
/// `rPr` element; they keep their "unformatted" defaults when the run has no
/// such element.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextRun {
    /// Text content of the run.
    pub text: String,
    /// Whether the run is bold.
    pub bold: bool,
    /// Whether the run is italic.
    pub italic: bool,
    /// Whether the run is underlined.
    pub underline: bool,
    /// Font size exactly as written in the run's `sz` attribute, if present.
    pub font_size: Option<u32>,
    /// Color taken from an `srgbClr` element's `val` attribute, if present.
    pub color: Option<String>,
}

impl TextRun {
    /// Creates an unformatted run holding a copy of `text`.
    ///
    /// All flags start as `false`, and `font_size` / `color` start as `None`.
    pub fn new(text: &str) -> Self {
        Self {
            text: text.to_owned(),
            bold: false,
            italic: false,
            underline: false,
            font_size: None,
            color: None,
        }
    }
}
31
32/// Parsed paragraph with text runs
33#[derive(Debug, Clone)]
34pub struct Paragraph {
35    pub runs: Vec<TextRun>,
36    pub level: u32,
37}
38
39impl Paragraph {
40    pub fn new() -> Self {
41        Paragraph {
42            runs: Vec::new(),
43            level: 0,
44        }
45    }
46
47    /// Get full text content
48    pub fn text(&self) -> String {
49        self.runs.iter().map(|r| r.text.as_str()).collect()
50    }
51}
52
53impl Default for Paragraph {
54    fn default() -> Self {
55        Self::new()
56    }
57}
58
59/// Parsed shape from slide
60#[derive(Debug, Clone)]
61pub struct ParsedShape {
62    pub name: String,
63    pub shape_type: Option<String>,
64    pub paragraphs: Vec<Paragraph>,
65    pub x: i64,
66    pub y: i64,
67    pub width: i64,
68    pub height: i64,
69    pub is_title: bool,
70    pub is_body: bool,
71}
72
73impl ParsedShape {
74    pub fn new(name: &str) -> Self {
75        ParsedShape {
76            name: name.to_string(),
77            shape_type: None,
78            paragraphs: Vec::new(),
79            x: 0,
80            y: 0,
81            width: 0,
82            height: 0,
83            is_title: false,
84            is_body: false,
85        }
86    }
87
88    /// Get all text from shape
89    pub fn text(&self) -> String {
90        self.paragraphs.iter()
91            .map(|p| p.text())
92            .collect::<Vec<_>>()
93            .join("\n")
94    }
95}
96
/// Parsed table cell
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedTableCell {
    /// Text content of the cell.
    pub text: String,
    /// Number of rows the cell spans.
    pub row_span: u32,
    /// Number of grid columns the cell spans.
    pub col_span: u32,
}
104
105/// Parsed table
106#[derive(Debug, Clone)]
107pub struct ParsedTable {
108    pub rows: Vec<Vec<ParsedTableCell>>,
109}
110
111impl ParsedTable {
112    pub fn new() -> Self {
113        ParsedTable { rows: Vec::new() }
114    }
115
116    pub fn row_count(&self) -> usize {
117        self.rows.len()
118    }
119
120    pub fn col_count(&self) -> usize {
121        self.rows.first().map(|r| r.len()).unwrap_or(0)
122    }
123}
124
125impl Default for ParsedTable {
126    fn default() -> Self {
127        Self::new()
128    }
129}
130
131/// Parsed slide content
132#[derive(Debug, Clone)]
133pub struct ParsedSlide {
134    pub shapes: Vec<ParsedShape>,
135    pub tables: Vec<ParsedTable>,
136    pub title: Option<String>,
137    pub body_text: Vec<String>,
138}
139
140impl ParsedSlide {
141    pub fn new() -> Self {
142        ParsedSlide {
143            shapes: Vec::new(),
144            tables: Vec::new(),
145            title: None,
146            body_text: Vec::new(),
147        }
148    }
149
150    /// Get all text from slide
151    pub fn all_text(&self) -> Vec<String> {
152        let mut texts = Vec::new();
153        if let Some(ref title) = self.title {
154            texts.push(title.clone());
155        }
156        texts.extend(self.body_text.clone());
157        for shape in &self.shapes {
158            let text = shape.text();
159            if !text.is_empty() {
160                texts.push(text);
161            }
162        }
163        texts
164    }
165}
166
167impl Default for ParsedSlide {
168    fn default() -> Self {
169        Self::new()
170    }
171}
172
173/// Slide parser
174pub struct SlideParser;
175
176impl SlideParser {
177    /// Parse slide XML content
178    pub fn parse(xml: &str) -> Result<ParsedSlide, PptxError> {
179        let root = XmlParser::parse_str(xml)?;
180        let mut slide = ParsedSlide::new();
181
182        // Find shape tree (spTree)
183        if let Some(sp_tree) = root.find_descendant("spTree") {
184            // Parse shapes
185            for sp in sp_tree.find_all("sp") {
186                if let Some(mut shape) = Self::parse_shape(sp) {
187                    // Check if this is title or body
188                    if Self::is_title_shape(sp) {
189                        shape.is_title = true;
190                        slide.title = Some(shape.text());
191                    } else if Self::is_body_shape(sp) {
192                        shape.is_body = true;
193                        for para in &shape.paragraphs {
194                            let text = para.text();
195                            if !text.is_empty() {
196                                slide.body_text.push(text);
197                            }
198                        }
199                    }
200                    slide.shapes.push(shape);
201                }
202            }
203
204            // Parse graphic frames (tables, charts)
205            for gf in sp_tree.find_all("graphicFrame") {
206                if let Some(table) = Self::parse_table_from_graphic_frame(gf) {
207                    slide.tables.push(table);
208                }
209            }
210        }
211
212        Ok(slide)
213    }
214
215    fn parse_shape(sp: &XmlElement) -> Option<ParsedShape> {
216        // Get shape name from nvSpPr/cNvPr
217        let name = sp.find_descendant("cNvPr")
218            .and_then(|e| e.attr("name"))
219            .unwrap_or("Shape");
220
221        let mut shape = ParsedShape::new(name);
222
223        // Get position and size from spPr/xfrm
224        if let Some(xfrm) = sp.find_descendant("xfrm") {
225            if let Some(off) = xfrm.find("off") {
226                shape.x = off.attr("x").and_then(|v| v.parse().ok()).unwrap_or(0);
227                shape.y = off.attr("y").and_then(|v| v.parse().ok()).unwrap_or(0);
228            }
229            if let Some(ext) = xfrm.find("ext") {
230                shape.width = ext.attr("cx").and_then(|v| v.parse().ok()).unwrap_or(0);
231                shape.height = ext.attr("cy").and_then(|v| v.parse().ok()).unwrap_or(0);
232            }
233        }
234
235        // Get shape type from prstGeom
236        if let Some(prst_geom) = sp.find_descendant("prstGeom") {
237            shape.shape_type = prst_geom.attr("prst").map(|s| s.to_string());
238        }
239
240        // Parse text body
241        if let Some(tx_body) = sp.find_descendant("txBody") {
242            shape.paragraphs = Self::parse_text_body(tx_body);
243        }
244
245        Some(shape)
246    }
247
248    fn parse_text_body(tx_body: &XmlElement) -> Vec<Paragraph> {
249        let mut paragraphs = Vec::new();
250
251        for p in tx_body.find_all("p") {
252            let mut para = Paragraph::new();
253
254            // Get paragraph level
255            if let Some(ppr) = p.find("pPr") {
256                para.level = ppr.attr("lvl").and_then(|v| v.parse().ok()).unwrap_or(0);
257            }
258
259            // Parse text runs
260            for r in p.find_all("r") {
261                let text = r.find("t").map(|t| t.text_content()).unwrap_or_default();
262                if text.is_empty() {
263                    continue;
264                }
265
266                let mut run = TextRun::new(&text);
267
268                // Parse run properties
269                if let Some(rpr) = r.find("rPr") {
270                    run.bold = rpr.attr("b").map(|v| v == "1" || v == "true").unwrap_or(false);
271                    run.italic = rpr.attr("i").map(|v| v == "1" || v == "true").unwrap_or(false);
272                    run.underline = rpr.attr("u").is_some();
273                    run.font_size = rpr.attr("sz").and_then(|v| v.parse().ok());
274
275                    // Get color from solidFill/srgbClr
276                    if let Some(solid_fill) = rpr.find_descendant("solidFill") {
277                        if let Some(srgb) = solid_fill.find("srgbClr") {
278                            run.color = srgb.attr("val").map(|s| s.to_string());
279                        }
280                    }
281                }
282
283                para.runs.push(run);
284            }
285
286            if !para.runs.is_empty() {
287                paragraphs.push(para);
288            }
289        }
290
291        paragraphs
292    }
293
294    fn is_title_shape(sp: &XmlElement) -> bool {
295        // Check placeholder type first
296        if let Some(nv_pr) = sp.find_descendant("nvPr") {
297            if let Some(ph) = nv_pr.find("ph") {
298                let ph_type = ph.attr("type").unwrap_or("");
299                if ph_type == "title" || ph_type == "ctrTitle" {
300                    return true;
301                }
302            }
303        }
304        // Also check shape name for textbox-based titles
305        if let Some(cnv_pr) = sp.find_descendant("cNvPr") {
306            if let Some(name) = cnv_pr.attr("name") {
307                let name_lower = name.to_lowercase();
308                if name_lower == "title" || name_lower.contains("title") {
309                    return true;
310                }
311            }
312        }
313        false
314    }
315
316    fn is_body_shape(sp: &XmlElement) -> bool {
317        // Check placeholder type first
318        if let Some(nv_pr) = sp.find_descendant("nvPr") {
319            if let Some(ph) = nv_pr.find("ph") {
320                let ph_type = ph.attr("type").unwrap_or("body");
321                if ph_type == "body" || ph_type.is_empty() {
322                    return true;
323                }
324            }
325        }
326        // Also check shape name for textbox-based content
327        if let Some(cnv_pr) = sp.find_descendant("cNvPr") {
328            if let Some(name) = cnv_pr.attr("name") {
329                let name_lower = name.to_lowercase();
330                if name_lower == "content" || name_lower.contains("content") {
331                    return true;
332                }
333            }
334        }
335        false
336    }
337
338    fn parse_table_from_graphic_frame(gf: &XmlElement) -> Option<ParsedTable> {
339        // Find table element (a:tbl)
340        let tbl = gf.find_descendant("tbl")?;
341        let mut table = ParsedTable::new();
342
343        for tr in tbl.find_all("tr") {
344            let mut row = Vec::new();
345            for tc in tr.find_all("tc") {
346                let text = tc.find_descendant("t")
347                    .map(|t| t.text_content())
348                    .unwrap_or_default();
349                
350                let row_span = tc.attr("rowSpan").and_then(|v| v.parse().ok()).unwrap_or(1);
351                let col_span = tc.attr("gridSpan").and_then(|v| v.parse().ok()).unwrap_or(1);
352
353                row.push(ParsedTableCell {
354                    text,
355                    row_span,
356                    col_span,
357                });
358            }
359            if !row.is_empty() {
360                table.rows.push(row);
361            }
362        }
363
364        if table.rows.is_empty() {
365            None
366        } else {
367            Some(table)
368        }
369    }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// A title placeholder and a two-paragraph body placeholder are split
    /// into `title` and `body_text`.
    #[test]
    fn test_parse_simple_slide() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
               xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
            <p:cSld>
                <p:spTree>
                    <p:sp>
                        <p:nvSpPr>
                            <p:cNvPr id="2" name="Title"/>
                            <p:nvPr><p:ph type="title"/></p:nvPr>
                        </p:nvSpPr>
                        <p:txBody>
                            <a:p>
                                <a:r><a:t>Test Title</a:t></a:r>
                            </a:p>
                        </p:txBody>
                    </p:sp>
                    <p:sp>
                        <p:nvSpPr>
                            <p:cNvPr id="3" name="Content"/>
                            <p:nvPr><p:ph type="body"/></p:nvPr>
                        </p:nvSpPr>
                        <p:txBody>
                            <a:p>
                                <a:r><a:t>Bullet 1</a:t></a:r>
                            </a:p>
                            <a:p>
                                <a:r><a:t>Bullet 2</a:t></a:r>
                            </a:p>
                        </p:txBody>
                    </p:sp>
                </p:spTree>
            </p:cSld>
        </p:sld>"#;

        let slide = SlideParser::parse(xml).unwrap();
        assert_eq!(slide.title, Some("Test Title".to_string()));
        assert_eq!(slide.body_text.len(), 2);
        assert_eq!(slide.body_text[0], "Bullet 1");
        assert_eq!(slide.body_text[1], "Bullet 2");
    }

    /// Bold, italic and font size are read from the run's `rPr` attributes.
    #[test]
    fn test_parse_formatted_text() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
        <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" 
               xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
            <p:cSld>
                <p:spTree>
                    <p:sp>
                        <p:nvSpPr>
                            <p:cNvPr id="2" name="Title"/>
                            <p:nvPr><p:ph type="title"/></p:nvPr>
                        </p:nvSpPr>
                        <p:txBody>
                            <a:p>
                                <a:r>
                                    <a:rPr b="1" i="1" sz="4400"/>
                                    <a:t>Bold Italic</a:t>
                                </a:r>
                            </a:p>
                        </p:txBody>
                    </p:sp>
                </p:spTree>
            </p:cSld>
        </p:sld>"#;

        let slide = SlideParser::parse(xml).unwrap();
        assert!(!slide.shapes.is_empty());
        let shape = &slide.shapes[0];
        assert!(!shape.paragraphs.is_empty());
        let run = &shape.paragraphs[0].runs[0];
        assert!(run.bold);
        assert!(run.italic);
        assert_eq!(run.font_size, Some(4400));
    }
}
453}