pptx_to_md/
slide.rs

1use crate::parser_config::ImageHandlingMode;
2use crate::{ElementPosition, ImageReference, ParserConfig, SlideElement};
3use base64::{engine::general_purpose, Engine as _};
4use image::ImageOutputFormat;
5use std::collections::HashMap;
6use std::fs;
7use std::io::Cursor;
8use std::path::{Path, PathBuf};
9
10/// Encapsulates images for manual extraction of images from slides
11#[derive(Debug)]
12pub struct ManualImage {
13    pub base64_content: String,
14    pub img_ref: ImageReference,
15}
16
17impl ManualImage {
18    pub fn new(base64_content: String, img_ref: ImageReference) -> ManualImage {
19        Self {
20            base64_content,
21            img_ref,
22        }
23    }
24}
25
26/// Represents a single slide extracted from a PowerPoint (pptx) file.
27///
28/// Contains structured slide data including slide number, parsed content elements
29/// (text, tables, images, lists), and associated image references.
30///
31/// A `Slide` can be converted into other formats, such as Markdown, or its
32/// contained images can be extracted in base64 representation.
33///
34/// Typically, you retrieve instances of `Slide` through [`PptxContainer::parse()`].
35#[derive(Debug)]
36pub struct Slide {
37    pub rel_path: String,
38    pub slide_number: u32,
39    pub elements: Vec<SlideElement>,
40    pub images: Vec<ImageReference>,
41    pub image_data: HashMap<String, Vec<u8>>,
42    pub config: ParserConfig
43}
44
45impl Slide {
46    pub fn new(
47        rel_path: String,
48        slide_number: u32,
49        elements: Vec<SlideElement>,
50        images: Vec<ImageReference>,
51        image_data: HashMap<String, Vec<u8>>,
52        config: ParserConfig,
53    ) -> Self {
54        Self {
55            rel_path,
56            slide_number,
57            elements,
58            images,
59            image_data,
60            config,
61        }
62    }
63
64    /// Converts slide contents into a Markdown formatted string.
65    ///
66    /// Translates internal slide elements (text, tables, lists, images) to valid
67    /// and readable Markdown. Embedded images will be encoded as base64 inline images.
68    ///
69    /// # Returns
70    ///
71    /// Returns an `Option<String>`:
72    /// - `Some(String)`: Markdown representation of slide if conversion succeeds.
73    /// - `None`: If a conversion error occurs during image encoding.
74    pub fn convert_to_md(&self) -> Option<String> {
75        let mut slide_txt = String::new();
76        if self.config.include_slide_comment { slide_txt.push_str(format!("<!-- Slide {} -->\n\n", self.slide_number).as_str()); }
77        let mut image_count = 0;
78
79        let mut sorted_elements = self.elements.clone();
80        sorted_elements.sort_by_key(|element| {
81            let ElementPosition { y, x } = element.position();
82            (y, x)
83        });
84        
85        for element in sorted_elements {
86            match element {
87                SlideElement::Text(text, _pos) => {
88                    for run in &text.runs {
89                        slide_txt.push_str(&run.render_as_md());
90                    }
91                    slide_txt.push('\n');
92                },
93                SlideElement::Table(table, _pos) => {
94                    let mut is_header = true;
95                    for row in &table.rows {
96                        let mut row_texts = Vec::new();
97                        for cell in &row.cells {
98                            let mut cell_text = String::new();
99                            for run in &cell.runs {
100                                cell_text.push_str(&run.extract());
101                            }
102                            row_texts.push(cell_text);
103                        }
104
105                        let row_line = format!("| {} |", row_texts.join(" | "));
106                        slide_txt.push_str(&row_line);
107                        slide_txt.push('\n');
108
109                        if is_header {
110                            let separator_line = format!("|{}|", row_texts.iter().map(|_| " --- ").collect::<Vec<_>>().join("|"));
111                            slide_txt.push_str(&separator_line);
112                            slide_txt.push('\n');
113                            is_header = false;
114                        }
115                    }
116                    slide_txt.push('\n');
117                },
118                SlideElement::Image(image_ref, _pos) => {
119                    match self.config.image_handling_mode {
120                        ImageHandlingMode::InMarkdown => {
121                            if let Some(image_data) = self.image_data.get(&image_ref.id) {
122                                let image_data = self.config.compress_images
123                                    .then(|| self.compress_image(image_data))
124                                    .unwrap_or_else(|| Option::from(image_data.clone()));
125
126                                let base64_string = general_purpose::STANDARD.encode(image_data?);
127                                let image_name = &image_ref.target.split('/').last()?;
128                                let file_ext = &image_name.split('.').last()?;
129
130                                slide_txt.push_str(format!("![{}](data:image/{};base64,{})", image_name, file_ext, base64_string).as_str());
131                            }
132                        }
133                        ImageHandlingMode::Save => {
134                            if let Some(image_data) = self.image_data.get(&image_ref.id) {
135                                let image_data = self.config.compress_images
136                                    .then(|| self.compress_image(image_data))
137                                    .unwrap_or_else(|| Option::from(image_data.clone()));
138
139                                let ext = self.config.compress_images
140                                    .then(|| "jpg".to_string())
141                                    .unwrap_or_else(|| self.get_image_extension(&image_ref.target.clone()));
142
143                                let output_dir = self.config
144                                    .image_output_path
145                                    .clone()
146                                    .unwrap_or_else(|| PathBuf::from("."));
147
148                                let _ = fs::create_dir_all(&output_dir);
149
150                                let mut image_path = output_dir.clone();
151                                let file_name = format!("slide{}_image{}_{}.{}", self.slide_number, image_count + 1, &image_ref.id, ext);
152                                image_path.push(&file_name);
153
154                                let _ = fs::write(&image_path, image_data?);
155
156                                let abs_file_url = self.path_to_file_url(&image_path);
157                                let html_link = format!(r#"<a href={:?}>{file_name}</a>"#, abs_file_url?);
158                                image_count += 1;
159                                slide_txt.push_str(&html_link);
160                                slide_txt.push('\n');
161                            }
162                        }
163                        ImageHandlingMode::Manually => { slide_txt.push('\n'); continue; }
164                    }
165                    slide_txt.push('\n');
166                }
167                SlideElement::List(list_element, _pos) => {
168                    let mut counters: Vec<usize> = Vec::new();
169                    let mut previous_level = 0;
170
171                    for item in &list_element.items {
172                        let mut item_text = String::new();
173                        for run in &item.runs {
174                            item_text.push_str(&run.extract());
175                        }
176
177                        let level = item.level as usize;
178                        if level >= counters.len() {
179                            counters.resize(level + 1, 0);
180                        }
181
182                        match level.cmp(&previous_level) {
183                            std::cmp::Ordering::Greater => counters[level] = 0,
184                            std::cmp::Ordering::Less => counters.truncate(level + 1),
185                            std::cmp::Ordering::Equal => {}
186                        }
187
188                        counters[level] += 1;
189                        previous_level = level;
190
191                        let indent = "\t".repeat(level);
192                        let marker = if item.is_ordered {
193                            format!("{}{}. ", indent, counters[level])
194                        } else {
195                            format!("{}- ", indent)
196                        };
197
198                        slide_txt.push_str(&format!("{}{}\n", marker, item_text));
199                    }
200                },
201                _ => ()
202            }
203        }
204        Some(slide_txt)
205    }
206
207    /// Extracts the numeric slide identifier from a slide path.
208    ///
209    /// Helper method to parse slide numbers from internal pptx
210    /// slide paths (e.g., "ppt/slides/slide1.xml" → `1`).
211    pub fn extract_slide_number(path: &str) -> Option<u32> {
212        path.split('/')
213            .next_back()
214            .and_then(|filename| {
215                filename
216                    .strip_prefix("slide")
217                    .and_then(|s| s.strip_suffix(".xml"))
218            })
219            .and_then(|num_str| num_str.parse::<u32>().ok())
220    }
221
222    /// Links slide images references with their corresponding targets.
223    ///
224    /// Ensures that each image referenced by its ID is correctly 
225    /// linked to the actual internal resource paths stored in the slide.
226    /// This method is typically used internally after parsing a slide
227    ///
228    /// # Notes
229    ///
230    /// Internally those are the values image references are holding
231    ///
232    /// | Parameter | Example value         |
233    /// |---------- |---------------------- |
234    /// | `id`      | *rId2*                |
235    /// | `target`  | *../media/image2.png* |
236    ///
237    pub fn link_images(&mut self) {
238        let id_to_target: HashMap<String, String> = self.images
239            .iter()
240            .map(|img_ref| (img_ref.id.clone(), img_ref.target.clone()))
241            .collect();
242
243        for element in &mut self.elements {
244            if let SlideElement::Image(ref mut img_ref, _pos) = element {
245                if let Some(target) = id_to_target.get(&img_ref.id) {
246                    img_ref.target = target.clone();
247                }
248            }
249        }
250    }
251
252    /// Extracts the file extension from image paths
253    pub fn get_image_extension(&self, path: &str) -> String {
254        Path::new(path)
255            .extension()
256            .and_then(|ext| ext.to_str())
257            .unwrap_or("bin")
258            .to_string()
259    }
260
261    /// Compresses the image data and returning it as a `jpg` byte slice
262    /// 
263    /// # Parameter
264    /// 
265    /// - `image_data`: The raw image data as a byte array
266    /// 
267    /// # Returns
268    /// 
269    /// - `Vec<u8>`: Returns the compressed and converted jpg byte array
270    ///
271    /// # Notes
272    ///
273    /// All images will be converted to `jpg`
274    pub fn compress_image(&self, image_data: &[u8]) -> Option<Vec<u8>> {
275        let img = match image::load_from_memory(image_data) {
276            Ok(image) => image,
277            Err(_) => return None,
278        };
279
280        let mut output = Vec::new();
281        let quality = self.config.quality;
282
283        if img.write_to(&mut Cursor::new(&mut output), ImageOutputFormat::Jpeg(quality)).is_ok() {
284            Some(output)
285        } else {
286            None
287        }
288    }
289    
290    pub fn load_images_manually(&self) -> Option<Vec<ManualImage>> {
291        let mut images: Vec<ManualImage> = Vec::new();
292        
293        let image_refs: Vec<&ImageReference> = self.elements
294            .iter()
295            .filter_map(|element| match element {
296                SlideElement::Image(ref img, _pos) => Some(img),
297                _ => None,
298            })
299            .collect();
300        
301        for image_ref in image_refs {
302            if let Some(image_data) = self.image_data.get(&image_ref.id) {
303                let image_data = self.config.compress_images
304                    .then( | | self.compress_image(image_data))
305                    .unwrap_or_else(|| Option::from(image_data.clone()));
306
307                let base64_str = general_purpose::STANDARD.encode(image_data?);
308                
309                let image = ManualImage::new(
310                    base64_str,
311                    image_ref.clone(),
312                );
313                images.push(image);
314            }
315        }
316        
317        Some(images)
318    }
319
320    fn path_to_file_url(&self, path: &Path) -> Option<String> {
321        let abs_path = path.canonicalize().ok()?;
322        let mut path_str = abs_path.to_string_lossy().replace('\\', "/");
323
324        // remove windows unc prefix
325        if cfg!(windows) {
326            if let Some(stripped) = path_str.strip_prefix("//?/") {
327                path_str = stripped.to_string();
328            }
329            Some(format!("file:///{}", path_str))
330        } else {
331            Some(format!("file://{}", path_str))
332        }
333    }
334}
335
#[cfg(test)]
mod tests {
    use std::fs;
    use std::path::PathBuf;
    use crate::ElementPosition;
    use super::*;

    /// Builds an empty slide with the default parser configuration.
    fn mock_slide() -> Slide {
        Slide {
            rel_path: "ppt/slides/slide1.xml".to_string(),
            slide_number: 1,
            elements: vec![],
            images: vec![],
            image_data: HashMap::new(),
            config: ParserConfig::default(),
        }
    }

    /// Reads a fixture file from `tests/test_data` below the crate root.
    fn load_image_data(filename: &str) -> Vec<u8> {
        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests")
            .join("test_data")
            .join(filename);
        fs::read(path).expect("Unable to read test data file")
    }

    #[test]
    fn test_extract_slide_number() {
        let input = "ppt/slides/slide5.xml";

        let actual = Slide::extract_slide_number(input);

        assert_eq!(actual, Some(5));
    }

    #[test]
    fn test_extract_slide_number_rejects_non_slide_paths() {
        assert_eq!(Slide::extract_slide_number("ppt/slides/slideX.xml"), None);
        assert_eq!(Slide::extract_slide_number("ppt/slideLayouts/slideLayout1.xml"), None);
        assert_eq!(Slide::extract_slide_number("ppt/slides/slide.xml"), None);
        assert_eq!(Slide::extract_slide_number(""), None);
    }

    #[test]
    fn test_get_image_extension() {
        let slide = mock_slide();
        let input = "../media/image1.png";

        let actual = slide.get_image_extension(input);
        let expected = "png";

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_get_image_extension_falls_back_to_bin() {
        let slide = mock_slide();

        assert_eq!(slide.get_image_extension("../media/image1"), "bin");
    }

    #[test]
    fn test_link_images() {
        let mut slide = mock_slide();
        let position = ElementPosition::default();

        slide.images.push(ImageReference { id: "rId2".to_string(), target: "../media/image1.png".to_string() });
        slide.elements.push(SlideElement::Image(ImageReference { id: "rId2".to_string(), target: "".to_string() }, position));

        slide.link_images();

        // Fail loudly instead of passing vacuously when the element is not an image.
        match &slide.elements[0] {
            SlideElement::Image(img_ref, _position) => {
                assert_eq!(img_ref.target, "../media/image1.png");
            }
            _ => panic!("Expected the first element to be an image"),
        }
    }

    #[test]
    fn test_image_compression_reduces_size() {
        let mut slide = mock_slide();
        slide.config.quality = 50;

        let raw_image = load_image_data("example-image.jpg");

        let compression_result = slide
            .compress_image(&raw_image)
            .expect("Compression failed");

        assert!(compression_result.len() < raw_image.len());
    }

    #[test]
    fn test_compressed_image_is_valid_jpg() {
        let slide = mock_slide();
        let raw_image = load_image_data("example-image.jpg");

        let compression_result = slide
            .compress_image(&raw_image)
            .expect("Compression failed");

        // JPEG streams begin with the start-of-image marker 0xFF 0xD8.
        assert!(compression_result.starts_with(&[0xFF, 0xD8]));
        assert!(image::load_from_memory(&compression_result).is_ok());
    }
}
424}