Skip to main content

ppt_rs/web2ppt/
parser.rs

1//! HTML parser for Web2PPT
2
3use super::{Web2PptError, Result, Web2PptConfig};
4use scraper::{Html, Selector, ElementRef};
5
/// Classification of a single piece of content pulled out of a page.
///
/// Variants that need structured data (images, tables, links) carry it
/// inline; the remaining variants describe plain text that is stored in the
/// owning [`ContentBlock`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentType {
    /// The main title of the page (an `<h1>` element).
    Title,
    /// A section heading; the payload is the heading level (`<h2>`..`<h6>` => 2..6).
    Heading(u8),
    /// A paragraph of running text.
    Paragraph,
    /// One entry of a list.
    ListItem,
    /// A block of source code.
    Code,
    /// An image, identified by its source URL and its alternative text.
    Image { src: String, alt: String },
    /// Tabular data: one inner vector of cell texts per table row.
    Table(Vec<Vec<String>>),
    /// A quotation (`<blockquote>`).
    Quote,
    /// A hyperlink with its visible text and its target URL.
    Link { text: String, href: String },
}
28
29/// A block of content extracted from the page
30#[derive(Clone, Debug)]
31pub struct ContentBlock {
32    /// Type of content
33    pub content_type: ContentType,
34    /// Text content
35    pub text: String,
36    /// Nesting level (for lists)
37    pub level: u8,
38}
39
40impl ContentBlock {
41    /// Create a new content block
42    pub fn new(content_type: ContentType, text: &str) -> Self {
43        ContentBlock {
44            content_type,
45            text: text.trim().to_string(),
46            level: 0,
47        }
48    }
49
50    /// Create with level
51    pub fn with_level(mut self, level: u8) -> Self {
52        self.level = level;
53        self
54    }
55
56    /// Check if this is a heading
57    pub fn is_heading(&self) -> bool {
58        matches!(self.content_type, ContentType::Title | ContentType::Heading(_))
59    }
60
61    /// Get heading level (1 for title, 2-6 for headings)
62    pub fn heading_level(&self) -> Option<u8> {
63        match self.content_type {
64            ContentType::Title => Some(1),
65            ContentType::Heading(level) => Some(level),
66            _ => None,
67        }
68    }
69}
70
71/// Extracted web content
72#[derive(Clone, Debug)]
73pub struct WebContent {
74    /// Page title
75    pub title: String,
76    /// Page URL
77    pub url: String,
78    /// Meta description
79    pub description: Option<String>,
80    /// Content blocks
81    pub blocks: Vec<ContentBlock>,
82    /// Images found
83    pub images: Vec<(String, String)>, // (src, alt)
84}
85
86impl WebContent {
87    /// Create empty web content
88    pub fn new(url: &str) -> Self {
89        WebContent {
90            title: String::new(),
91            url: url.to_string(),
92            description: None,
93            blocks: Vec::new(),
94            images: Vec::new(),
95        }
96    }
97
98    /// Check if content is empty
99    pub fn is_empty(&self) -> bool {
100        self.blocks.is_empty()
101    }
102
103    /// Get all headings
104    pub fn headings(&self) -> Vec<&ContentBlock> {
105        self.blocks.iter().filter(|b| b.is_heading()).collect()
106    }
107
108    /// Get content grouped by headings
109    pub fn grouped_by_headings(&self) -> Vec<(&ContentBlock, Vec<&ContentBlock>)> {
110        let mut groups: Vec<(&ContentBlock, Vec<&ContentBlock>)> = Vec::new();
111        let mut current_heading: Option<&ContentBlock> = None;
112        let mut current_content: Vec<&ContentBlock> = Vec::new();
113
114        for block in &self.blocks {
115            if block.is_heading() {
116                // Save previous group
117                if let Some(heading) = current_heading {
118                    groups.push((heading, current_content));
119                    current_content = Vec::new();
120                }
121                current_heading = Some(block);
122            } else {
123                current_content.push(block);
124            }
125        }
126
127        // Save last group
128        if let Some(heading) = current_heading {
129            groups.push((heading, current_content));
130        }
131
132        groups
133    }
134}
135
136/// HTML parser for extracting content
137pub struct WebParser {
138    config: Web2PptConfig,
139}
140
141impl WebParser {
142    /// Create a new parser with default config
143    pub fn new() -> Self {
144        Self::with_config(Web2PptConfig::default())
145    }
146
147    /// Create a new parser with custom config
148    pub fn with_config(config: Web2PptConfig) -> Self {
149        WebParser { config }
150    }
151
152    /// Parse HTML content
153    pub fn parse(&self, html: &str, url: &str) -> Result<WebContent> {
154        let document = Html::parse_document(html);
155        let mut content = WebContent::new(url);
156
157        // Extract title
158        content.title = self.extract_title(&document);
159
160        // Extract meta description
161        content.description = self.extract_meta_description(&document);
162
163        // Extract main content
164        self.extract_content(&document, &mut content)?;
165
166        if content.is_empty() {
167            return Err(Web2PptError::NoContent);
168        }
169
170        Ok(content)
171    }
172
173    /// Extract page title
174    fn extract_title(&self, document: &Html) -> String {
175        // Try <title> tag first
176        if let Ok(selector) = Selector::parse("title") {
177            if let Some(element) = document.select(&selector).next() {
178                let title = element.text().collect::<String>().trim().to_string();
179                if !title.is_empty() {
180                    return title;
181                }
182            }
183        }
184
185        // Try h1
186        if let Ok(selector) = Selector::parse("h1") {
187            if let Some(element) = document.select(&selector).next() {
188                let title = element.text().collect::<String>().trim().to_string();
189                if !title.is_empty() {
190                    return title;
191                }
192            }
193        }
194
195        // Try og:title
196        if let Ok(selector) = Selector::parse("meta[property='og:title']") {
197            if let Some(element) = document.select(&selector).next() {
198                if let Some(content) = element.value().attr("content") {
199                    return content.trim().to_string();
200                }
201            }
202        }
203
204        "Untitled".to_string()
205    }
206
207    /// Extract meta description
208    fn extract_meta_description(&self, document: &Html) -> Option<String> {
209        // Try meta description
210        if let Ok(selector) = Selector::parse("meta[name='description']") {
211            if let Some(element) = document.select(&selector).next() {
212                if let Some(content) = element.value().attr("content") {
213                    let desc = content.trim().to_string();
214                    if !desc.is_empty() {
215                        return Some(desc);
216                    }
217                }
218            }
219        }
220
221        // Try og:description
222        if let Ok(selector) = Selector::parse("meta[property='og:description']") {
223            if let Some(element) = document.select(&selector).next() {
224                if let Some(content) = element.value().attr("content") {
225                    let desc = content.trim().to_string();
226                    if !desc.is_empty() {
227                        return Some(desc);
228                    }
229                }
230            }
231        }
232
233        None
234    }
235
236    /// Extract main content from document - preserving document order
237    fn extract_content(&self, document: &Html, content: &mut WebContent) -> Result<()> {
238        // Try to find main content area
239        let main_selectors = [
240            "main article",
241            "article",
242            "main",
243            "[role='main']",
244            ".content",
245            ".post-content",
246            ".article-content",
247            ".entry-content",
248            ".markdown-body",
249            ".prose",
250            "#content",
251            "#main",
252            "#article",
253            ".article",
254            "body",
255        ];
256
257        let mut main_element: Option<ElementRef> = None;
258
259        for selector_str in &main_selectors {
260            if let Ok(selector) = Selector::parse(selector_str) {
261                if let Some(element) = document.select(&selector).next() {
262                    // Check if this element has meaningful content
263                    let text_len: usize = element.text().collect::<String>().len();
264                    if text_len > 100 {
265                        main_element = Some(element);
266                        break;
267                    }
268                }
269            }
270        }
271
272        let main = main_element.ok_or(Web2PptError::NoContent)?;
273
274        // Extract content in document order by walking the DOM
275        self.walk_element(&main, content, 0);
276
277        Ok(())
278    }
279
280    /// Walk element tree and extract content in order
281    fn walk_element(&self, element: &ElementRef, content: &mut WebContent, depth: u8) {
282        // Skip script, style, nav, footer, aside, header elements
283        let tag_name = element.value().name();
284        let skip_tags = ["script", "style", "noscript", "svg", "form", "button", "input", "select", "textarea", "iframe"];
285        if skip_tags.contains(&tag_name) {
286            return;
287        }
288
289        // Check for class/id names that indicate non-content (but be less aggressive)
290        if let Some(class) = element.value().attr("class") {
291            let class_lower = class.to_lowercase();
292            // Only skip if clearly navigation/ads
293            let skip_classes = ["advertisement", "ad-container", "social-share", "comment-section"];
294            if skip_classes.iter().any(|c| class_lower.contains(c)) {
295                return;
296            }
297        }
298
299        match tag_name {
300            "h1" => {
301                let text = self.clean_text(element);
302                if !text.is_empty() && text.len() < 300 {
303                    content.blocks.push(ContentBlock::new(ContentType::Title, &text));
304                }
305            }
306            "h2" | "h3" | "h4" | "h5" | "h6" => {
307                let text = self.clean_text(element);
308                if !text.is_empty() && text.len() < 300 {
309                    let level = tag_name.chars().last().unwrap().to_digit(10).unwrap() as u8;
310                    content.blocks.push(ContentBlock::new(ContentType::Heading(level), &text));
311                }
312            }
313            "p" => {
314                let text = self.clean_text(element);
315                // Accept paragraphs with at least 10 chars
316                if text.len() >= 10 {
317                    content.blocks.push(ContentBlock::new(ContentType::Paragraph, &text));
318                }
319            }
320            "li" => {
321                let text = self.clean_text(element);
322                if !text.is_empty() && text.len() < 500 {
323                    content.blocks.push(ContentBlock::new(ContentType::ListItem, &text).with_level(depth));
324                }
325            }
326            "blockquote" => {
327                let text = self.clean_text(element);
328                if !text.is_empty() {
329                    content.blocks.push(ContentBlock::new(ContentType::Quote, &text));
330                }
331            }
332            "pre" | "code" => {
333                if self.config.include_code {
334                    let text = element.text().collect::<String>();
335                    let text = text.trim();
336                    if !text.is_empty() && text.len() <= 1000 {
337                        content.blocks.push(ContentBlock::new(ContentType::Code, text));
338                    }
339                }
340                return; // Don't recurse into code blocks
341            }
342            "img" => {
343                if self.config.include_images {
344                    if let Some(src) = element.value().attr("src") {
345                        let alt = element.value().attr("alt").unwrap_or("").to_string();
346                        if !src.starts_with("data:") && !alt.is_empty() {
347                            content.images.push((src.to_string(), alt.clone()));
348                            content.blocks.push(ContentBlock::new(
349                                ContentType::Image { src: src.to_string(), alt },
350                                ""
351                            ));
352                        }
353                    }
354                }
355            }
356            "table" => {
357                if self.config.include_tables {
358                    self.extract_table(element, content);
359                }
360                return; // Don't recurse into tables
361            }
362            "a" => {
363                // Extract important links
364                if self.config.extract_links {
365                    if let Some(href) = element.value().attr("href") {
366                        let text = self.clean_text(element);
367                        if !text.is_empty() && text.len() > 5 && href.starts_with("http") {
368                            // Only add standalone links, not inline ones
369                            // This is handled by not recursing for links with substantial text
370                        }
371                    }
372                }
373            }
374            _ => {}
375        }
376
377        // Always recurse into children (except for leaf elements we've already processed)
378        let no_recurse_tags = ["p", "li", "pre", "code", "img", "table", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"];
379        if !no_recurse_tags.contains(&tag_name) {
380            for child in element.children() {
381                if let Some(child_elem) = ElementRef::wrap(child) {
382                    self.walk_element(&child_elem, content, depth + 1);
383                }
384            }
385        }
386    }
387
388    /// Clean and normalize text
389    fn clean_text(&self, element: &ElementRef) -> String {
390        let text: String = element.text().collect();
391        // Normalize whitespace
392        let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
393        text.trim().to_string()
394    }
395
396    /// Extract table content
397    fn extract_table(&self, element: &ElementRef, content: &mut WebContent) {
398        let mut rows: Vec<Vec<String>> = Vec::new();
399
400        if let Ok(row_selector) = Selector::parse("tr") {
401            for row in element.select(&row_selector) {
402                let mut cells: Vec<String> = Vec::new();
403
404                if let Ok(cell_selector) = Selector::parse("th, td") {
405                    for cell in row.select(&cell_selector) {
406                        let text = self.clean_text(&cell);
407                        cells.push(text);
408                    }
409                }
410
411                if !cells.is_empty() {
412                    rows.push(cells);
413                }
414            }
415        }
416
417        if !rows.is_empty() && rows.len() <= 30 {
418            content.blocks.push(ContentBlock::new(
419                ContentType::Table(rows),
420                ""
421            ));
422        }
423    }
424}
425
426impl Default for WebParser {
427    fn default() -> Self {
428        Self::new()
429    }
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// A small but complete page is parsed: the `<title>` wins as page title
    /// and at least one content block is produced.
    #[test]
    fn test_parse_simple_html() {
        let html = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Main Title</h1>
                <p>This is a paragraph with enough text to be included.</p>
                <h2>Section 1</h2>
                <ul>
                    <li>Item 1</li>
                    <li>Item 2</li>
                </ul>
            </body>
            </html>
        "#;

        let parsed = WebParser::new().parse(html, "https://example.com");
        let content = parsed.unwrap();

        assert_eq!(content.title, "Test Page");
        assert!(!content.blocks.is_empty());
    }

    /// A level-2 heading block reports itself as a heading of level 2.
    #[test]
    fn test_content_block() {
        let heading = ContentBlock::new(ContentType::Heading(2), "Test Heading");

        assert!(heading.is_heading());
        assert_eq!(heading.heading_level(), Some(2));
    }

    /// A title and one section heading yield two groups.
    #[test]
    fn test_grouped_by_headings() {
        let mut content = WebContent::new("https://example.com");
        content.blocks.extend(vec![
            ContentBlock::new(ContentType::Title, "Title"),
            ContentBlock::new(ContentType::Paragraph, "Intro text"),
            ContentBlock::new(ContentType::Heading(2), "Section 1"),
            ContentBlock::new(ContentType::Paragraph, "Section 1 text"),
        ]);

        let group_count = content.grouped_by_headings().len();
        assert_eq!(group_count, 2);
    }
}