1use super::{Web2PptError, Result, Web2PptConfig};
4use scraper::{Html, Selector, ElementRef};
5
/// Kind of content carried by a [`ContentBlock`].
///
/// Variants that need structured data (images, tables, links) carry it
/// inline; the remaining variants rely on the owning block's `text`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentType {
    /// Top-level title; reported as heading level 1.
    Title,
    /// Section heading together with its numeric level.
    Heading(u8),
    /// Paragraph of running text.
    Paragraph,
    /// Single list entry.
    ListItem,
    /// Code snippet.
    Code,
    /// Image reference: source location plus alternative text.
    Image { src: String, alt: String },
    /// Table stored as rows, each row being a list of cell texts.
    Table(Vec<Vec<String>>),
    /// Quoted passage.
    Quote,
    /// Hyperlink: visible text plus target.
    Link { text: String, href: String },
}
28
29#[derive(Clone, Debug)]
31pub struct ContentBlock {
32 pub content_type: ContentType,
34 pub text: String,
36 pub level: u8,
38}
39
40impl ContentBlock {
41 pub fn new(content_type: ContentType, text: &str) -> Self {
43 ContentBlock {
44 content_type,
45 text: text.trim().to_string(),
46 level: 0,
47 }
48 }
49
50 pub fn with_level(mut self, level: u8) -> Self {
52 self.level = level;
53 self
54 }
55
56 pub fn is_heading(&self) -> bool {
58 matches!(self.content_type, ContentType::Title | ContentType::Heading(_))
59 }
60
61 pub fn heading_level(&self) -> Option<u8> {
63 match self.content_type {
64 ContentType::Title => Some(1),
65 ContentType::Heading(level) => Some(level),
66 _ => None,
67 }
68 }
69}
70
71#[derive(Clone, Debug)]
73pub struct WebContent {
74 pub title: String,
76 pub url: String,
78 pub description: Option<String>,
80 pub blocks: Vec<ContentBlock>,
82 pub images: Vec<(String, String)>, }
85
86impl WebContent {
87 pub fn new(url: &str) -> Self {
89 WebContent {
90 title: String::new(),
91 url: url.to_string(),
92 description: None,
93 blocks: Vec::new(),
94 images: Vec::new(),
95 }
96 }
97
98 pub fn is_empty(&self) -> bool {
100 self.blocks.is_empty()
101 }
102
103 pub fn headings(&self) -> Vec<&ContentBlock> {
105 self.blocks.iter().filter(|b| b.is_heading()).collect()
106 }
107
108 pub fn grouped_by_headings(&self) -> Vec<(&ContentBlock, Vec<&ContentBlock>)> {
110 let mut groups: Vec<(&ContentBlock, Vec<&ContentBlock>)> = Vec::new();
111 let mut current_heading: Option<&ContentBlock> = None;
112 let mut current_content: Vec<&ContentBlock> = Vec::new();
113
114 for block in &self.blocks {
115 if block.is_heading() {
116 if let Some(heading) = current_heading {
118 groups.push((heading, current_content));
119 current_content = Vec::new();
120 }
121 current_heading = Some(block);
122 } else {
123 current_content.push(block);
124 }
125 }
126
127 if let Some(heading) = current_heading {
129 groups.push((heading, current_content));
130 }
131
132 groups
133 }
134}
135
136pub struct WebParser {
138 config: Web2PptConfig,
139}
140
141impl WebParser {
142 pub fn new() -> Self {
144 Self::with_config(Web2PptConfig::default())
145 }
146
147 pub fn with_config(config: Web2PptConfig) -> Self {
149 WebParser { config }
150 }
151
152 pub fn parse(&self, html: &str, url: &str) -> Result<WebContent> {
154 let document = Html::parse_document(html);
155 let mut content = WebContent::new(url);
156
157 content.title = self.extract_title(&document);
159
160 content.description = self.extract_meta_description(&document);
162
163 self.extract_content(&document, &mut content)?;
165
166 if content.is_empty() {
167 return Err(Web2PptError::NoContent);
168 }
169
170 Ok(content)
171 }
172
173 fn extract_title(&self, document: &Html) -> String {
175 if let Ok(selector) = Selector::parse("title") {
177 if let Some(element) = document.select(&selector).next() {
178 let title = element.text().collect::<String>().trim().to_string();
179 if !title.is_empty() {
180 return title;
181 }
182 }
183 }
184
185 if let Ok(selector) = Selector::parse("h1") {
187 if let Some(element) = document.select(&selector).next() {
188 let title = element.text().collect::<String>().trim().to_string();
189 if !title.is_empty() {
190 return title;
191 }
192 }
193 }
194
195 if let Ok(selector) = Selector::parse("meta[property='og:title']") {
197 if let Some(element) = document.select(&selector).next() {
198 if let Some(content) = element.value().attr("content") {
199 return content.trim().to_string();
200 }
201 }
202 }
203
204 "Untitled".to_string()
205 }
206
207 fn extract_meta_description(&self, document: &Html) -> Option<String> {
209 if let Ok(selector) = Selector::parse("meta[name='description']") {
211 if let Some(element) = document.select(&selector).next() {
212 if let Some(content) = element.value().attr("content") {
213 let desc = content.trim().to_string();
214 if !desc.is_empty() {
215 return Some(desc);
216 }
217 }
218 }
219 }
220
221 if let Ok(selector) = Selector::parse("meta[property='og:description']") {
223 if let Some(element) = document.select(&selector).next() {
224 if let Some(content) = element.value().attr("content") {
225 let desc = content.trim().to_string();
226 if !desc.is_empty() {
227 return Some(desc);
228 }
229 }
230 }
231 }
232
233 None
234 }
235
236 fn extract_content(&self, document: &Html, content: &mut WebContent) -> Result<()> {
238 let main_selectors = [
240 "main article",
241 "article",
242 "main",
243 "[role='main']",
244 ".content",
245 ".post-content",
246 ".article-content",
247 ".entry-content",
248 ".markdown-body",
249 ".prose",
250 "#content",
251 "#main",
252 "#article",
253 ".article",
254 "body",
255 ];
256
257 let mut main_element: Option<ElementRef> = None;
258
259 for selector_str in &main_selectors {
260 if let Ok(selector) = Selector::parse(selector_str) {
261 if let Some(element) = document.select(&selector).next() {
262 let text_len: usize = element.text().collect::<String>().len();
264 if text_len > 100 {
265 main_element = Some(element);
266 break;
267 }
268 }
269 }
270 }
271
272 let main = main_element.ok_or(Web2PptError::NoContent)?;
273
274 self.walk_element(&main, content, 0);
276
277 Ok(())
278 }
279
280 fn walk_element(&self, element: &ElementRef, content: &mut WebContent, depth: u8) {
282 let tag_name = element.value().name();
284 let skip_tags = ["script", "style", "noscript", "svg", "form", "button", "input", "select", "textarea", "iframe"];
285 if skip_tags.contains(&tag_name) {
286 return;
287 }
288
289 if let Some(class) = element.value().attr("class") {
291 let class_lower = class.to_lowercase();
292 let skip_classes = ["advertisement", "ad-container", "social-share", "comment-section"];
294 if skip_classes.iter().any(|c| class_lower.contains(c)) {
295 return;
296 }
297 }
298
299 match tag_name {
300 "h1" => {
301 let text = self.clean_text(element);
302 if !text.is_empty() && text.len() < 300 {
303 content.blocks.push(ContentBlock::new(ContentType::Title, &text));
304 }
305 }
306 "h2" | "h3" | "h4" | "h5" | "h6" => {
307 let text = self.clean_text(element);
308 if !text.is_empty() && text.len() < 300 {
309 let level = tag_name.chars().last().unwrap().to_digit(10).unwrap() as u8;
310 content.blocks.push(ContentBlock::new(ContentType::Heading(level), &text));
311 }
312 }
313 "p" => {
314 let text = self.clean_text(element);
315 if text.len() >= 10 {
317 content.blocks.push(ContentBlock::new(ContentType::Paragraph, &text));
318 }
319 }
320 "li" => {
321 let text = self.clean_text(element);
322 if !text.is_empty() && text.len() < 500 {
323 content.blocks.push(ContentBlock::new(ContentType::ListItem, &text).with_level(depth));
324 }
325 }
326 "blockquote" => {
327 let text = self.clean_text(element);
328 if !text.is_empty() {
329 content.blocks.push(ContentBlock::new(ContentType::Quote, &text));
330 }
331 }
332 "pre" | "code" => {
333 if self.config.include_code {
334 let text = element.text().collect::<String>();
335 let text = text.trim();
336 if !text.is_empty() && text.len() <= 1000 {
337 content.blocks.push(ContentBlock::new(ContentType::Code, text));
338 }
339 }
340 return; }
342 "img" => {
343 if self.config.include_images {
344 if let Some(src) = element.value().attr("src") {
345 let alt = element.value().attr("alt").unwrap_or("").to_string();
346 if !src.starts_with("data:") && !alt.is_empty() {
347 content.images.push((src.to_string(), alt.clone()));
348 content.blocks.push(ContentBlock::new(
349 ContentType::Image { src: src.to_string(), alt },
350 ""
351 ));
352 }
353 }
354 }
355 }
356 "table" => {
357 if self.config.include_tables {
358 self.extract_table(element, content);
359 }
360 return; }
362 "a" => {
363 if self.config.extract_links {
365 if let Some(href) = element.value().attr("href") {
366 let text = self.clean_text(element);
367 if !text.is_empty() && text.len() > 5 && href.starts_with("http") {
368 }
371 }
372 }
373 }
374 _ => {}
375 }
376
377 let no_recurse_tags = ["p", "li", "pre", "code", "img", "table", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"];
379 if !no_recurse_tags.contains(&tag_name) {
380 for child in element.children() {
381 if let Some(child_elem) = ElementRef::wrap(child) {
382 self.walk_element(&child_elem, content, depth + 1);
383 }
384 }
385 }
386 }
387
388 fn clean_text(&self, element: &ElementRef) -> String {
390 let text: String = element.text().collect();
391 let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
393 text.trim().to_string()
394 }
395
396 fn extract_table(&self, element: &ElementRef, content: &mut WebContent) {
398 let mut rows: Vec<Vec<String>> = Vec::new();
399
400 if let Ok(row_selector) = Selector::parse("tr") {
401 for row in element.select(&row_selector) {
402 let mut cells: Vec<String> = Vec::new();
403
404 if let Ok(cell_selector) = Selector::parse("th, td") {
405 for cell in row.select(&cell_selector) {
406 let text = self.clean_text(&cell);
407 cells.push(text);
408 }
409 }
410
411 if !cells.is_empty() {
412 rows.push(cells);
413 }
414 }
415 }
416
417 if !rows.is_empty() && rows.len() <= 30 {
418 content.blocks.push(ContentBlock::new(
419 ContentType::Table(rows),
420 ""
421 ));
422 }
423 }
424}
425
426impl Default for WebParser {
427 fn default() -> Self {
428 Self::new()
429 }
430}
431
#[cfg(test)]
mod tests {
    use super::*;

    /// A small but complete page parses, takes its title from `<title>`, and
    /// yields at least one content block.
    #[test]
    fn test_parse_simple_html() {
        let page_source = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Test Page</title></head>
            <body>
                <h1>Main Title</h1>
                <p>This is a paragraph with enough text to be included.</p>
                <h2>Section 1</h2>
                <ul>
                    <li>Item 1</li>
                    <li>Item 2</li>
                </ul>
            </body>
            </html>
        "#;

        let parsed = WebParser::new()
            .parse(page_source, "https://example.com")
            .unwrap();

        assert_eq!(parsed.title, "Test Page");
        assert!(!parsed.blocks.is_empty());
    }

    /// A `Heading` block reports itself as a heading with its own level.
    #[test]
    fn test_content_block() {
        let heading = ContentBlock::new(ContentType::Heading(2), "Test Heading");
        assert!(heading.is_heading());
        assert_eq!(heading.heading_level(), Some(2));
    }

    /// Two headings, each followed by a paragraph, produce two groups.
    #[test]
    fn test_grouped_by_headings() {
        let mut page = WebContent::new("https://example.com");
        page.blocks = vec![
            ContentBlock::new(ContentType::Title, "Title"),
            ContentBlock::new(ContentType::Paragraph, "Intro text"),
            ContentBlock::new(ContentType::Heading(2), "Section 1"),
            ContentBlock::new(ContentType::Paragraph, "Section 1 text"),
        ];

        let sections = page.grouped_by_headings();
        assert_eq!(sections.len(), 2);
    }
}