use super::{Web2PptError, Result, Web2PptConfig};
use scraper::{Html, Selector, ElementRef};
/// The kind of content a [`ContentBlock`] represents.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentType {
    /// The page-level title heading.
    Title,
    /// A section heading; the payload is the heading level.
    Heading(u8),
    /// A paragraph of running text.
    Paragraph,
    /// A single list entry.
    ListItem,
    /// A code snippet.
    Code,
    /// An image, identified by its source location and alternative text.
    Image { src: String, alt: String },
    /// A table stored as rows, each row being a list of cell texts.
    Table(Vec<Vec<String>>),
    /// A quotation.
    Quote,
    /// A hyperlink with its visible text and target.
    Link { text: String, href: String },
}
/// One unit of extracted page content.
#[derive(Debug, Clone)]
pub struct ContentBlock {
    /// What kind of content this block holds.
    pub content_type: ContentType,
    /// The block's text.
    pub text: String,
    /// Nesting level associated with the block.
    pub level: u8,
}
impl ContentBlock {
    /// Builds a block of the given type.
    ///
    /// Surrounding whitespace is stripped from `text` and the level starts at 0.
    pub fn new(content_type: ContentType, text: &str) -> Self {
        Self {
            content_type,
            text: text.trim().to_owned(),
            level: 0,
        }
    }

    /// Consumes the block and returns it with `level` replaced.
    pub fn with_level(self, level: u8) -> Self {
        Self { level, ..self }
    }

    /// Reports whether the block is a title or a section heading.
    pub fn is_heading(&self) -> bool {
        self.heading_level().is_some()
    }

    /// Returns the heading level: 1 for a title, the stored level for a
    /// section heading, and `None` for every other content type.
    pub fn heading_level(&self) -> Option<u8> {
        match &self.content_type {
            ContentType::Heading(level) => Some(*level),
            ContentType::Title => Some(1),
            _ => None,
        }
    }
}
/// Everything extracted from a single web page.
#[derive(Debug, Clone)]
pub struct WebContent {
    /// Title of the page.
    pub title: String,
    /// Address the page was parsed for.
    pub url: String,
    /// Page description, when one is available.
    pub description: Option<String>,
    /// Content blocks in the order they were collected.
    pub blocks: Vec<ContentBlock>,
    /// Images as pairs of strings.
    pub images: Vec<(String, String)>,
}
impl WebContent {
pub fn new(url: &str) -> Self {
WebContent {
title: String::new(),
url: url.to_string(),
description: None,
blocks: Vec::new(),
images: Vec::new(),
}
}
pub fn is_empty(&self) -> bool {
self.blocks.is_empty()
}
pub fn headings(&self) -> Vec<&ContentBlock> {
self.blocks.iter().filter(|b| b.is_heading()).collect()
}
pub fn grouped_by_headings(&self) -> Vec<(&ContentBlock, Vec<&ContentBlock>)> {
let mut groups: Vec<(&ContentBlock, Vec<&ContentBlock>)> = Vec::new();
let mut current_heading: Option<&ContentBlock> = None;
let mut current_content: Vec<&ContentBlock> = Vec::new();
for block in &self.blocks {
if block.is_heading() {
if let Some(heading) = current_heading {
groups.push((heading, current_content));
current_content = Vec::new();
}
current_heading = Some(block);
} else {
current_content.push(block);
}
}
if let Some(heading) = current_heading {
groups.push((heading, current_content));
}
groups
}
}
/// Parser that turns an HTML document into a `WebContent`.
pub struct WebParser {
    /// Settings the parser consults while extracting content.
    config: Web2PptConfig,
}
impl WebParser {
pub fn new() -> Self {
Self::with_config(Web2PptConfig::default())
}
pub fn with_config(config: Web2PptConfig) -> Self {
WebParser { config }
}
pub fn parse(&self, html: &str, url: &str) -> Result<WebContent> {
let document = Html::parse_document(html);
let mut content = WebContent::new(url);
content.title = self.extract_title(&document);
content.description = self.extract_meta_description(&document);
self.extract_content(&document, &mut content)?;
if content.is_empty() {
return Err(Web2PptError::NoContent);
}
Ok(content)
}
fn extract_title(&self, document: &Html) -> String {
if let Ok(selector) = Selector::parse("title") {
if let Some(element) = document.select(&selector).next() {
let title = element.text().collect::<String>().trim().to_string();
if !title.is_empty() {
return title;
}
}
}
if let Ok(selector) = Selector::parse("h1") {
if let Some(element) = document.select(&selector).next() {
let title = element.text().collect::<String>().trim().to_string();
if !title.is_empty() {
return title;
}
}
}
if let Ok(selector) = Selector::parse("meta[property='og:title']") {
if let Some(element) = document.select(&selector).next() {
if let Some(content) = element.value().attr("content") {
return content.trim().to_string();
}
}
}
"Untitled".to_string()
}
fn extract_meta_description(&self, document: &Html) -> Option<String> {
if let Ok(selector) = Selector::parse("meta[name='description']") {
if let Some(element) = document.select(&selector).next() {
if let Some(content) = element.value().attr("content") {
let desc = content.trim().to_string();
if !desc.is_empty() {
return Some(desc);
}
}
}
}
if let Ok(selector) = Selector::parse("meta[property='og:description']") {
if let Some(element) = document.select(&selector).next() {
if let Some(content) = element.value().attr("content") {
let desc = content.trim().to_string();
if !desc.is_empty() {
return Some(desc);
}
}
}
}
None
}
fn extract_content(&self, document: &Html, content: &mut WebContent) -> Result<()> {
let main_selectors = [
"main article",
"article",
"main",
"[role='main']",
".content",
".post-content",
".article-content",
".entry-content",
".markdown-body",
".prose",
"#content",
"#main",
"#article",
".article",
"body",
];
let mut main_element: Option<ElementRef> = None;
for selector_str in &main_selectors {
if let Ok(selector) = Selector::parse(selector_str) {
if let Some(element) = document.select(&selector).next() {
let text_len: usize = element.text().collect::<String>().len();
if text_len > 100 {
main_element = Some(element);
break;
}
}
}
}
let main = main_element.ok_or(Web2PptError::NoContent)?;
self.walk_element(&main, content, 0);
Ok(())
}
fn walk_element(&self, element: &ElementRef, content: &mut WebContent, depth: u8) {
let tag_name = element.value().name();
let skip_tags = ["script", "style", "noscript", "svg", "form", "button", "input", "select", "textarea", "iframe"];
if skip_tags.contains(&tag_name) {
return;
}
if let Some(class) = element.value().attr("class") {
let class_lower = class.to_lowercase();
let skip_classes = ["advertisement", "ad-container", "social-share", "comment-section"];
if skip_classes.iter().any(|c| class_lower.contains(c)) {
return;
}
}
match tag_name {
"h1" => {
let text = self.clean_text(element);
if !text.is_empty() && text.len() < 300 {
content.blocks.push(ContentBlock::new(ContentType::Title, &text));
}
}
"h2" | "h3" | "h4" | "h5" | "h6" => {
let text = self.clean_text(element);
if !text.is_empty() && text.len() < 300 {
let level = tag_name.chars().last().unwrap().to_digit(10).unwrap() as u8;
content.blocks.push(ContentBlock::new(ContentType::Heading(level), &text));
}
}
"p" => {
let text = self.clean_text(element);
if text.len() >= 10 {
content.blocks.push(ContentBlock::new(ContentType::Paragraph, &text));
}
}
"li" => {
let text = self.clean_text(element);
if !text.is_empty() && text.len() < 500 {
content.blocks.push(ContentBlock::new(ContentType::ListItem, &text).with_level(depth));
}
}
"blockquote" => {
let text = self.clean_text(element);
if !text.is_empty() {
content.blocks.push(ContentBlock::new(ContentType::Quote, &text));
}
}
"pre" | "code" => {
if self.config.include_code {
let text = element.text().collect::<String>();
let text = text.trim();
if !text.is_empty() && text.len() <= 1000 {
content.blocks.push(ContentBlock::new(ContentType::Code, text));
}
}
return; }
"img" => {
if self.config.include_images {
if let Some(src) = element.value().attr("src") {
let alt = element.value().attr("alt").unwrap_or("").to_string();
if !src.starts_with("data:") && !alt.is_empty() {
content.images.push((src.to_string(), alt.clone()));
content.blocks.push(ContentBlock::new(
ContentType::Image { src: src.to_string(), alt },
""
));
}
}
}
}
"table" => {
if self.config.include_tables {
self.extract_table(element, content);
}
return; }
"a" => {
if self.config.extract_links {
if let Some(href) = element.value().attr("href") {
let text = self.clean_text(element);
if !text.is_empty() && text.len() > 5 && href.starts_with("http") {
}
}
}
}
_ => {}
}
let no_recurse_tags = ["p", "li", "pre", "code", "img", "table", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"];
if !no_recurse_tags.contains(&tag_name) {
for child in element.children() {
if let Some(child_elem) = ElementRef::wrap(child) {
self.walk_element(&child_elem, content, depth + 1);
}
}
}
}
fn clean_text(&self, element: &ElementRef) -> String {
let text: String = element.text().collect();
let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
text.trim().to_string()
}
fn extract_table(&self, element: &ElementRef, content: &mut WebContent) {
let mut rows: Vec<Vec<String>> = Vec::new();
if let Ok(row_selector) = Selector::parse("tr") {
for row in element.select(&row_selector) {
let mut cells: Vec<String> = Vec::new();
if let Ok(cell_selector) = Selector::parse("th, td") {
for cell in row.select(&cell_selector) {
let text = self.clean_text(&cell);
cells.push(text);
}
}
if !cells.is_empty() {
rows.push(cells);
}
}
}
if !rows.is_empty() && rows.len() <= 30 {
content.blocks.push(ContentBlock::new(
ContentType::Table(rows),
""
));
}
}
}
impl Default for WebParser {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Small page whose body text is comfortably longer than the 100 bytes the
    /// parser requires before it accepts a content container.
    const SIMPLE_PAGE: &str = r#"
        <!DOCTYPE html>
        <html>
        <head><title>Test Page</title></head>
        <body>
            <h1>Main Title</h1>
            <p>This is a paragraph with enough text to be included. It is deliberately long so that the page body clears the minimum text length required by the parser.</p>
            <h2>Section 1</h2>
            <ul>
                <li>Item 1</li>
                <li>Item 2</li>
            </ul>
        </body>
        </html>
    "#;

    /// A well-formed page yields its title and one block per content element,
    /// in document order.
    #[test]
    fn test_parse_simple_html() {
        let parser = WebParser::new();
        let content = parser.parse(SIMPLE_PAGE, "https://example.com").unwrap();

        assert_eq!(content.title, "Test Page");
        assert_eq!(content.url, "https://example.com");
        assert!(!content.blocks.is_empty());

        let kinds: Vec<ContentType> = content
            .blocks
            .iter()
            .map(|block| block.content_type.clone())
            .collect();
        assert_eq!(
            kinds,
            vec![
                ContentType::Title,
                ContentType::Paragraph,
                ContentType::Heading(2),
                ContentType::ListItem,
                ContentType::ListItem,
            ]
        );

        let heading_texts: Vec<&str> = content
            .headings()
            .iter()
            .map(|block| block.text.as_str())
            .collect();
        assert_eq!(heading_texts, vec!["Main Title", "Section 1"]);
    }

    /// A page with too little text has no usable content container.
    #[test]
    fn test_parse_rejects_page_without_enough_text() {
        let parser = WebParser::new();
        let result = parser.parse(
            "<html><body><p>Too short.</p></body></html>",
            "https://example.com",
        );
        assert!(result.is_err());
    }

    /// Headings report themselves as headings along with their level.
    #[test]
    fn test_content_block() {
        let heading = ContentBlock::new(ContentType::Heading(2), "Test Heading");
        assert!(heading.is_heading());
        assert_eq!(heading.heading_level(), Some(2));

        let paragraph = ContentBlock::new(ContentType::Paragraph, "  padded  ");
        assert!(!paragraph.is_heading());
        assert_eq!(paragraph.heading_level(), None);
        assert_eq!(paragraph.text, "padded");
    }

    /// Every heading opens a group holding the blocks that follow it.
    #[test]
    fn test_grouped_by_headings() {
        let mut content = WebContent::new("https://example.com");
        content.blocks.push(ContentBlock::new(ContentType::Title, "Title"));
        content.blocks.push(ContentBlock::new(ContentType::Paragraph, "Intro text"));
        content.blocks.push(ContentBlock::new(ContentType::Heading(2), "Section 1"));
        content.blocks.push(ContentBlock::new(ContentType::Paragraph, "Section 1 text"));

        let groups = content.grouped_by_headings();
        assert_eq!(groups.len(), 2);

        assert_eq!(groups[0].0.text, "Title");
        assert_eq!(groups[0].1.len(), 1);
        assert_eq!(groups[0].1[0].text, "Intro text");

        assert_eq!(groups[1].0.text, "Section 1");
        assert_eq!(groups[1].1.len(), 1);
        assert_eq!(groups[1].1[0].text, "Section 1 text");
    }
}