// search5 0.3.2 - Docs.rs

use crate::parser::ParsedDocument;
use scraper::{Html, Selector};
use std::path::Path;

/// Stateless HTML parser that extracts a title and paragraph text from a page.
///
/// This is a unit struct: it carries no configuration or cached data, so
/// values are free to create, copy, and share.
#[derive(Debug, Clone, Copy)]
pub struct HtmlParser;

pub use crate::parser::HtmlParser as HtmlParserTrait;

impl HtmlParser {
    /// Creates a new parser.
    pub fn new() -> Self {
        Self
    }

    /// Reads the file at `path` as UTF-8 text and parses it as HTML.
    ///
    /// The lossy string form of `path` is passed to [`Self::parse`] as the
    /// document URL.
    ///
    /// # Errors
    ///
    /// Returns the error produced by [`std::fs::read_to_string`] when the
    /// file cannot be read into a `String`.
    pub fn parse_file(&self, path: &Path) -> std::io::Result<ParsedDocument> {
        let content = std::fs::read_to_string(path)?;
        Ok(self.parse(&content, &path.to_string_lossy()))
    }

    /// Parses `html` and builds a [`ParsedDocument`] from `url`, the
    /// extracted title, and the extracted paragraph text.
    pub fn parse(&self, html: &str, url: &str) -> ParsedDocument {
        let document = Html::parse_document(html);

        let title = self.extract_title(&document);
        let content = self.extract_content(&document);

        ParsedDocument::new(url.to_string(), title, content)
    }

    /// Returns the trimmed text of the first `<title>` or `<h1>` match whose
    /// text is not empty after trimming, or `"Untitled"` when there is none.
    fn extract_title(&self, document: &Html) -> String {
        // The selector text is a fixed literal, so a parse failure here would
        // be a programming error rather than a runtime condition.
        let title_selector =
            Selector::parse("title, h1").expect("`title, h1` is a valid CSS selector");

        document
            .select(&title_selector)
            .map(|element| element.text().collect::<String>().trim().to_string())
            .find(|text| !text.is_empty())
            .unwrap_or_else(|| String::from("Untitled"))
    }

    /// Concatenates the trimmed text of every `<p>` element, separated by a
    /// single space. Paragraphs that are empty after trimming are skipped.
    fn extract_content(&self, document: &Html) -> String {
        let p_selector = Selector::parse("p").expect("`p` is a valid CSS selector");
        let mut content = String::new();

        for element in document.select(&p_selector) {
            let text = element.text().collect::<String>();
            let text = text.trim();
            if text.is_empty() {
                continue;
            }
            // Insert the separator before each paragraph except the first, so
            // no trailing space has to be trimmed off afterwards.
            if !content.is_empty() {
                content.push(' ');
            }
            content.push_str(text);
        }

        content
    }

    /// Parses every file directly inside `dir` whose extension is `html`
    /// (compared ASCII case-insensitively, so `INDEX.HTML` is included).
    ///
    /// The scan is not recursive. Documents are returned in the order
    /// [`std::fs::read_dir`] yields the entries. Entries that cannot be read
    /// and files that fail to parse are reported on stderr and skipped, so a
    /// single bad file does not abort the whole scan.
    ///
    /// # Errors
    ///
    /// Returns an error only when `dir` itself cannot be opened for listing.
    pub fn parse_directory(&self, dir: &Path) -> std::io::Result<Vec<ParsedDocument>> {
        let mut documents = Vec::new();

        for entry in std::fs::read_dir(dir)? {
            let path = match entry {
                Ok(entry) => entry.path(),
                Err(e) => {
                    // Report the failure instead of dropping it silently,
                    // matching how parse failures are handled below.
                    eprintln!("Failed to read entry in {:?}: {}", dir, e);
                    continue;
                }
            };

            let is_html = path
                .extension()
                .is_some_and(|ext| ext.eq_ignore_ascii_case("html"));
            if !is_html {
                continue;
            }

            match self.parse_file(&path) {
                Ok(doc) => documents.push(doc),
                Err(e) => eprintln!("Failed to parse {:?}: {}", path, e),
            }
        }

        Ok(documents)
    }
}

impl Default for HtmlParser {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A `<title>` supplies the title, the URL is passed through, and
    /// paragraph text ends up in the content.
    #[test]
    fn test_parse_simple_html() {
        const PAGE: &str = r#"<!DOCTYPE html>
<html>
<head><title>Test Title</title></head>
<body><p>Hello world</p></body>
</html>"#;

        let parsed = HtmlParser::new().parse(PAGE, "test.html");

        assert_eq!(parsed.title, "Test Title");
        assert_eq!(parsed.url, "test.html");
        assert!(parsed.content.contains("Hello world"));
    }

    /// Without a `<title>`, the `<h1>` text is used as the title.
    #[test]
    fn test_extract_title_from_h1() {
        const PAGE: &str = r#"<!DOCTYPE html>
<html>
<body><h1>Heading One</h1></body>
</html>"#;

        let parsed = HtmlParser::new().parse(PAGE, "test.html");

        assert_eq!(parsed.title, "Heading One");
    }

    /// Text from every paragraph is present in the content.
    #[test]
    fn test_extract_content_multiple_paragraphs() {
        const PAGE: &str = r#"<!DOCTYPE html>
<html>
<body>
<p>First paragraph.</p>
<p>Second paragraph.</p>
<p>Third paragraph.</p>
</body>
</html>"#;

        let parsed = HtmlParser::new().parse(PAGE, "test.html");

        for expected in ["First paragraph", "Second paragraph", "Third paragraph"] {
            assert!(parsed.content.contains(expected));
        }
    }

    /// With neither `<title>` nor `<h1>`, the title falls back to "Untitled".
    #[test]
    fn test_default_title() {
        const PAGE: &str = r#"<!DOCTYPE html>
<html><body><p>No title here</p></body></html>"#;

        let parsed = HtmlParser::new().parse(PAGE, "test.html");

        assert_eq!(parsed.title, "Untitled");
    }

    /// Chinese text survives extraction in both title and content.
    #[test]
    fn test_chinese_content() {
        const PAGE: &str = r#"<!DOCTYPE html>
<html>
<head><title>測試標題</title></head>
<body><p>這是中文內容測試</p></body>
</html>"#;

        let parsed = HtmlParser::new().parse(PAGE, "test.html");

        assert_eq!(parsed.title, "測試標題");
        assert!(parsed.content.contains("這是中文內容測試"));
    }

    /// A paragraph mixing English and Chinese keeps both parts.
    #[test]
    fn test_mixed_chinese_english() {
        const PAGE: &str = r#"<!DOCTYPE html>
<html>
<head><title>Rust Programming</title></head>
<body><p>Rust 是一種安全的程式語言。</p></body>
</html>"#;

        let parsed = HtmlParser::new().parse(PAGE, "test.html");

        for expected in ["Rust", "程式語言"] {
            assert!(parsed.content.contains(expected));
        }
    }
}