// stillo_core/extractor.rs

1pub mod readability;
2pub mod spa_detection;
3
4use html5ever::parse_document;
5use html5ever::tendril::TendrilSink;
6use markup5ever_rcdom::RcDom;
7
8use crate::document::{ExtractedContent, RawHtml, SpaDetection};
9use self::readability::ReadabilityExtractor;
10use self::spa_detection::{detect_spa, extract_text_length};
11
/// Tunable settings consumed by `ContentExtractor`.
#[derive(Debug, Clone)]
pub struct ExtractorConfig {
    /// Length value handed to SPA detection together with the measured text length.
    /// NOTE(review): presumably the minimum amount of text a page must contain to
    /// count as real content — confirm against `detect_spa`.
    pub min_content_length: usize,
    /// Selector strings for noise removal.
    /// NOTE(review): not read by any code visible in this file — confirm where it is used.
    pub noise_selectors: Vec<String>,
    /// Forwarded to the readability extractor's `preserve_links` field.
    pub preserve_links: bool,
}

impl Default for ExtractorConfig {
    /// Baseline configuration: a length threshold of 500, no noise selectors,
    /// and link preservation switched on.
    fn default() -> Self {
        ExtractorConfig {
            preserve_links: true,
            noise_selectors: Vec::new(),
            min_content_length: 500,
        }
    }
}
28
29#[derive(Debug, thiserror::Error)]
30pub enum ExtractionError {
31    #[error("Failed to decode HTML: {0}")]
32    Decode(String),
33    #[error("No content found")]
34    NoContent,
35}
36
37pub struct ContentExtractor {
38    config: ExtractorConfig,
39}
40
41impl ContentExtractor {
42    pub fn new(config: ExtractorConfig) -> Self {
43        Self { config }
44    }
45
46    /// RawHtml → ExtractedContent(純粋関数)
47    pub fn extract(&self, raw: &RawHtml) -> Result<ExtractedContent, ExtractionError> {
48        let html_str = self.decode_bytes(raw)?;
49        let dom = self.parse_html(&html_str);
50        let root = dom.document.clone();
51
52        let text_len = extract_text_length(&root);
53        let _spa = detect_spa(&root, text_len, self.config.min_content_length);
54
55        let extractor = ReadabilityExtractor {
56            preserve_links: self.config.preserve_links,
57        };
58        let content = extractor.extract(&root, &raw.url);
59
60        Ok(content)
61    }
62
63    pub fn detect_spa_for(&self, raw: &RawHtml) -> Result<SpaDetection, ExtractionError> {
64        let html_str = self.decode_bytes(raw)?;
65        let dom = self.parse_html(&html_str);
66        let root = dom.document.clone();
67        let text_len = extract_text_length(&root);
68        Ok(detect_spa(&root, text_len, self.config.min_content_length))
69    }
70
71    fn decode_bytes(&self, raw: &RawHtml) -> Result<String, ExtractionError> {
72        // content-type から文字コードを判定、デフォルトは UTF-8
73        // BOMチェックを行い、フォールバックはlatin1として再解釈
74        if let Ok(s) = std::str::from_utf8(&raw.bytes) {
75            return Ok(s.to_owned());
76        }
77        // latin1 フォールバック
78        Ok(raw.bytes.iter().map(|&b| b as char).collect())
79    }
80
81    fn parse_html(&self, html: &str) -> RcDom {
82        parse_document(RcDom::default(), Default::default())
83            .from_utf8()
84            .read_from(&mut html.as_bytes())
85            .unwrap_or_default()
86    }
87}