crw_extract/
lib.rs

1//! HTML content extraction and format conversion for the CRW web scraper.
2//!
3//! Converts raw HTML into clean, structured output formats:
4//!
5//! - **Markdown** — via [`markdown::html_to_markdown`] (htmd)
6//! - **Plain text** — via [`plaintext::html_to_plaintext`]
7//! - **Cleaned HTML** — boilerplate removal with [`clean::clean_html`]
8//! - **Readability** — main-content extraction with text-density scoring
9//! - **CSS/XPath selector** — narrow content to a specific element
10//! - **Chunking** — split content into sentence/topic/regex chunks
11//! - **Filtering** — BM25 or cosine-similarity ranking of chunks
12//! - **Structured JSON** — LLM-based extraction with JSON Schema validation
13
14pub mod chunking;
15pub mod clean;
16pub mod filter;
17pub mod markdown;
18#[cfg(feature = "pdf")]
19pub mod pdf;
20pub mod plaintext;
21pub mod readability;
22pub mod selector;
23pub mod structured;
24
25use crw_core::error::{CrwError, CrwResult};
26use crw_core::types::{
27    ChunkResult, ChunkStrategy, FilterMode, OutputFormat, PageMetadata, ScrapeData,
28};
29
30/// Options for the high-level extraction pipeline.
31pub struct ExtractOptions<'a> {
32    pub raw_html: &'a str,
33    pub source_url: &'a str,
34    pub status_code: u16,
35    pub rendered_with: Option<String>,
36    pub elapsed_ms: u64,
37    pub formats: &'a [OutputFormat],
38    pub only_main_content: bool,
39    pub include_tags: &'a [String],
40    pub exclude_tags: &'a [String],
41    /// CSS selector to narrow content before readability extraction.
42    pub css_selector: Option<&'a str>,
43    /// XPath expression to narrow content before readability extraction.
44    pub xpath: Option<&'a str>,
45    /// Strategy for chunking the extracted markdown.
46    pub chunk_strategy: Option<&'a ChunkStrategy>,
47    /// Query for chunk filtering (requires filter_mode).
48    pub query: Option<&'a str>,
49    /// Filtering algorithm for chunk ranking.
50    pub filter_mode: Option<&'a FilterMode>,
51    /// Number of top chunks to return (default: 5).
52    pub top_k: Option<usize>,
53}
54
55/// High-level extraction: given raw HTML + options, produce ScrapeData.
56pub fn extract(opts: ExtractOptions<'_>) -> CrwResult<ScrapeData> {
57    let ExtractOptions {
58        raw_html,
59        source_url,
60        status_code,
61        rendered_with,
62        elapsed_ms,
63        formats,
64        only_main_content,
65        include_tags,
66        exclude_tags,
67        css_selector,
68        xpath,
69        chunk_strategy,
70        query,
71        filter_mode,
72        top_k,
73    } = opts;
74
75    // Step 1: Extract metadata from raw HTML.
76    let meta = readability::extract_metadata(raw_html);
77
78    // Step 2: Clean HTML (remove boilerplate, nav, ads, etc.).
79    let cleaned = clean::clean_html(raw_html, only_main_content, include_tags, exclude_tags)
80        .unwrap_or_else(|_| raw_html.to_string());
81
82    // Step 3: Apply CSS/XPath selector if provided (narrows to a specific element).
83    let selected_html = apply_selector(&cleaned, css_selector, xpath)?;
84    let after_selection = selected_html.as_deref().unwrap_or(&cleaned);
85
86    // Step 4: If only_main_content, try to narrow further with readability scoring.
87    let (content_html, cleaned_ref) = if only_main_content && selected_html.is_none() {
88        let main = readability::extract_main_content(after_selection);
89        // Re-clean: readability may have selected a broad container (e.g. <article>)
90        // that still contains noise elements (infobox, navbox, catlinks, etc.)
91        let re_cleaned = clean::clean_html(&main, true, &[], &[]).unwrap_or(main);
92        (re_cleaned, Some(cleaned))
93    } else {
94        (after_selection.to_string(), None)
95    };
96
97    // Step 5: Produce requested formats.
98    let md = if formats.contains(&OutputFormat::Markdown) || formats.contains(&OutputFormat::Json) {
99        let md = markdown::html_to_markdown(&content_html);
100        // Trigger fallback if markdown is empty OR suspiciously short relative to HTML.
101        // Skip fallback when a CSS/XPath selector was explicitly used — short output is intentional.
102        let md_too_short =
103            selected_html.is_none() && md.trim().len() < 100 && raw_html.len() > 5000;
104        if md_too_short {
105            let fallback_md = if only_main_content && selected_html.is_none() {
106                // Try both fallbacks and pick whichever produced more content.
107                let from_cleaned = cleaned_ref
108                    .as_ref()
109                    .map(|c| markdown::html_to_markdown(c))
110                    .unwrap_or_default();
111
112                let basic_md = {
113                    let basic_cleaned =
114                        clean::clean_html(raw_html, false, include_tags, exclude_tags)
115                            .unwrap_or_else(|_| raw_html.to_string());
116                    markdown::html_to_markdown(&basic_cleaned)
117                };
118
119                // Pick whichever produced more content
120                if from_cleaned.trim().len() >= basic_md.trim().len() {
121                    from_cleaned
122                } else {
123                    basic_md
124                }
125            } else {
126                markdown::html_to_markdown(raw_html)
127            };
128
129            let fallback_too_short = fallback_md.trim().len() < 100 && raw_html.len() > 5000;
130            if fallback_too_short {
131                let text = plaintext::html_to_plaintext(&content_html);
132                if text.trim().is_empty() {
133                    let basic_cleaned =
134                        clean::clean_html(raw_html, false, include_tags, exclude_tags)
135                            .unwrap_or_else(|_| raw_html.to_string());
136                    Some(plaintext::html_to_plaintext(&basic_cleaned))
137                } else {
138                    Some(text)
139                }
140            } else {
141                Some(fallback_md)
142            }
143        } else {
144            Some(md)
145        }
146    } else {
147        None
148    };
149
150    let plain = if formats.contains(&OutputFormat::PlainText) {
151        Some(plaintext::html_to_plaintext(&content_html))
152    } else {
153        None
154    };
155
156    let raw = if formats.contains(&OutputFormat::RawHtml) {
157        Some(raw_html.to_string())
158    } else {
159        None
160    };
161
162    let html = if formats.contains(&OutputFormat::Html) {
163        Some(content_html)
164    } else {
165        None
166    };
167
168    let links = if formats.contains(&OutputFormat::Links) {
169        Some(readability::extract_links(raw_html, source_url))
170    } else {
171        None
172    };
173
174    // JSON extraction is handled asynchronously in scrape_url after extract() returns.
175    let json = None;
176
177    // Warn if filtering params are provided without a chunking strategy.
178    let orphan_chunk_warning =
179        if chunk_strategy.is_none() && (query.is_some() || filter_mode.is_some()) {
180            Some(
181                "'query' and 'filterMode' require 'chunkStrategy' to be set. \
182             These parameters were ignored."
183                    .to_string(),
184            )
185        } else {
186            None
187        };
188
189    // Step 6: Chunk the markdown if a strategy is provided.
190    let chunks = if let Some(strategy) = chunk_strategy
191        && let Some(ref markdown_text) = md
192        && !markdown_text.trim().is_empty()
193    {
194        let raw_chunks = chunking::chunk_text(markdown_text, strategy);
195
196        // Step 7: Filter chunks by relevance if query + filter_mode are set.
197        let chunk_results = if let (Some(q), Some(mode)) = (query, filter_mode)
198            && !q.trim().is_empty()
199            && !raw_chunks.is_empty()
200        {
201            filter::filter_chunks_scored(&raw_chunks, q, mode, top_k.unwrap_or(5))
202                .into_iter()
203                .map(|sc| ChunkResult {
204                    content: sc.content,
205                    score: Some(sc.score),
206                    index: sc.index,
207                })
208                .collect::<Vec<_>>()
209        } else {
210            let mut results: Vec<_> = raw_chunks
211                .into_iter()
212                .enumerate()
213                .map(|(i, c)| ChunkResult {
214                    content: c,
215                    score: None,
216                    index: i,
217                })
218                .collect();
219            if let Some(k) = top_k {
220                results.truncate(k);
221            }
222            results
223        };
224
225        if chunk_results.is_empty() {
226            None
227        } else {
228            Some(chunk_results)
229        }
230    } else {
231        None
232    };
233
234    Ok(ScrapeData {
235        markdown: md,
236        html,
237        raw_html: raw,
238        plain_text: plain,
239        links,
240        json,
241        chunks,
242        warning: orphan_chunk_warning,
243        metadata: PageMetadata {
244            title: meta.title,
245            description: meta.description,
246            og_title: meta.og_title,
247            og_description: meta.og_description,
248            og_image: meta.og_image,
249            canonical_url: meta.canonical_url,
250            source_url: source_url.to_string(),
251            language: meta.language,
252            status_code,
253            rendered_with,
254            elapsed_ms,
255        },
256    })
257}
258
259/// Apply CSS selector or XPath to narrow HTML content.
260/// Returns None if no selector is set or no match is found.
261fn apply_selector(html: &str, css: Option<&str>, xpath: Option<&str>) -> CrwResult<Option<String>> {
262    if let Some(sel) = css {
263        let result = selector::extract_by_css(html, sel).map_err(CrwError::ExtractionError)?;
264        if result.is_some() {
265            return Ok(result);
266        }
267    }
268    if let Some(xp) = xpath
269        && let Some(texts) =
270            selector::extract_by_xpath(html, xp).map_err(CrwError::ExtractionError)?
271    {
272        let wrapped = texts
273            .into_iter()
274            .map(|text| {
275                let escaped = text
276                    .replace('&', "&amp;")
277                    .replace('<', "&lt;")
278                    .replace('>', "&gt;");
279                format!("<div>{escaped}</div>")
280            })
281            .collect::<Vec<_>>()
282            .join("\n");
283        return Ok(Some(wrapped));
284    }
285    Ok(None)
286}
crw_extract/lib.rs

crw_extract/
lib.rs