pub mod antibot;
pub mod chunking;
pub mod clean;
pub mod dom_features;
pub mod dom_util;
pub mod filter;
pub mod markdown;
pub mod plaintext;
pub mod quality;
pub mod readability;
pub mod selector;
pub mod structured;
pub mod tables;
use crw_core::error::{CrwError, CrwResult};
use crw_core::types::{
CapturedNetworkResponse, ChunkResult, ChunkStrategy, DebugAttempt, DebugCandidate,
DebugExtraction, FilterMode, OutputFormat, PageMetadata, RenderDecision, ScrapeData,
};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
/// Accumulates [`DebugAttempt`] records so they can later be packaged into a
/// single [`DebugExtraction`].
#[derive(Debug, Default)]
pub struct DebugCollector {
    /// Attempts in the order they were pushed.
    attempts: Vec<DebugAttempt>,
}

impl DebugCollector {
    /// Creates a collector that holds no attempts yet.
    pub fn new() -> Self {
        Self {
            attempts: Vec::new(),
        }
    }

    /// Records one more attempt at the end of the list.
    pub fn push_attempt(&mut self, attempt: DebugAttempt) {
        self.attempts.push(attempt);
    }

    /// Consumes the collector and hands its attempts over as a [`DebugExtraction`].
    pub fn into_extraction(self) -> DebugExtraction {
        let Self { attempts } = self;
        DebugExtraction { attempts }
    }
}
/// Builds a [`DebugCandidate`] from its parts, deriving a short excerpt of `text`.
///
/// The excerpt is the longest prefix of `text` that is at most 200 bytes and
/// ends on a UTF-8 character boundary; it is `None` when `text` is `None`.
/// `kind`, `score` and `cap_chars` are stored unchanged.
pub fn debug_candidate(
    kind: impl Into<String>,
    text: Option<String>,
    score: f64,
    cap_chars: Option<usize>,
) -> DebugCandidate {
    const EXCERPT_MAX_BYTES: usize = 200;

    let text_excerpt = text.as_deref().map(|full| {
        let limit = EXCERPT_MAX_BYTES.min(full.len());
        // Index 0 is always a boundary, so the search cannot come up empty.
        let cut = (0..=limit)
            .rev()
            .find(|&i| full.is_char_boundary(i))
            .unwrap_or(0);
        full[..cut].to_owned()
    });

    DebugCandidate {
        kind: kind.into(),
        text,
        text_excerpt,
        cap_chars,
        score,
    }
}
pub mod answer;
pub mod llm;
pub mod pricing;
pub mod summary;
/// Inputs for [`extract`]: the fetched page, bookkeeping values that are copied
/// into the result, and the knobs that steer extraction.
pub struct ExtractOptions<'a> {
/// Page HTML as fetched; source for metadata, cleaning, links and the raw-HTML output.
pub raw_html: &'a str,
/// URL of the page; used for the domain-selector lookup, link extraction and result metadata.
pub source_url: &'a str,
/// Copied verbatim into the result metadata.
pub status_code: u16,
/// Copied verbatim into the result metadata.
pub rendered_with: Option<String>,
/// Copied verbatim into the result metadata.
pub elapsed_ms: u64,
/// Copied verbatim into the result.
pub render_decision: Option<RenderDecision>,
/// Copied verbatim into the result.
pub credit_cost: u32,
/// Copied verbatim into the result.
pub warnings: Vec<String>,
/// Output formats to produce; formats not listed are left as `None` in the result.
pub formats: &'a [OutputFormat],
/// Forwarded to HTML cleaning; when set and no selector matched, readability-based
/// main-content extraction is attempted.
pub only_main_content: bool,
/// Forwarded to `clean::clean_html`.
pub include_tags: &'a [String],
/// Forwarded to `clean::clean_html`.
pub exclude_tags: &'a [String],
/// User-supplied CSS selector; tried before `xpath`.
pub css_selector: Option<&'a str>,
/// User-supplied XPath; consulted when the CSS selector is absent or matches nothing.
pub xpath: Option<&'a str>,
/// When set, the final markdown is split into chunks with this strategy.
pub chunk_strategy: Option<&'a ChunkStrategy>,
/// Query used to score chunks; only honoured together with `filter_mode` and `chunk_strategy`.
pub query: Option<&'a str>,
/// Scoring mode for chunk filtering; only honoured together with `query` and `chunk_strategy`.
pub filter_mode: Option<&'a FilterMode>,
/// Maximum number of chunks to return; scored filtering falls back to 5 when unset.
pub top_k: Option<usize>,
/// Host → CSS selector map, consulted only when neither `css_selector` nor `xpath` is given.
pub domain_selectors: Option<&'a HashMap<String, String>>,
/// Captured network responses; their JSON bodies feed one of the markdown fallback candidates.
pub captured_responses: &'a [CapturedNetworkResponse],
/// Not read by `extract` (discarded on destructuring) — presumably consumed by the caller.
pub llm_fallback: Option<LlmFallbackParams<'a>>,
/// Not read by `extract` (discarded on destructuring) — presumably consumed by the caller.
pub debug: bool,
/// Not read by `extract` (discarded on destructuring) — presumably consumed by the caller.
pub debug_sink: Option<Arc<Mutex<DebugCollector>>>,
}
/// Settings for [`maybe_run_llm_fallback`]. Apart from `quality_threshold` and
/// `always_run`, every field is forwarded unchanged to `llm::extract_via_llm`.
#[derive(Debug, Clone)]
pub struct LlmFallbackParams<'a> {
/// Forwarded to `llm::extract_via_llm`.
pub api_key: &'a str,
/// Forwarded to `llm::extract_via_llm`.
pub model: &'a str,
/// Forwarded to `llm::extract_via_llm`.
pub provider: &'a str,
/// Forwarded to `llm::extract_via_llm`.
pub base_url: Option<&'a str>,
/// The LLM call is skipped when the existing markdown scores at or above this value
/// (unless `always_run` is set).
pub quality_threshold: f32,
/// Forwarded to `llm::extract_via_llm`.
pub max_html_bytes: usize,
/// Forwarded to `llm::extract_via_llm`.
pub max_tokens: u32,
/// Forwarded to `llm::extract_via_llm`.
pub azure_api_version: Option<&'a str>,
/// When true, the LLM is called regardless of the existing markdown's quality score.
pub always_run: bool,
}
/// Optionally replaces `data.markdown` with markdown produced by an LLM.
///
/// The existing markdown (treated as empty when absent or blank) is scored
/// first. Unless `params.always_run` is set, nothing happens when that score
/// already reaches `params.quality_threshold`. Otherwise the LLM is asked to
/// extract markdown from `raw_html`; its output is adopted only when it scores
/// strictly higher, in which case `"extracted_via=llm"` is appended to
/// `data.warnings`.
///
/// A failed LLM call is logged and otherwise ignored, so this function
/// currently always returns `Ok(())`.
pub async fn maybe_run_llm_fallback(
    data: &mut ScrapeData,
    raw_html: &str,
    params: &LlmFallbackParams<'_>,
) -> CrwResult<()> {
    let existing_md = data
        .markdown
        .as_deref()
        .filter(|md| !md.trim().is_empty())
        .unwrap_or("");
    let baseline = quality::analyze_md_only(existing_md);

    let good_enough = baseline.score >= params.quality_threshold;
    if good_enough && !params.always_run {
        return Ok(());
    }

    let llm_outcome = llm::extract_via_llm(
        raw_html,
        params.api_key,
        params.provider,
        params.model,
        params.base_url,
        params.max_tokens,
        params.max_html_bytes,
        params.azure_api_version,
    )
    .await;

    let llm_md = match llm_outcome {
        Ok(md) => md,
        Err(e) => {
            tracing::warn!(error = %e, "LLM fallback call failed; keeping DOM extraction");
            return Ok(());
        }
    };

    let llm_quality = quality::analyze_md_only(&llm_md);
    if llm_quality.score > baseline.score {
        tracing::info!(
            prior_score = baseline.score,
            llm_score = llm_quality.score,
            "LLM fallback produced higher-quality markdown"
        );
        data.markdown = Some(llm_md);
        data.warnings.push("extracted_via=llm".to_string());
    } else {
        tracing::debug!(
            prior_score = baseline.score,
            llm_score = llm_quality.score,
            "LLM fallback produced lower-quality markdown; keeping original"
        );
    }
    Ok(())
}
/// Returns the selector registered for the exact host of `source_url`.
///
/// Yields `None` when the map is empty, when the URL does not parse or has no
/// host, or when the host has no entry.
fn lookup_domain_selector(source_url: &str, map: &HashMap<String, String>) -> Option<String> {
    if map.is_empty() {
        return None;
    }
    let parsed = url::Url::parse(source_url).ok()?;
    let host = parsed.host_str()?;
    map.get(host).cloned()
}
#[cfg(test)]
mod private_tests {
    use super::*;
    use crw_core::types::CapturedNetworkResponse;

    /// Selector map holding the single entry `news.example.com` -> `.article`.
    fn news_selector_map() -> HashMap<String, String> {
        HashMap::from([("news.example.com".to_string(), ".article".to_string())])
    }

    #[test]
    fn domain_selector_matches_exact_host() {
        let selectors = news_selector_map();
        let found = lookup_domain_selector("https://news.example.com/p/42", &selectors);
        assert_eq!(found.as_deref(), Some(".article"));
    }

    #[test]
    fn domain_selector_misses_on_other_host() {
        let selectors = news_selector_map();
        let found = lookup_domain_selector("https://other.example.com/p/42", &selectors);
        assert!(found.is_none());
    }

    #[test]
    fn domain_selector_empty_map_returns_none() {
        let empty: HashMap<String, String> = HashMap::new();
        let found = lookup_domain_selector("https://x.example.com/", &empty);
        assert!(found.is_none());
    }

    #[test]
    fn xhr_extract_returns_none_for_empty_input() {
        let nothing: &[CapturedNetworkResponse] = &[];
        assert!(extract_xhr_text(nothing).is_none());
    }

    #[test]
    fn xhr_extract_collects_long_string_fields() {
        let payload = serde_json::json!({
            "title": "short",
            "body": "a".repeat(300),
            "meta": { "summary": "b".repeat(200) },
            "tags": ["c".repeat(150), "short"],
            "url": "https://example.com/should/skip",
        })
        .to_string();
        let captured = vec![CapturedNetworkResponse {
            url: "https://api.example.com/article/1".to_string(),
            request_id: "1".to_string(),
            status: 200,
            mime_type: Some("application/json".to_string()),
            body: Some(payload),
            body_size_bytes: 800,
        }];

        let text = extract_xhr_text(&captured).expect("expected long-text fields");

        // Long fields are kept wherever they sit in the JSON tree.
        for kept in ["a".repeat(300), "b".repeat(200), "c".repeat(150)] {
            assert!(text.contains(&kept));
        }
        // Short strings and URL-like values are dropped.
        for dropped in ["short", "example.com/should/skip"] {
            assert!(!text.contains(dropped));
        }
    }

    #[test]
    fn xhr_extract_skips_invalid_json() {
        let captured = vec![CapturedNetworkResponse {
            url: "x".into(),
            request_id: "1".into(),
            status: 200,
            mime_type: Some("application/json".into()),
            body: Some("not json".into()),
            body_size_bytes: 8,
        }];
        let text = extract_xhr_text(&captured);
        assert!(text.is_none());
    }
}
/// Decodes a small, fixed set of HTML character references in `s`.
///
/// Handled references: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;`,
/// `&nbsp;`, `&hellip;`, `&mdash;`, `&ndash;`, `&rsquo;`, `&lsquo;`, `&rdquo;`
/// and `&ldquo;`. Matching is case-sensitive and requires the trailing `;`.
/// Any other `&` is copied through unchanged.
///
/// The input is scanned once, left to right, and replacement text is never
/// rescanned, so `&amp;lt;` decodes to the literal text `&lt;`, not to `<`.
fn decode_basic_html_entities(s: &str) -> String {
    // (reference as written in HTML, replacement text)
    const ENTITIES: [(&str, &str); 14] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
        // Rendered as a plain space rather than U+00A0.
        ("&nbsp;", " "),
        ("&hellip;", "\u{2026}"),
        ("&mdash;", "\u{2014}"),
        ("&ndash;", "\u{2013}"),
        ("&rsquo;", "\u{2019}"),
        ("&lsquo;", "\u{2018}"),
        ("&rdquo;", "\u{201D}"),
        ("&ldquo;", "\u{201C}"),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        // Everything before the ampersand is plain text.
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        let hit = ENTITIES
            .iter()
            .copied()
            .find(|&(needle, _)| tail.starts_with(needle));
        match hit {
            Some((needle, value)) => {
                out.push_str(value);
                rest = &tail[needle.len()..];
            }
            None => {
                // Unknown reference or bare ampersand: keep it as-is.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
/// Rejoins comma/colon-separated inline items that ended up on separate lines.
///
/// The input is returned untouched unless it contains a non-breaking space,
/// `",\n\n"` or `":\n\n"`. Otherwise non-breaking spaces become plain spaces
/// and the three module-level regexes are applied in order: link after `,`/`:`,
/// link after `),`, then a letter-led item after `,`.
fn reflow_inline_lists(s: String) -> String {
    let needs_reflow = s.contains('\u{00a0}') || s.contains(",\n\n") || s.contains(":\n\n");
    if !needs_reflow {
        return s;
    }
    let spaced = s.replace('\u{00a0}', " ");
    let after_punct = INLINE_LINK_AFTER_PUNCT.replace_all(&spaced, "$p [");
    let after_close = INLINE_LINK_AFTER_CLOSE.replace_all(&after_punct, "), [");
    TRAILING_LIST_ITEM
        .replace_all(&after_close, ", $w")
        .into_owned()
}
/// Matches a `,` or `:` (captured as `p`), optional spaces/tabs, a newline, any
/// further whitespace and then an opening `[`. Compiled lazily on first use.
static INLINE_LINK_AFTER_PUNCT: once_cell::sync::Lazy<regex::Regex> =
once_cell::sync::Lazy::new(|| {
regex::Regex::new(r"(?P<p>[,:])[ \t]*\n[\s]*\[").expect("inline-link regex compiles")
});
/// Matches `),` followed by optional spaces/tabs, a newline, any further
/// whitespace and then an opening `[`. Compiled lazily on first use.
static INLINE_LINK_AFTER_CLOSE: once_cell::sync::Lazy<regex::Regex> =
once_cell::sync::Lazy::new(|| {
regex::Regex::new(r"\),[ \t]*\n[\s]*\[").expect("inline-link close regex compiles")
});
/// Matches a `,`, optional spaces/tabs, a newline followed by at least one more
/// newline, and then one character (captured as `w`) that is an ASCII letter or
/// lies in U+00C0..=U+FFFF. Compiled lazily on first use.
static TRAILING_LIST_ITEM: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
regex::Regex::new(r",[ \t]*\n\n+(?P<w>[A-Za-z\u{00C0}-\u{FFFF}])")
.expect("trailing list-item regex compiles")
});
/// Runs the synchronous extraction pipeline for one page and assembles a [`ScrapeData`].
///
/// Stages, in order:
/// 1. Resolve the effective selector (user CSS/XPath, else a per-host CSS selector).
/// 2. Read page metadata, clean the HTML, apply the selector, and — when
///    `only_main_content` is set and no selector matched — try readability.
/// 3. Produce markdown if the `Markdown`, `Json` or `Summary` format is requested,
///    falling back to alternative candidates when the primary result scores low.
/// 4. Post-process the markdown (title heading, meta description, inline-list reflow).
/// 5. Produce the other requested formats and, optionally, chunks.
///
/// `llm_fallback`, `debug` and `debug_sink` are ignored here; `json`, `summary`,
/// `llm_usage` and `debug_extraction` are always left empty in the result.
///
/// # Errors
/// Propagates the error returned by `apply_selector`; no other step is fallible.
pub fn extract(opts: ExtractOptions<'_>) -> CrwResult<ScrapeData> {
let ExtractOptions {
raw_html,
source_url,
status_code,
rendered_with,
elapsed_ms,
render_decision,
credit_cost,
warnings,
formats,
only_main_content,
include_tags,
exclude_tags,
css_selector,
xpath,
chunk_strategy,
query,
filter_mode,
top_k,
domain_selectors,
captured_responses,
llm_fallback: _,
debug: _,
debug_sink: _,
} = opts;
// A per-host selector is only looked up when the caller supplied no selector of their own.
let user_selected = css_selector.is_some() || xpath.is_some();
let domain_selector_owned: Option<String> =
if !user_selected && let Some(map) = domain_selectors {
lookup_domain_selector(source_url, map)
} else {
None
};
let css_selector = css_selector.or(domain_selector_owned.as_deref());
let meta = readability::extract_metadata(raw_html);
// If cleaning fails, continue with the raw HTML rather than aborting.
let cleaned = clean::clean_html(raw_html, only_main_content, include_tags, exclude_tags)
.unwrap_or_else(|_| raw_html.to_string());
let selected_html = apply_selector(&cleaned, css_selector, xpath)?;
let after_selection = selected_html.as_deref().unwrap_or(&cleaned);
// `cleaned_ref` keeps the cleaned document around as a fallback markdown source; it is
// only populated on the readability path.
let (content_html, cleaned_ref) = if only_main_content && selected_html.is_none() {
match readability::extract_main_content_with_provenance(after_selection) {
readability::ReadabilityOutcome::Selected { html: main, .. } => {
// Clean the readability output once more; keep it as-is if that cleaning fails.
let re_cleaned = clean::clean_html(&main, true, &[], &[]).unwrap_or(main);
(re_cleaned, Some(cleaned))
}
readability::ReadabilityOutcome::Rejected { .. } => {
(cleaned.clone(), Some(cleaned))
}
}
} else {
(after_selection.to_string(), None)
};
let md = if formats.contains(&OutputFormat::Markdown)
|| formats.contains(&OutputFormat::Json)
|| formats.contains(&OutputFormat::Summary)
{
let primary_md = markdown::html_to_markdown(&content_html);
let primary_quality = quality::analyze_md_only(&primary_md);
// Selector output is always trusted; otherwise the primary markdown must score above 0.4.
if selected_html.is_some() || primary_quality.score > 0.4 {
Some(primary_md)
} else {
// Low-quality primary result: gather alternative candidates and pick by score.
let mut candidates: Vec<(&'static str, String, quality::Quality)> = Vec::new();
if only_main_content && let Some(c) = cleaned_ref.as_ref() {
let m = markdown::html_to_markdown(c);
let q = quality::analyze_md_only(&m);
candidates.push(("cleaned", m, q));
}
// Re-clean the raw HTML with `only_main_content` disabled.
let basic_cleaned = clean::clean_html(raw_html, false, include_tags, exclude_tags)
.unwrap_or_else(|_| raw_html.to_string());
let basic_md = markdown::html_to_markdown(&basic_cleaned);
let basic_q = quality::analyze_md_only(&basic_md);
candidates.push(("basic_clean", basic_md, basic_q));
if let Some(structural) = extract_tables_and_lists(raw_html) {
let q = quality::analyze_md_only(&structural);
candidates.push(("structural", structural, q));
}
if let Some(xhr_md) = extract_xhr_text(captured_responses) {
let q = quality::analyze_md_only(&xhr_md);
candidates.push(("xhr_json", xhr_md, q));
}
// Plain text of the content HTML, or of the basic-cleaned HTML when that is blank.
let plain_md = {
let text = plaintext::html_to_plaintext(&content_html);
if text.trim().is_empty() {
plaintext::html_to_plaintext(&basic_cleaned)
} else {
text
}
};
let plain_q = quality::analyze_md_only(&plain_md);
candidates.push(("plaintext", plain_md, plain_q));
// The primary result goes to index 0 so it is the default choice below.
candidates.insert(0, ("primary", primary_md, primary_quality));
// A fallback must beat the primary score by at least this margin to replace it.
const PRIMARY_MARGIN: f32 = 0.15;
let primary_score = candidates[0].2.score;
// Among qualifying fallbacks pick the highest score, breaking ties by larger `bytes`.
let chosen_idx = candidates
.iter()
.enumerate()
.skip(1)
.filter(|(_, c)| c.2.score >= primary_score + PRIMARY_MARGIN)
.max_by(|(_, a), (_, b)| {
a.2.score
.partial_cmp(&b.2.score)
.unwrap_or(std::cmp::Ordering::Equal)
.then(a.2.bytes.cmp(&b.2.bytes))
})
.map(|(i, _)| i)
.unwrap_or(0);
let names: Vec<&'static str> = candidates.iter().map(|c| c.0).collect();
let scores: Vec<f32> = candidates.iter().map(|c| c.2.score).collect();
let chosen_name = candidates[chosen_idx].0;
tracing::debug!(
strategies = ?names,
scores = ?scores,
chosen = %chosen_name,
"quality-selected markdown extraction"
);
Some(candidates.swap_remove(chosen_idx).1)
}
} else {
None
};
// Prepend a `# title` heading when the markdown does not already contain the title.
// Skipped when the caller supplied their own selector.
let md = md.map(|m| {
if user_selected {
return m;
}
let title = meta
.og_title
.as_deref()
.or(meta.title.as_deref())
.map(str::trim)
.filter(|t| !t.is_empty());
let Some(title) = title else { return m };
// Reduce the title to its core: text before the first `|`, then drop the part after the
// last " – ", " — " and " - " separator in turn.
let core = title
.split('|')
.next()
.map(str::trim)
.filter(|s| !s.is_empty())
.unwrap_or(title);
let core = core
.rsplit_once(" – ")
.map(|(l, _)| l.trim())
.filter(|s| !s.is_empty())
.unwrap_or(core);
let core = core
.rsplit_once(" — ")
.map(|(l, _)| l.trim())
.filter(|s| !s.is_empty())
.unwrap_or(core);
let core = core
.rsplit_once(" - ")
.map(|(l, _)| l.trim())
.unwrap_or(core);
if m.contains(core) || m.contains(title) {
return m;
}
format!("# {core}\n\n{m}")
});
// Append the page's meta description(s) to short markdown (under 1500 bytes).
// Skipped when the caller supplied their own selector.
let md = md.map(|m| {
if user_selected {
return m;
}
if m.len() >= 1500 {
return m;
}
let name_desc = meta
.description
.as_deref()
.map(str::trim)
.filter(|d| !d.is_empty());
let og_desc = meta
.og_description
.as_deref()
.map(str::trim)
.filter(|d| !d.is_empty());
let combined = match (name_desc, og_desc) {
(Some(a), Some(b)) if a == b => decode_basic_html_entities(a),
(Some(a), Some(b)) => {
let (longer, shorter) = if a.len() >= b.len() { (a, b) } else { (b, a) };
let l = decode_basic_html_entities(longer);
let s = decode_basic_html_entities(shorter);
// If the longer description already contains the first 60 chars of the shorter one,
// keep only the longer; otherwise keep both.
let probe_len = s.chars().take(60).map(char::len_utf8).sum::<usize>();
let probe = &s[..probe_len.min(s.len())];
if l.contains(probe) {
l
} else {
format!("{l}\n\n{s}")
}
}
(Some(a), None) | (None, Some(a)) => decode_basic_html_entities(a),
(None, None) => return m,
};
let trimmed = combined.trim();
// Descriptions shorter than 80 characters are not appended.
if trimmed.chars().count() < 80 {
return m;
}
let title_lc = meta
.og_title
.as_deref()
.or(meta.title.as_deref())
.map(|t| t.trim().to_lowercase())
.unwrap_or_default();
// A description that merely repeats the title adds nothing.
if !title_lc.is_empty() && trimmed.to_lowercase() == title_lc {
return m;
}
// Skip when the markdown already contains the first 120 characters of the description.
let probe_len = trimmed.chars().take(120).map(char::len_utf8).sum::<usize>();
let probe = &trimmed[..probe_len.min(trimmed.len())];
if m.contains(probe) {
return m;
}
format!("{m}\n\n{trimmed}\n")
});
let md = md.map(reflow_inline_lists);
// The remaining formats are produced only on request. `plain` must be computed before
// `content_html` is moved into `html`.
let plain = if formats.contains(&OutputFormat::PlainText) {
Some(plaintext::html_to_plaintext(&content_html))
} else {
None
};
let raw = if formats.contains(&OutputFormat::RawHtml) {
Some(raw_html.to_string())
} else {
None
};
let html = if formats.contains(&OutputFormat::Html) {
Some(content_html)
} else {
None
};
let links = if formats.contains(&OutputFormat::Links) {
Some(readability::extract_links(raw_html, source_url))
} else {
None
};
// JSON output is not produced by this function.
let json = None;
// `query` / `filter_mode` have no effect without a chunk strategy; tell the caller.
let orphan_chunk_warning =
if chunk_strategy.is_none() && (query.is_some() || filter_mode.is_some()) {
Some(
"'query' and 'filterMode' require 'chunkStrategy' to be set. \
These parameters were ignored."
.to_string(),
)
} else {
None
};
// Chunking needs a strategy and non-blank markdown.
let chunks = if let Some(strategy) = chunk_strategy
&& let Some(ref markdown_text) = md
&& !markdown_text.trim().is_empty()
{
let raw_chunks = chunking::chunk_text(markdown_text, strategy);
// Scored filtering requires both a non-blank query and a filter mode.
let chunk_results = if let (Some(q), Some(mode)) = (query, filter_mode)
&& !q.trim().is_empty()
&& !raw_chunks.is_empty()
{
filter::filter_chunks_scored(&raw_chunks, q, mode, top_k.unwrap_or(5))
.into_iter()
.map(|sc| ChunkResult {
content: sc.content,
score: Some(sc.score),
index: sc.index,
})
.collect::<Vec<_>>()
} else {
// Unscored: keep document order and optionally cap at `top_k`.
let mut results: Vec<_> = raw_chunks
.into_iter()
.enumerate()
.map(|(i, c)| ChunkResult {
content: c,
score: None,
index: i,
})
.collect();
if let Some(k) = top_k {
results.truncate(k);
}
results
};
// An empty chunk list is reported as `None`.
if chunk_results.is_empty() {
None
} else {
Some(chunk_results)
}
} else {
None
};
Ok(ScrapeData {
markdown: md,
html,
raw_html: raw,
plain_text: plain,
links,
json,
summary: None,
llm_usage: None,
chunks,
warning: orphan_chunk_warning,
warnings,
render_decision,
credit_cost,
metadata: PageMetadata {
title: meta.title,
description: meta.description,
og_title: meta.og_title,
og_description: meta.og_description,
og_image: meta.og_image,
canonical_url: meta.canonical_url,
source_url: source_url.to_string(),
language: meta.language,
status_code,
rendered_with,
elapsed_ms,
},
debug_extraction: None,
})
}
/// Narrows `html` to the part chosen by a CSS selector or an XPath expression.
///
/// The CSS selector is tried first; if it is absent or matches nothing, the
/// XPath expression is evaluated. XPath results are text values, so each one is
/// HTML-escaped and wrapped in its own `<div>`, joined with newlines.
///
/// Returns `Ok(None)` when neither selector produced anything.
///
/// # Errors
/// Selector failures are wrapped in `CrwError::ExtractionError`.
fn apply_selector(html: &str, css: Option<&str>, xpath: Option<&str>) -> CrwResult<Option<String>> {
    if let Some(sel) = css {
        let result = selector::extract_by_css(html, sel).map_err(CrwError::ExtractionError)?;
        if result.is_some() {
            return Ok(result);
        }
    }
    if let Some(xp) = xpath
        && let Some(texts) =
            selector::extract_by_xpath(html, xp).map_err(CrwError::ExtractionError)?
    {
        let wrapped = texts
            .into_iter()
            .map(|text| {
                // Escape `&` first so the ampersands introduced by the other two
                // replacements are not escaped a second time.
                let escaped = text
                    .replace('&', "&amp;")
                    .replace('<', "&lt;")
                    .replace('>', "&gt;");
                format!("<div>{escaped}</div>")
            })
            .collect::<Vec<_>>()
            .join("\n");
        return Ok(Some(wrapped));
    }
    Ok(None)
}
/// Collects long text fields from the JSON bodies of captured network responses.
///
/// Bodies that are missing, empty or not valid JSON are skipped. Every string
/// reported by `walk_json_strings` that is at least 120 bytes long is kept
/// once, in first-seen order, and the kept strings are joined with blank lines.
/// Returns `None` when the joined text is shorter than 400 bytes (which
/// includes the case where nothing was kept).
fn extract_xhr_text(captured: &[CapturedNetworkResponse]) -> Option<String> {
    const MIN_FIELD_LEN: usize = 120;
    const MIN_TOTAL_LEN: usize = 400;

    let mut fields: Vec<String> = Vec::new();
    let mut already_seen = std::collections::HashSet::<String>::new();

    let documents = captured
        .iter()
        .filter_map(|resp| resp.body.as_deref())
        .filter(|body| !body.is_empty())
        .filter_map(|body| serde_json::from_str::<serde_json::Value>(body).ok());

    for document in documents {
        walk_json_strings(&document, &mut |text: &str| {
            if text.len() < MIN_FIELD_LEN {
                return;
            }
            if already_seen.insert(text.to_string()) {
                fields.push(text.to_string());
            }
        });
    }

    let joined = fields.join("\n\n");
    (joined.len() >= MIN_TOTAL_LEN).then_some(joined)
}
/// Visits every string in a JSON value depth-first and passes it, trimmed, to
/// `on_string`.
///
/// Strings whose trimmed form starts with `http://`, `https://`, `/` or `<`
/// are not reported. Numbers, booleans and nulls are ignored.
fn walk_json_strings(value: &serde_json::Value, on_string: &mut dyn FnMut(&str)) {
    const SKIPPED_PREFIXES: [&str; 4] = ["http://", "https://", "/", "<"];

    match value {
        serde_json::Value::String(raw) => {
            let text = raw.trim();
            let skip = SKIPPED_PREFIXES
                .iter()
                .any(|prefix| text.starts_with(prefix));
            if !skip {
                on_string(text);
            }
        }
        serde_json::Value::Array(items) => {
            items
                .iter()
                .for_each(|item| walk_json_strings(item, on_string));
        }
        serde_json::Value::Object(fields) => {
            for child in fields.values() {
                walk_json_strings(child, on_string);
            }
        }
        _ => {}
    }
}
/// Converts the tables and long lists of a document to markdown.
///
/// Tables need at least two `tr` rows; `ul`/`ol` lists need at least five `li`
/// items and must not sit inside a `nav`, `footer` or `header` element. A
/// fragment is kept only when its trimmed markdown is at least 40 bytes long.
/// Tables come first, then lists, each in document order, joined with blank
/// lines. Returns `None` when nothing qualifies.
fn extract_tables_and_lists(html: &str) -> Option<String> {
    use scraper::{Html, Selector};

    let document = Html::parse_document(html);
    let tables = Selector::parse("table").ok()?;
    let lists = Selector::parse("ul, ol").ok()?;
    let rows = Selector::parse("tr").ok()?;
    let items = Selector::parse("li").ok()?;

    // Render a fragment and drop it when the result is too short to be useful.
    let to_markdown = |fragment: String| -> Option<String> {
        let rendered = markdown::html_to_markdown(&fragment);
        (rendered.trim().len() >= 40).then_some(rendered)
    };

    let table_sections = document
        .select(&tables)
        .filter(|table| table.select(&rows).count() >= 2)
        .filter_map(|table| to_markdown(table.html()));

    let list_sections = document
        .select(&lists)
        .filter(|list| list.select(&items).count() >= 5)
        .filter(|list| {
            let inside_chrome = list
                .ancestors()
                .filter_map(scraper::ElementRef::wrap)
                .any(|el| matches!(el.value().name(), "nav" | "footer" | "header"));
            !inside_chrome
        })
        .filter_map(|list| to_markdown(list.html()));

    let sections: Vec<String> = table_sections.chain(list_sections).collect();
    if sections.is_empty() {
        None
    } else {
        Some(sections.join("\n\n"))
    }
}
#[cfg(test)]
mod table_list_fallback_tests {
    use super::*;

    #[test]
    fn extracts_two_row_table() {
        let page = "<html><body><nav>x</nav><table>\
            <tr><th>Name</th><th>Value</th></tr>\
            <tr><td>Alpha</td><td>1</td></tr>\
            <tr><td>Bravo</td><td>2</td></tr>\
            </table></body></html>";
        let rendered = extract_tables_and_lists(page).expect("table should extract");
        for cell in ["Alpha", "Bravo"] {
            assert!(rendered.contains(cell));
        }
    }

    #[test]
    fn skips_short_table() {
        // A single row is below the two-row minimum.
        let page = "<table><tr><td>only</td></tr></table>";
        let rendered = extract_tables_and_lists(page);
        assert!(rendered.is_none());
    }

    #[test]
    fn skips_nav_list() {
        // Long enough, but nested in <nav>.
        let page = "<nav><ul>\
            <li>a</li><li>b</li><li>c</li><li>d</li><li>e</li><li>f</li>\
            </ul></nav>";
        let rendered = extract_tables_and_lists(page);
        assert!(rendered.is_none());
    }

    #[test]
    fn extracts_long_list() {
        let page = "<main><ul>\
            <li>Job A</li><li>Job B</li><li>Job C</li>\
            <li>Job D</li><li>Job E</li><li>Job F</li>\
            </ul></main>";
        let rendered = extract_tables_and_lists(page).expect("list should extract");
        for item in ["Job A", "Job F"] {
            assert!(rendered.contains(item));
        }
    }
}