crw_extract/
lib.rs

1//! HTML content extraction and format conversion for the CRW web scraper.
2//!
3//! Converts raw HTML into clean, structured output formats:
4//!
5//! - **Markdown** — via [`markdown::html_to_markdown`] (htmd)
6//! - **Plain text** — via [`plaintext::html_to_plaintext`]
7//! - **Cleaned HTML** — boilerplate removal with [`clean::clean_html`]
8//! - **Readability** — main-content extraction with text-density scoring
9//! - **CSS/XPath selector** — narrow content to a specific element
10//! - **Chunking** — split content into sentence/topic/regex chunks
11//! - **Filtering** — BM25 or cosine-similarity ranking of chunks
12//! - **Structured JSON** — LLM-based extraction with JSON Schema validation
13
14pub mod antibot;
15pub mod chunking;
16pub mod clean;
17pub mod dom_features;
18pub mod dom_util;
19pub mod filter;
20pub mod judge;
21pub mod markdown;
22pub mod pdf;
23pub mod plaintext;
24pub mod quality;
25pub mod readability;
26pub mod selector;
27pub mod structured;
28pub mod tables;
29
30use crw_core::error::{CrwError, CrwResult};
31use crw_core::types::{
32    CapturedNetworkResponse, ChunkResult, ChunkStrategy, DebugAttempt, DebugCandidate,
33    DebugExtraction, FilterMode, OutputFormat, PageMetadata, RenderDecision, ScrapeData,
34};
35use std::collections::HashMap;
36use std::sync::{Arc, Mutex};
37
38/// Per-request collector for extraction debug traces. Wired in through
39/// [`ExtractOptions::debug_sink`]; the extractor pushes one
40/// [`DebugAttempt`] per `extract()` invocation, capturing the candidate
41/// ladder and the chosen output. Wrapped in an `Arc<Mutex<_>>` so the
42/// renderer / multi-attempt loop in `crw-crawl` can share a single sink
43/// across the JS-escalation retry.
44#[derive(Debug, Default)]
45pub struct DebugCollector {
46    attempts: Vec<DebugAttempt>,
47}
48
49impl DebugCollector {
50    pub fn new() -> Self {
51        Self::default()
52    }
53
54    pub fn push_attempt(&mut self, attempt: DebugAttempt) {
55        self.attempts.push(attempt);
56    }
57
58    pub fn into_extraction(self) -> DebugExtraction {
59        DebugExtraction {
60            attempts: self.attempts,
61        }
62    }
63}
64
65/// Convenience: lift a single candidate description into a
66/// [`DebugCandidate`].
67pub fn debug_candidate(
68    kind: impl Into<String>,
69    text: Option<String>,
70    score: f64,
71    cap_chars: Option<usize>,
72) -> DebugCandidate {
73    let text_excerpt = text.as_ref().map(|s| {
74        let mut idx = 200.min(s.len());
75        while idx > 0 && !s.is_char_boundary(idx) {
76            idx -= 1;
77        }
78        s[..idx].to_string()
79    });
80    DebugCandidate {
81        kind: kind.into(),
82        text,
83        text_excerpt,
84        cap_chars,
85        score,
86    }
87}
88
89pub mod answer;
90pub mod llm;
91pub mod pricing;
92pub mod summary;
93
/// Options for the high-level extraction pipeline.
///
/// Taken by value by [`extract`]; all borrowed fields share the lifetime `'a`.
pub struct ExtractOptions<'a> {
    /// Raw HTML of the page; the input to metadata extraction and cleaning.
    pub raw_html: &'a str,
    /// URL of the page. Used for the per-host selector lookup and for link
    /// extraction, and copied into the result metadata.
    pub source_url: &'a str,
    /// Status code of the fetch; copied into the result metadata.
    pub status_code: u16,
    /// Copied into the result metadata's `rendered_with` field (presumably
    /// names the renderer that produced `raw_html` — confirm with callers).
    pub rendered_with: Option<String>,
    /// Elapsed time in milliseconds; copied into the result metadata.
    pub elapsed_ms: u64,
    /// Routing decision metadata to surface to API consumers.
    pub render_decision: Option<RenderDecision>,
    /// Credit cost attributed to this fetch.
    pub credit_cost: u32,
    /// Soft-failure warnings collected through the render chain.
    pub warnings: Vec<String>,
    /// Output formats to produce. A representation is only computed when its
    /// format is listed; markdown is additionally computed for `Json` and
    /// `Summary`.
    pub formats: &'a [OutputFormat],
    /// When true, cleaning runs in main-content mode and, if no selector
    /// narrowed the document, readability narrowing is attempted.
    pub only_main_content: bool,
    /// Forwarded to [`clean::clean_html`] as its include list.
    pub include_tags: &'a [String],
    /// Forwarded to [`clean::clean_html`] as its exclude list.
    pub exclude_tags: &'a [String],
    /// CSS selector to narrow content before readability extraction.
    pub css_selector: Option<&'a str>,
    /// XPath expression to narrow content before readability extraction.
    pub xpath: Option<&'a str>,
    /// Strategy for chunking the extracted markdown.
    pub chunk_strategy: Option<&'a ChunkStrategy>,
    /// Query for chunk filtering (requires filter_mode).
    pub query: Option<&'a str>,
    /// Filtering algorithm for chunk ranking.
    pub filter_mode: Option<&'a FilterMode>,
    /// Number of top chunks to return (default: 5).
    pub top_k: Option<usize>,
    /// Per-host CSS selector overrides. Used only when the request did not
    /// supply an explicit `css_selector` / `xpath`. The selector for the
    /// source URL's host is applied before readability narrowing.
    pub domain_selectors: Option<&'a HashMap<String, String>>,
    /// XHR/fetch responses captured during navigation. Used as a fallback
    /// content source when DOM-based extraction is low quality.
    pub captured_responses: &'a [CapturedNetworkResponse],
    /// LLM-assisted extraction fallback configuration. When the chosen
    /// candidate's quality score is below `quality_threshold` and `enable`
    /// is true, the raw HTML (truncated to `max_html_bytes`) is sent to the
    /// configured LLM provider for re-extraction.
    ///
    /// NOTE(review): `extract()` itself discards this field; the LLM call
    /// lives in the async [`maybe_run_llm_fallback`]. `enable` is not a field
    /// of [`LlmFallbackParams`] — presumably it refers to the config type;
    /// confirm.
    pub llm_fallback: Option<LlmFallbackParams<'a>>,
    /// Opt-in extraction debug trace. When true, the extractor populates
    /// `debug_sink` with one [`DebugAttempt`] per `extract()` invocation.
    ///
    /// NOTE(review): the destructuring at the top of `extract()` binds this
    /// field to `_` — confirm where the flag is actually consumed.
    pub debug: bool,
    /// Sink for debug attempts. Shared across the multi-attempt
    /// JS-escalation loop so that all attempts land in one trace.
    ///
    /// NOTE(review): also bound to `_` in `extract()`'s destructuring —
    /// confirm which code pushes into the sink.
    pub debug_sink: Option<Arc<Mutex<DebugCollector>>>,
}
142
/// Parameters for the LLM-assisted extraction fallback. See
/// [`LlmFallbackConfig`](crw_core::config::LlmFallbackConfig).
///
/// Read by [`maybe_run_llm_fallback`], which forwards everything except
/// `quality_threshold` and `always_run` to `llm::extract_via_llm`.
#[derive(Debug, Clone)]
pub struct LlmFallbackParams<'a> {
    /// API key forwarded to the LLM call.
    pub api_key: &'a str,
    /// Model identifier forwarded to the LLM call.
    pub model: &'a str,
    /// Provider identifier forwarded to the LLM call.
    pub provider: &'a str,
    /// Optional base URL forwarded to the LLM call.
    pub base_url: Option<&'a str>,
    /// Quality score at or above which the existing markdown is kept without
    /// calling the LLM (unless `always_run` is set).
    pub quality_threshold: f32,
    /// Byte budget for the HTML handed to the LLM call; forwarded as-is.
    pub max_html_bytes: usize,
    /// Token limit forwarded to the LLM call.
    pub max_tokens: u32,
    /// Optional Azure API version forwarded to the LLM call.
    pub azure_api_version: Option<&'a str>,
    /// When true, run the LLM regardless of DOM-extraction quality
    /// ("primary extractor" mode); when false, only run as a fallback for
    /// candidates scoring below `quality_threshold`.
    pub always_run: bool,
}
160
161/// Re-extract via the configured LLM provider when the current markdown
162/// scores below `params.quality_threshold`. If the LLM result has a higher
163/// quality score, it replaces `data.markdown` in place and a warning is
164/// appended noting the swap. On any failure (network, auth, parse) the
165/// original markdown is preserved and the error is logged.
166pub async fn maybe_run_llm_fallback(
167    data: &mut ScrapeData,
168    raw_html: &str,
169    params: &LlmFallbackParams<'_>,
170) -> CrwResult<()> {
171    let current_md = match data.markdown.as_deref() {
172        Some(m) if !m.trim().is_empty() => m,
173        _ => "",
174    };
175    let current_quality = quality::analyze_md_only(current_md);
176    if !params.always_run && current_quality.score >= params.quality_threshold {
177        return Ok(());
178    }
179    match llm::extract_via_llm(
180        raw_html,
181        params.api_key,
182        params.provider,
183        params.model,
184        params.base_url,
185        params.max_tokens,
186        params.max_html_bytes,
187        params.azure_api_version,
188    )
189    .await
190    {
191        Ok(llm_md) => {
192            let llm_quality = quality::analyze_md_only(&llm_md);
193            if llm_quality.score > current_quality.score {
194                tracing::info!(
195                    prior_score = current_quality.score,
196                    llm_score = llm_quality.score,
197                    "LLM fallback produced higher-quality markdown"
198                );
199                data.markdown = Some(llm_md);
200                data.warnings.push("extracted_via=llm".to_string());
201            } else {
202                tracing::debug!(
203                    prior_score = current_quality.score,
204                    llm_score = llm_quality.score,
205                    "LLM fallback produced lower-quality markdown; keeping original"
206                );
207            }
208        }
209        Err(e) => {
210            tracing::warn!(error = %e, "LLM fallback call failed; keeping DOM extraction");
211        }
212    }
213    Ok(())
214}
215
216/// Look up the host-specific CSS selector override for a URL.
217fn lookup_domain_selector(source_url: &str, map: &HashMap<String, String>) -> Option<String> {
218    if map.is_empty() {
219        return None;
220    }
221    let host = url::Url::parse(source_url)
222        .ok()
223        .and_then(|u| u.host_str().map(|s| s.to_string()))?;
224    map.get(&host).cloned()
225}
226
#[cfg(test)]
mod private_tests {
    use super::*;
    use crw_core::types::CapturedNetworkResponse;

    /// Selector map holding a single override: `news.example.com` → `.article`.
    fn news_selector_map() -> HashMap<String, String> {
        HashMap::from([("news.example.com".to_string(), ".article".to_string())])
    }

    #[test]
    fn domain_selector_matches_exact_host() {
        let selectors = news_selector_map();
        let hit = lookup_domain_selector("https://news.example.com/p/42", &selectors);
        assert_eq!(hit.as_deref(), Some(".article"));
    }

    #[test]
    fn domain_selector_misses_on_other_host() {
        let selectors = news_selector_map();
        let miss = lookup_domain_selector("https://other.example.com/p/42", &selectors);
        assert!(miss.is_none());
    }

    #[test]
    fn domain_selector_empty_map_returns_none() {
        let empty: HashMap<String, String> = HashMap::new();
        assert!(lookup_domain_selector("https://x.example.com/", &empty).is_none());
    }

    #[test]
    fn xhr_extract_returns_none_for_empty_input() {
        assert!(extract_xhr_text(&[]).is_none());
    }

    #[test]
    fn xhr_extract_collects_long_string_fields() {
        // Three long strings at different nesting depths, plus short strings
        // and a URL that the extractor is expected to leave out.
        let long_body = "a".repeat(300);
        let long_summary = "b".repeat(200);
        let long_tag = "c".repeat(150);
        let payload = serde_json::json!({
            "title": "short",
            "body": long_body,
            "meta": { "summary": long_summary },
            "tags": [long_tag, "short"],
            "url": "https://example.com/should/skip",
        })
        .to_string();

        let captured = vec![CapturedNetworkResponse {
            url: "https://api.example.com/article/1".to_string(),
            request_id: "1".to_string(),
            status: 200,
            mime_type: Some("application/json".to_string()),
            body: Some(payload),
            body_size_bytes: 800,
        }];

        let text = extract_xhr_text(&captured).expect("expected long-text fields");
        for expected in [&long_body, &long_summary, &long_tag] {
            assert!(text.contains(expected.as_str()));
        }
        assert!(!text.contains("short"));
        assert!(!text.contains("example.com/should/skip"));
    }

    #[test]
    fn xhr_extract_skips_invalid_json() {
        let captured = vec![CapturedNetworkResponse {
            url: "x".into(),
            request_id: "1".into(),
            status: 200,
            mime_type: Some("application/json".into()),
            body: Some("not json".into()),
            body_size_bytes: 8,
        }];
        assert!(extract_xhr_text(&captured).is_none());
    }
}
298
/// Decode the HTML character references that commonly appear in `<meta>`
/// `content` attributes.
///
/// We don't pull in a full entity decoder because the metadata path only sees
/// author-curated short text, and the long tail of named `&name;` references
/// is empty in practice. Two kinds of reference are recognised:
///
/// - a fixed table of named entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
///   `&apos;`, `&#39;`, `&nbsp;`, `&hellip;`, the en/em dashes and the curly
///   quotes); `&nbsp;` becomes a plain space;
/// - numeric references, decimal (`&#8217;`) or hexadecimal (`&#x2019;`).
///   U+00A0 becomes a plain space, mirroring `&nbsp;`.
///
/// Everything else is copied through unchanged: unknown named entities, bare
/// `&`, unterminated or malformed numeric references, and numeric references
/// to NUL, surrogates, out-of-range code points or control characters other
/// than tab / LF / CR. Decoding is a single pass, so `&amp;lt;` yields the
/// text `&lt;`, not `<`.
fn decode_basic_html_entities(s: &str) -> String {
    const NAMED: [(&str, &str); 14] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&apos;", "'"),
        ("&#39;", "'"),
        ("&nbsp;", " "),
        ("&hellip;", "…"),
        ("&mdash;", "—"),
        ("&ndash;", "–"),
        ("&rsquo;", "\u{2019}"),
        ("&lsquo;", "\u{2018}"),
        ("&rdquo;", "\u{201D}"),
        ("&ldquo;", "\u{201C}"),
    ];

    /// Parses a numeric character reference at the start of `tail` (which
    /// begins with `&`). Returns the decoded char and the number of bytes the
    /// reference occupies, or `None` if `tail` does not start with an
    /// acceptable reference.
    fn numeric_reference(tail: &str) -> Option<(char, usize)> {
        // Longest meaningful body is 7 bytes ("1114111" / "x10FFFF").
        // Bounding the search for ';' keeps the whole decode linear even on
        // text full of stray "&#".
        const MAX_BODY: usize = 7;

        let body = tail.strip_prefix("&#")?;
        let end = body
            .bytes()
            .take(MAX_BODY + 1)
            .position(|b| b == b';')?;
        // `end` indexes an ASCII byte, so it is a valid char boundary.
        let body = &body[..end];
        let (digits, radix) = match body.strip_prefix(|c| c == 'x' || c == 'X') {
            Some(hex) => (hex, 16),
            None => (body, 10),
        };
        // `from_str_radix` would accept a leading '+', so validate explicitly.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        let code = u32::from_str_radix(digits, radix).ok()?;
        let decoded = match char::from_u32(code)? {
            '\u{00a0}' => ' ',
            c if c.is_control() && !matches!(c, '\t' | '\n' | '\r') => return None,
            c => c,
        };
        Some((decoded, "&#".len() + end + 1))
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        // Copy the literal run before the ampersand in one go.
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        let named = NAMED.iter().find(|(needle, _)| tail.starts_with(*needle));
        if let Some(&(needle, value)) = named {
            out.push_str(value);
            rest = &tail[needle.len()..];
        } else if let Some((decoded, consumed)) = numeric_reference(tail) {
            out.push(decoded);
            rest = &tail[consumed..];
        } else {
            // Not a reference we know: keep the '&' and continue after it.
            out.push('&');
            rest = &tail[1..];
        }
    }
    out.push_str(rest);
    out
}
341
342/// Collapse blank-line splits inside an inline punctuation-separated link list.
343///
344/// Pages like `dnb.com/business-directory/...` emit each `<a>` for an
345/// industry tag inside its own block-level wrapper, so htmd serialises
346/// `Industry:\u{a0}<a>X</a>, <a>Y</a>, Z` as
347/// `Industry:\u{a0}\n\n[X],\n\n[Y],\n\nZ`. Substring matching against
348/// `"Industry: X, Y, Z"` then fails over the embedded blank lines. The
349/// rendered page (and our plainText output) keeps the items inline; only
350/// the markdown emitter breaks them up. We undo that locally without
351/// changing real paragraph structure: NBSP → space (markdown has no
352/// NBSP semantics), then collapse exactly the blank-lines that sit
353/// between trailing punctuation (`,`, `:`, `)`) and the next inline link
354/// (`[`) or a continuing list-item word.
355fn reflow_inline_lists(s: String) -> String {
356    if !s.contains('\u{00a0}') && !s.contains(",\n\n") && !s.contains(":\n\n") {
357        return s;
358    }
359    let mut t = s.replace('\u{00a0}', " ");
360    // ":<spaces>?\n+<spaces>?[" → ": ["
361    t = INLINE_LINK_AFTER_PUNCT.replace_all(&t, "$p [").into_owned();
362    // "),<spaces>?\n+<spaces>?[" → "), ["
363    t = INLINE_LINK_AFTER_CLOSE.replace_all(&t, "), [").into_owned();
364    // ",<spaces>?\n+<spaces>?<letter>" → ", <letter>" (trailing list item that
365    // isn't itself a link, e.g. "[X], [Y], \n\nMarketing consulting services")
366    t = TRAILING_LIST_ITEM.replace_all(&t, ", $w").into_owned();
367    t
368}
369
/// Matches a `,` or `:` (captured as `p`), optional spaces/tabs, one newline,
/// any further whitespace (including more newlines), then the `[` that opens
/// an inline link. Compiled once on first use.
static INLINE_LINK_AFTER_PUNCT: once_cell::sync::Lazy<regex::Regex> =
    once_cell::sync::Lazy::new(|| {
        regex::Regex::new(r"(?P<p>[,:])[ \t]*\n[\s]*\[").expect("inline-link regex compiles")
    });
/// Same shape as [`INLINE_LINK_AFTER_PUNCT`] but anchored on a literal `),`
/// (a closed link followed by a comma).
///
/// NOTE(review): the tail of every match here, starting at the `,`, also
/// matches `INLINE_LINK_AFTER_PUNCT`; if that pattern always runs first this
/// one may never fire — confirm before relying on it.
static INLINE_LINK_AFTER_CLOSE: once_cell::sync::Lazy<regex::Regex> =
    once_cell::sync::Lazy::new(|| {
        regex::Regex::new(r"\),[ \t]*\n[\s]*\[").expect("inline-link close regex compiles")
    });
/// Matches a `,`, optional spaces/tabs, two or more consecutive newlines, then
/// a single character captured as `w`: an ASCII letter or any code point in
/// U+00C0..=U+FFFF. No whitespace is allowed between the newlines and `w`.
static TRAILING_LIST_ITEM: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
    regex::Regex::new(r",[ \t]*\n\n+(?P<w>[A-Za-z\u{00C0}-\u{FFFF}])")
        .expect("trailing list-item regex compiles")
});
382
383/// High-level extraction: given raw HTML + options, produce ScrapeData.
384pub fn extract(opts: ExtractOptions<'_>) -> CrwResult<ScrapeData> {
385    let ExtractOptions {
386        raw_html,
387        source_url,
388        status_code,
389        rendered_with,
390        elapsed_ms,
391        render_decision,
392        credit_cost,
393        warnings,
394        formats,
395        only_main_content,
396        include_tags,
397        exclude_tags,
398        css_selector,
399        xpath,
400        chunk_strategy,
401        query,
402        filter_mode,
403        top_k,
404        domain_selectors,
405        captured_responses,
406        llm_fallback: _,
407        debug: _,
408        debug_sink: _,
409    } = opts;
410
411    // Per-host fallback selector — used only when the caller didn't pass an
412    // explicit css_selector / xpath. User input always wins over host defaults.
413    // Track whether the *caller* opted into a narrow extraction; downstream
414    // metadata injection (title prepend) keys off this, not the merged value,
415    // so a domain-config default doesn't suppress the title fallback.
416    let user_selected = css_selector.is_some() || xpath.is_some();
417    let domain_selector_owned: Option<String> =
418        if !user_selected && let Some(map) = domain_selectors {
419            lookup_domain_selector(source_url, map)
420        } else {
421            None
422        };
423    let css_selector = css_selector.or(domain_selector_owned.as_deref());
424
425    // Step 1: Extract metadata from raw HTML.
426    let meta = readability::extract_metadata(raw_html);
427
428    // Step 2: Clean HTML (remove boilerplate, nav, ads, etc.).
429    let cleaned = clean::clean_html(raw_html, only_main_content, include_tags, exclude_tags)
430        .unwrap_or_else(|_| raw_html.to_string());
431
432    // Step 3: Apply CSS/XPath selector if provided (narrows to a specific element).
433    let selected_html = apply_selector(&cleaned, css_selector, xpath)?;
434    let after_selection = selected_html.as_deref().unwrap_or(&cleaned);
435
436    // Step 4: If only_main_content, try to narrow further with readability scoring.
437    let (content_html, cleaned_ref) = if only_main_content && selected_html.is_none() {
438        match readability::extract_main_content_with_provenance(after_selection) {
439            readability::ReadabilityOutcome::Selected { html: main, .. } => {
440                // Re-clean: readability may have selected a broad container
441                // (e.g. <article>) that still contains noise elements
442                // (infobox, navbox, catlinks, etc.).
443                let re_cleaned = clean::clean_html(&main, true, &[], &[]).unwrap_or(main);
444                (re_cleaned, Some(cleaned))
445            }
446            readability::ReadabilityOutcome::Rejected { .. } => {
447                // Listing root or empty body — skip readability and let the
448                // alternates ladder pick from cleaned / basic-clean.
449                (cleaned.clone(), Some(cleaned))
450            }
451        }
452    } else {
453        (after_selection.to_string(), None)
454    };
455
456    // Step 5: Produce requested formats. `Summary` also needs markdown
457    // internally — the summary path feeds the markdown into the LLM and then
458    // strips it from the response unless the caller also asked for markdown.
459    let md = if formats.contains(&OutputFormat::Markdown)
460        || formats.contains(&OutputFormat::Json)
461        || formats.contains(&OutputFormat::Summary)
462    {
463        let primary_md = markdown::html_to_markdown(&content_html);
464        let primary_quality = quality::analyze_md_only(&primary_md);
465
466        // Skip alternates when a selector was explicitly used (short output is
467        // intentional) or when the primary extraction is healthy.
468        // Threshold 0.4 (not 0.6) — readability output that scores 0.4+ is
469        // good enough; running alternates on it tends to swap in basic_clean
470        // (whole-body) which boosts word count but reintroduces nav noise.
471        if selected_html.is_some() || primary_quality.score > 0.4 {
472            Some(primary_md)
473        } else {
474            let mut candidates: Vec<(&'static str, String, quality::Quality)> = Vec::new();
475
476            // Alt 1: cleaned HTML (only_main_content path bypasses readability).
477            if only_main_content && let Some(c) = cleaned_ref.as_ref() {
478                let m = markdown::html_to_markdown(c);
479                let q = quality::analyze_md_only(&m);
480                candidates.push(("cleaned", m, q));
481            }
482
483            // Alt 2: basic clean without only_main_content (no readability narrowing).
484            let basic_cleaned = clean::clean_html(raw_html, false, include_tags, exclude_tags)
485                .unwrap_or_else(|_| raw_html.to_string());
486            let basic_md = markdown::html_to_markdown(&basic_cleaned);
487            let basic_q = quality::analyze_md_only(&basic_md);
488            candidates.push(("basic_clean", basic_md, basic_q));
489
490            // Alt 3: structural table/list extraction from raw HTML.
491            if let Some(structural) = extract_tables_and_lists(raw_html) {
492                let q = quality::analyze_md_only(&structural);
493                candidates.push(("structural", structural, q));
494            }
495
496            // Alt 4: XHR/fetch JSON capture — recursively walk every captured
497            // JSON body and gather long text fields. Useful when the article
498            // body lives in an API response loaded after `loadEventFired`
499            // (newsroom feeds, infinite-scroll, paywall-shielded prose).
500            if let Some(xhr_md) = extract_xhr_text(captured_responses) {
501                let q = quality::analyze_md_only(&xhr_md);
502                candidates.push(("xhr_json", xhr_md, q));
503            }
504
505            // Alt 5: plaintext fallback.
506            let plain_md = {
507                let text = plaintext::html_to_plaintext(&content_html);
508                if text.trim().is_empty() {
509                    plaintext::html_to_plaintext(&basic_cleaned)
510                } else {
511                    text
512                }
513            };
514            let plain_q = quality::analyze_md_only(&plain_md);
515            candidates.push(("plaintext", plain_md, plain_q));
516
517            // Include the primary at the head of the candidate list.
518            candidates.insert(0, ("primary", primary_md, primary_quality));
519
520            // Primary-biased pick: keep primary unless an alternate beats it by
521            // a clear margin (0.15). Without this margin, basic_clean tends to
522            // win simply by including more nav/footer words, which boosts its
523            // word count but reintroduces noise the readability primary had
524            // correctly excluded.
525            const PRIMARY_MARGIN: f32 = 0.15;
526            let primary_score = candidates[0].2.score;
527            let chosen_idx = candidates
528                .iter()
529                .enumerate()
530                .skip(1)
531                .filter(|(_, c)| c.2.score >= primary_score + PRIMARY_MARGIN)
532                .max_by(|(_, a), (_, b)| {
533                    a.2.score
534                        .partial_cmp(&b.2.score)
535                        .unwrap_or(std::cmp::Ordering::Equal)
536                        .then(a.2.bytes.cmp(&b.2.bytes))
537                })
538                .map(|(i, _)| i)
539                .unwrap_or(0);
540
541            let names: Vec<&'static str> = candidates.iter().map(|c| c.0).collect();
542            let scores: Vec<f32> = candidates.iter().map(|c| c.2.score).collect();
543            let chosen_name = candidates[chosen_idx].0;
544            tracing::debug!(
545                strategies = ?names,
546                scores = ?scores,
547                chosen = %chosen_name,
548                "quality-selected markdown extraction"
549            );
550
551            Some(candidates.swap_remove(chosen_idx).1)
552        }
553    } else {
554        None
555    };
556
557    // News/blog templates frequently render the article H1 inside a `<header>`
558    // sibling of the scored container, so readability drops it. Prepend the
559    // metadata title (preferring the cleaner og:title) when it isn't already
560    // present in the markdown — otherwise downstream recall scoring loses the
561    // most important phrase on the page (the title itself).
562    let md = md.map(|m| {
563        if user_selected {
564            return m;
565        }
566        let title = meta
567            .og_title
568            .as_deref()
569            .or(meta.title.as_deref())
570            .map(str::trim)
571            .filter(|t| !t.is_empty());
572        let Some(title) = title else { return m };
573        // <title> commonly carries " | Site Name", " – Site Name", " — Site Name",
574        // or " - Site Name" suffix; og:title is usually clean, but strip
575        // defensively in either case. Pipe is rare inside real titles so we
576        // split on the first occurrence (no whitespace required). En/em dash
577        // and ASCII hyphen REQUIRE surrounding whitespace — a bare en dash
578        // appears inside titles like "Northern Song Dynasty (960–1127)" and
579        // must not split there; bare ASCII hyphens are common in compound
580        // words. Dash splits are right-anchored so multi-segment titles like
581        // "Foo – Bar – Site Name" reduce to "Foo – Bar" rather than "Foo".
582        let core = title
583            .split('|')
584            .next()
585            .map(str::trim)
586            .filter(|s| !s.is_empty())
587            .unwrap_or(title);
588        let core = core
589            .rsplit_once(" – ")
590            .map(|(l, _)| l.trim())
591            .filter(|s| !s.is_empty())
592            .unwrap_or(core);
593        let core = core
594            .rsplit_once(" — ")
595            .map(|(l, _)| l.trim())
596            .filter(|s| !s.is_empty())
597            .unwrap_or(core);
598        let core = core
599            .rsplit_once(" - ")
600            .map(|(l, _)| l.trim())
601            .unwrap_or(core);
602        if m.contains(core) || m.contains(title) {
603            return m;
604        }
605        format!("# {core}\n\n{m}")
606    });
607
608    // When the extracted markdown is unusually short, append the page's
609    // meta description / og:description — these are author-curated summaries
610    // that frequently contain the article's key phrases, especially on:
611    //   - Forum threads where readability picked one comment instead of the
612    //     question (Discourse, vBulletin: meta description = first post body)
613    //   - Listing pages whose readability rejection drops to a thin
614    //     post-fallback ladder
615    //   - Login-walled / app-shell pages where the SSR'd description is the
616    //     only signal of what the page is about
617    // Skip when the caller used a selector (intentional narrowness), the md
618    // is already substantial, the description is short or already present,
619    // or it duplicates the page title.
620    let md = md.map(|m| {
621        if user_selected {
622            return m;
623        }
624        if m.len() >= 1500 {
625            return m;
626        }
627        // Prefer whichever of `<meta name="description">` and
628        // `<meta property="og:description">` is longer — Discourse and other
629        // forum templates set the two to *different* posts (name=description
630        // → original question, og:description → currently-displayed reply),
631        // so picking the longer surfaces more unique content. When the two
632        // diverge significantly (e.g. forum threads), append both so the
633        // markdown captures the question *and* the highlighted reply.
634        let name_desc = meta
635            .description
636            .as_deref()
637            .map(str::trim)
638            .filter(|d| !d.is_empty());
639        let og_desc = meta
640            .og_description
641            .as_deref()
642            .map(str::trim)
643            .filter(|d| !d.is_empty());
644        let combined = match (name_desc, og_desc) {
645            (Some(a), Some(b)) if a == b => decode_basic_html_entities(a),
646            (Some(a), Some(b)) => {
647                let (longer, shorter) = if a.len() >= b.len() { (a, b) } else { (b, a) };
648                let l = decode_basic_html_entities(longer);
649                let s = decode_basic_html_entities(shorter);
650                let probe_len = s.chars().take(60).map(char::len_utf8).sum::<usize>();
651                let probe = &s[..probe_len.min(s.len())];
652                if l.contains(probe) {
653                    l
654                } else {
655                    format!("{l}\n\n{s}")
656                }
657            }
658            (Some(a), None) | (None, Some(a)) => decode_basic_html_entities(a),
659            (None, None) => return m,
660        };
661        let trimmed = combined.trim();
662        // Defend against tagline-only descriptions (~30-50 chars) which add
663        // no signal but pollute the leading content. Real article summaries
664        // are nearly always >80 chars.
665        if trimmed.chars().count() < 80 {
666            return m;
667        }
668        let title_lc = meta
669            .og_title
670            .as_deref()
671            .or(meta.title.as_deref())
672            .map(|t| t.trim().to_lowercase())
673            .unwrap_or_default();
674        if !title_lc.is_empty() && trimmed.to_lowercase() == title_lc {
675            return m;
676        }
677        // Cheap containment check — if the first ~120 chars of the
678        // description already appear in the markdown, the body covers it.
679        let probe_len = trimmed.chars().take(120).map(char::len_utf8).sum::<usize>();
680        let probe = &trimmed[..probe_len.min(trimmed.len())];
681        if m.contains(probe) {
682            return m;
683        }
684        format!("{m}\n\n{trimmed}\n")
685    });
686
687    // Inline-list reflow: htmd emits each `<a>` inside a `<div>`/`<p>` wrapper as
688    // its own paragraph, so a comma-separated label-and-link list (common on
689    // company directories like dnb.com — `Industry: <a>X</a>, <a>Y</a>, Z`)
690    // becomes `Industry:\u{00a0}\n\n[X], \n\n[Y], \n\nZ`. The runtime
691    // `<a>` markup keeps the items inline; the surrounding blank lines are
692    // a markdown-emit artefact that breaks substring matching across them.
693    // Two passes: 1) NBSP → space (lossless: markdown has no NBSP semantics),
694    // 2) collapse a blank line that sits between `, : )` punctuation and the
695    // next inline link or comma-continuation paragraph.
696    let md = md.map(reflow_inline_lists);
697
698    let plain = if formats.contains(&OutputFormat::PlainText) {
699        Some(plaintext::html_to_plaintext(&content_html))
700    } else {
701        None
702    };
703
704    let raw = if formats.contains(&OutputFormat::RawHtml) {
705        Some(raw_html.to_string())
706    } else {
707        None
708    };
709
710    let html = if formats.contains(&OutputFormat::Html) {
711        Some(content_html)
712    } else {
713        None
714    };
715
716    let links = if formats.contains(&OutputFormat::Links) {
717        Some(readability::extract_links(raw_html, source_url))
718    } else {
719        None
720    };
721
722    // JSON extraction is handled asynchronously in scrape_url after extract() returns.
723    let json = None;
724
725    // Warn if filtering params are provided without a chunking strategy.
726    let orphan_chunk_warning =
727        if chunk_strategy.is_none() && (query.is_some() || filter_mode.is_some()) {
728            Some(
729                "'query' and 'filterMode' require 'chunkStrategy' to be set. \
730             These parameters were ignored."
731                    .to_string(),
732            )
733        } else {
734            None
735        };
736
737    // Step 6: Chunk the markdown if a strategy is provided.
738    let chunks = if let Some(strategy) = chunk_strategy
739        && let Some(ref markdown_text) = md
740        && !markdown_text.trim().is_empty()
741    {
742        let raw_chunks = chunking::chunk_text(markdown_text, strategy);
743
744        // Step 7: Filter chunks by relevance if query + filter_mode are set.
745        let chunk_results = if let (Some(q), Some(mode)) = (query, filter_mode)
746            && !q.trim().is_empty()
747            && !raw_chunks.is_empty()
748        {
749            filter::filter_chunks_scored(&raw_chunks, q, mode, top_k.unwrap_or(5))
750                .into_iter()
751                .map(|sc| ChunkResult {
752                    content: sc.content,
753                    score: Some(sc.score),
754                    index: sc.index,
755                })
756                .collect::<Vec<_>>()
757        } else {
758            let mut results: Vec<_> = raw_chunks
759                .into_iter()
760                .enumerate()
761                .map(|(i, c)| ChunkResult {
762                    content: c,
763                    score: None,
764                    index: i,
765                })
766                .collect();
767            if let Some(k) = top_k {
768                results.truncate(k);
769            }
770            results
771        };
772
773        if chunk_results.is_empty() {
774            None
775        } else {
776            Some(chunk_results)
777        }
778    } else {
779        None
780    };
781
782    Ok(ScrapeData {
783        markdown: md,
784        html,
785        raw_html: raw,
786        plain_text: plain,
787        links,
788        json,
789        summary: None,
790        llm_usage: None,
791        chunks,
792        warning: orphan_chunk_warning,
793        warnings,
794        render_decision,
795        credit_cost,
796        metadata: PageMetadata {
797            title: meta.title,
798            description: meta.description,
799            og_title: meta.og_title,
800            og_description: meta.og_description,
801            og_image: meta.og_image,
802            canonical_url: meta.canonical_url,
803            source_url: source_url.to_string(),
804            language: meta.language,
805            status_code,
806            rendered_with,
807            elapsed_ms,
808            page_count: None,
809            source_filename: None,
810        },
811        debug_extraction: None,
812        // Populated post-extract by the caller (single.rs / crawl.rs) from
813        // FetchResult.content_type; change_tracking is computed there too.
814        content_type: None,
815        change_tracking: None,
816    })
817}
818
819/// Apply CSS selector or XPath to narrow HTML content.
820/// Returns None if no selector is set or no match is found.
821fn apply_selector(html: &str, css: Option<&str>, xpath: Option<&str>) -> CrwResult<Option<String>> {
822    if let Some(sel) = css {
823        let result = selector::extract_by_css(html, sel).map_err(CrwError::ExtractionError)?;
824        if result.is_some() {
825            return Ok(result);
826        }
827    }
828    if let Some(xp) = xpath
829        && let Some(texts) =
830            selector::extract_by_xpath(html, xp).map_err(CrwError::ExtractionError)?
831    {
832        let wrapped = texts
833            .into_iter()
834            .map(|text| {
835                let escaped = text
836                    .replace('&', "&amp;")
837                    .replace('<', "&lt;")
838                    .replace('>', "&gt;");
839                format!("<div>{escaped}</div>")
840            })
841            .collect::<Vec<_>>()
842            .join("\n");
843        return Ok(Some(wrapped));
844    }
845    Ok(None)
846}
847
// NOTE: this paragraph describes `extract_tables_and_lists` (defined further
// down in this file), not the function directly below. It is kept as a plain
// comment so rustdoc does not attach it to `extract_xhr_text`.
//
// Walk the raw HTML for substantial `<table>` (≥2 data rows) and
// `<ul>/<ol>` (≥5 items) elements, render each to markdown, and return
// the concatenation. Returns `None` if no qualifying structure is found.
//
// This exists as a last-ditch fallback: readability and the htmd-on-cleaned
// path treat tabular and list-only pages (county finance reports, job
// listings, niche product catalogs) as navigation noise. By pulling those
// structures out of the raw DOM we surface real content that would
// otherwise be reported as thin.
857/// Walk the captured XHR/fetch JSON responses and harvest long text fields.
858/// Each response is parsed as JSON; every string value with at least
859/// `MIN_FIELD_LEN` characters is appended (deduplicated). Returned as a
860/// markdown-ish body (paragraph-separated). `None` if total content is
861/// too small to be useful.
862fn extract_xhr_text(captured: &[CapturedNetworkResponse]) -> Option<String> {
863    const MIN_FIELD_LEN: usize = 120;
864    const MIN_TOTAL_LEN: usize = 400;
865
866    if captured.is_empty() {
867        return None;
868    }
869    let mut paragraphs: Vec<String> = Vec::new();
870    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
871
872    for resp in captured {
873        let body = match resp.body.as_deref() {
874            Some(b) if !b.is_empty() => b,
875            _ => continue,
876        };
877        let value: serde_json::Value = match serde_json::from_str(body) {
878            Ok(v) => v,
879            Err(_) => continue,
880        };
881        walk_json_strings(&value, &mut |s| {
882            if s.len() >= MIN_FIELD_LEN && seen.insert(s.to_string()) {
883                paragraphs.push(s.to_string());
884            }
885        });
886    }
887
888    if paragraphs.is_empty() {
889        return None;
890    }
891    let joined = paragraphs.join("\n\n");
892    if joined.len() < MIN_TOTAL_LEN {
893        return None;
894    }
895    Some(joined)
896}
897
898fn walk_json_strings(value: &serde_json::Value, on_string: &mut dyn FnMut(&str)) {
899    match value {
900        serde_json::Value::String(s) => {
901            // Skip URLs, IDs, dates, and HTML tag fragments — keep prose only.
902            let trimmed = s.trim();
903            if trimmed.starts_with("http://")
904                || trimmed.starts_with("https://")
905                || trimmed.starts_with('/')
906                || trimmed.starts_with('<')
907            {
908                return;
909            }
910            on_string(trimmed);
911        }
912        serde_json::Value::Array(arr) => {
913            for v in arr {
914                walk_json_strings(v, on_string);
915            }
916        }
917        serde_json::Value::Object(map) => {
918            for (_, v) in map {
919                walk_json_strings(v, on_string);
920            }
921        }
922        _ => {}
923    }
924}
925
926fn extract_tables_and_lists(html: &str) -> Option<String> {
927    use scraper::{Html, Selector};
928
929    let doc = Html::parse_document(html);
930    let table_sel = Selector::parse("table").ok()?;
931    let list_sel = Selector::parse("ul, ol").ok()?;
932    let row_sel = Selector::parse("tr").ok()?;
933    let item_sel = Selector::parse("li").ok()?;
934
935    let mut chunks: Vec<String> = Vec::new();
936
937    for table in doc.select(&table_sel) {
938        if table.select(&row_sel).count() < 2 {
939            continue;
940        }
941        let html_chunk = table.html();
942        let md = markdown::html_to_markdown(&html_chunk);
943        if md.trim().len() >= 40 {
944            chunks.push(md);
945        }
946    }
947
948    for list in doc.select(&list_sel) {
949        if list.select(&item_sel).count() < 5 {
950            continue;
951        }
952        // Skip nav/footer lists — those are usually identifiable by ancestor
953        // tag and would otherwise drown out real content.
954        let in_nav = list
955            .ancestors()
956            .filter_map(scraper::ElementRef::wrap)
957            .any(|el| {
958                let n = el.value().name();
959                n == "nav" || n == "footer" || n == "header"
960            });
961        if in_nav {
962            continue;
963        }
964        let html_chunk = list.html();
965        let md = markdown::html_to_markdown(&html_chunk);
966        if md.trim().len() >= 40 {
967            chunks.push(md);
968        }
969    }
970
971    if chunks.is_empty() {
972        return None;
973    }
974    Some(chunks.join("\n\n"))
975}
976
#[cfg(test)]
mod table_list_fallback_tests {
    use super::*;

    // A table with a header row and two body rows clears the row threshold,
    // and the cell text must survive into the output.
    #[test]
    fn extracts_two_row_table() {
        let page = "<html><body><nav>x</nav><table>\
            <tr><th>Name</th><th>Value</th></tr>\
            <tr><td>Alpha</td><td>1</td></tr>\
            <tr><td>Bravo</td><td>2</td></tr>\
            </table></body></html>";
        let rendered = extract_tables_and_lists(page).expect("table should extract");
        for cell in ["Alpha", "Bravo"] {
            assert!(rendered.contains(cell));
        }
    }

    // A single-row table is below the row threshold and yields nothing.
    #[test]
    fn skips_short_table() {
        let page = "<table><tr><td>only</td></tr></table>";
        assert_eq!(extract_tables_and_lists(page), None);
    }

    // A list with enough items is still ignored when it lives under <nav>.
    #[test]
    fn skips_nav_list() {
        let page = "<nav><ul>\
            <li>a</li><li>b</li><li>c</li><li>d</li><li>e</li><li>f</li>\
            </ul></nav>";
        assert_eq!(extract_tables_and_lists(page), None);
    }

    // A six-item list outside of page chrome is extracted in full.
    #[test]
    fn extracts_long_list() {
        let page = "<main><ul>\
            <li>Job A</li><li>Job B</li><li>Job C</li>\
            <li>Job D</li><li>Job E</li><li>Job F</li>\
            </ul></main>";
        let rendered = extract_tables_and_lists(page).expect("list should extract");
        for item in ["Job A", "Job F"] {
            assert!(rendered.contains(item));
        }
    }
}
crw_extract/lib.rs

crw_extract/
lib.rs