1pub mod antibot;
15pub mod chunking;
16pub mod clean;
17pub mod dom_features;
18pub mod dom_util;
19pub mod filter;
20pub mod judge;
21pub mod markdown;
22pub mod pdf;
23pub mod plaintext;
24pub mod quality;
25pub mod readability;
26pub mod selector;
27pub mod structured;
28pub mod tables;
29
30use crw_core::error::{CrwError, CrwResult};
31use crw_core::types::{
32 CapturedNetworkResponse, ChunkResult, ChunkStrategy, DebugAttempt, DebugCandidate,
33 DebugExtraction, FilterMode, OutputFormat, PageMetadata, RenderDecision, ScrapeData,
34};
35use std::collections::HashMap;
36use std::sync::{Arc, Mutex};
37
38#[derive(Debug, Default)]
45pub struct DebugCollector {
46 attempts: Vec<DebugAttempt>,
47}
48
49impl DebugCollector {
50 pub fn new() -> Self {
51 Self::default()
52 }
53
54 pub fn push_attempt(&mut self, attempt: DebugAttempt) {
55 self.attempts.push(attempt);
56 }
57
58 pub fn into_extraction(self) -> DebugExtraction {
59 DebugExtraction {
60 attempts: self.attempts,
61 }
62 }
63}
64
65pub fn debug_candidate(
68 kind: impl Into<String>,
69 text: Option<String>,
70 score: f64,
71 cap_chars: Option<usize>,
72) -> DebugCandidate {
73 let text_excerpt = text.as_ref().map(|s| {
74 let mut idx = 200.min(s.len());
75 while idx > 0 && !s.is_char_boundary(idx) {
76 idx -= 1;
77 }
78 s[..idx].to_string()
79 });
80 DebugCandidate {
81 kind: kind.into(),
82 text,
83 text_excerpt,
84 cap_chars,
85 score,
86 }
87}
88
89pub mod answer;
90pub mod llm;
91pub mod pricing;
92pub mod summary;
93
94pub struct ExtractOptions<'a> {
96 pub raw_html: &'a str,
97 pub source_url: &'a str,
98 pub status_code: u16,
99 pub rendered_with: Option<String>,
100 pub elapsed_ms: u64,
101 pub render_decision: Option<RenderDecision>,
103 pub credit_cost: u32,
105 pub warnings: Vec<String>,
107 pub formats: &'a [OutputFormat],
108 pub only_main_content: bool,
109 pub include_tags: &'a [String],
110 pub exclude_tags: &'a [String],
111 pub css_selector: Option<&'a str>,
113 pub xpath: Option<&'a str>,
115 pub chunk_strategy: Option<&'a ChunkStrategy>,
117 pub query: Option<&'a str>,
119 pub filter_mode: Option<&'a FilterMode>,
121 pub top_k: Option<usize>,
123 pub domain_selectors: Option<&'a HashMap<String, String>>,
127 pub captured_responses: &'a [CapturedNetworkResponse],
130 pub llm_fallback: Option<LlmFallbackParams<'a>>,
135 pub debug: bool,
138 pub debug_sink: Option<Arc<Mutex<DebugCollector>>>,
141}
142
/// Settings for the optional LLM extraction pass run by
/// [`maybe_run_llm_fallback`].
#[derive(Debug, Clone)]
pub struct LlmFallbackParams<'a> {
    /// Forwarded to the LLM extraction call.
    pub api_key: &'a str,
    /// Forwarded to the LLM extraction call.
    pub model: &'a str,
    /// Forwarded to the LLM extraction call.
    pub provider: &'a str,
    /// Forwarded to the LLM extraction call.
    pub base_url: Option<&'a str>,
    /// The LLM is skipped when the existing markdown already scores at or
    /// above this value (unless `always_run` is set).
    pub quality_threshold: f32,
    /// Forwarded to the LLM extraction call.
    pub max_html_bytes: usize,
    /// Forwarded to the LLM extraction call.
    pub max_tokens: u32,
    /// Forwarded to the LLM extraction call.
    pub azure_api_version: Option<&'a str>,
    /// Call the LLM even when the existing markdown meets the threshold.
    pub always_run: bool,
}
160
161pub async fn maybe_run_llm_fallback(
167 data: &mut ScrapeData,
168 raw_html: &str,
169 params: &LlmFallbackParams<'_>,
170) -> CrwResult<()> {
171 let current_md = match data.markdown.as_deref() {
172 Some(m) if !m.trim().is_empty() => m,
173 _ => "",
174 };
175 let current_quality = quality::analyze_md_only(current_md);
176 if !params.always_run && current_quality.score >= params.quality_threshold {
177 return Ok(());
178 }
179 match llm::extract_via_llm(
180 raw_html,
181 params.api_key,
182 params.provider,
183 params.model,
184 params.base_url,
185 params.max_tokens,
186 params.max_html_bytes,
187 params.azure_api_version,
188 )
189 .await
190 {
191 Ok(llm_md) => {
192 let llm_quality = quality::analyze_md_only(&llm_md);
193 if llm_quality.score > current_quality.score {
194 tracing::info!(
195 prior_score = current_quality.score,
196 llm_score = llm_quality.score,
197 "LLM fallback produced higher-quality markdown"
198 );
199 data.markdown = Some(llm_md);
200 data.warnings.push("extracted_via=llm".to_string());
201 } else {
202 tracing::debug!(
203 prior_score = current_quality.score,
204 llm_score = llm_quality.score,
205 "LLM fallback produced lower-quality markdown; keeping original"
206 );
207 }
208 }
209 Err(e) => {
210 tracing::warn!(error = %e, "LLM fallback call failed; keeping DOM extraction");
211 }
212 }
213 Ok(())
214}
215
216fn lookup_domain_selector(source_url: &str, map: &HashMap<String, String>) -> Option<String> {
218 if map.is_empty() {
219 return None;
220 }
221 let host = url::Url::parse(source_url)
222 .ok()
223 .and_then(|u| u.host_str().map(|s| s.to_string()))?;
224 map.get(&host).cloned()
225}
226
#[cfg(test)]
mod private_tests {
    use super::*;
    use crw_core::types::CapturedNetworkResponse;

    /// One-entry host → selector map shared by the lookup tests.
    fn news_selectors() -> HashMap<String, String> {
        HashMap::from([("news.example.com".to_string(), ".article".to_string())])
    }

    #[test]
    fn domain_selector_matches_exact_host() {
        let selectors = news_selectors();
        let found = lookup_domain_selector("https://news.example.com/p/42", &selectors);
        assert_eq!(found, Some(".article".to_string()));
    }

    #[test]
    fn domain_selector_misses_on_other_host() {
        let selectors = news_selectors();
        let found = lookup_domain_selector("https://other.example.com/p/42", &selectors);
        assert_eq!(found, None);
    }

    #[test]
    fn domain_selector_empty_map_returns_none() {
        let selectors: HashMap<String, String> = HashMap::new();
        let found = lookup_domain_selector("https://x.example.com/", &selectors);
        assert_eq!(found, None);
    }

    #[test]
    fn xhr_extract_returns_none_for_empty_input() {
        let nothing: [CapturedNetworkResponse; 0] = [];
        assert!(extract_xhr_text(&nothing).is_none());
    }

    #[test]
    fn xhr_extract_collects_long_string_fields() {
        let long_a = "a".repeat(300);
        let long_b = "b".repeat(200);
        let long_c = "c".repeat(150);
        let payload = serde_json::json!({
            "title": "short",
            "body": long_a.as_str(),
            "meta": { "summary": long_b.as_str() },
            "tags": [long_c.as_str(), "short"],
            "url": "https://example.com/should/skip",
        })
        .to_string();
        let responses = vec![CapturedNetworkResponse {
            url: "https://api.example.com/article/1".to_string(),
            request_id: "1".to_string(),
            status: 200,
            mime_type: Some("application/json".to_string()),
            body: Some(payload),
            body_size_bytes: 800,
        }];

        let text = extract_xhr_text(&responses).expect("expected long-text fields");

        // Every long field is kept...
        for kept in [&long_a, &long_b, &long_c] {
            assert!(text.contains(kept.as_str()));
        }
        // ...while short strings and URL-like values are dropped.
        assert!(!text.contains("short"));
        assert!(!text.contains("example.com/should/skip"));
    }

    #[test]
    fn xhr_extract_skips_invalid_json() {
        let responses = vec![CapturedNetworkResponse {
            url: "x".into(),
            request_id: "1".into(),
            status: 200,
            mime_type: Some("application/json".into()),
            body: Some("not json".into()),
            body_size_bytes: 8,
        }];
        assert!(extract_xhr_text(&responses).is_none());
    }
}
298
/// Decodes a small, fixed set of HTML character references in `s`.
///
/// Handles `&amp;`, `&lt;`, `&gt;`, `&quot;`, the apostrophe forms `&#39;`,
/// `&#x27;` and `&apos;`, `&nbsp;` (decoded to a plain space), and the
/// typographic entities `&hellip;`, `&mdash;`, `&ndash;`, `&rsquo;`,
/// `&lsquo;`, `&rdquo;`, `&ldquo;`. Entity names are matched case-sensitively.
///
/// Anything else — unknown references or a bare `&` — is copied through
/// unchanged. Decoding is single-pass: the output is never rescanned, so
/// `&amp;lt;` becomes the literal text `&lt;`, not `<`.
fn decode_basic_html_entities(s: &str) -> String {
    // (reference, replacement). The reference must be spelled out in full:
    // a bare "&" here would match every ampersand and disable the table.
    const ENTITIES: [(&str, &str); 15] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
        ("&hellip;", "\u{2026}"),
        ("&mdash;", "\u{2014}"),
        ("&ndash;", "\u{2013}"),
        ("&rsquo;", "\u{2019}"),
        ("&lsquo;", "\u{2018}"),
        ("&rdquo;", "\u{201D}"),
        ("&ldquo;", "\u{201C}"),
    ];

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        // Copy the run before the ampersand verbatim.
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match ENTITIES.iter().find(|(name, _)| tail.starts_with(name)) {
            Some((name, value)) => {
                out.push_str(value);
                rest = &tail[name.len()..];
            }
            None => {
                // Not a reference we know: keep the '&' and move past it.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
341
342fn reflow_inline_lists(s: String) -> String {
356 if !s.contains('\u{00a0}') && !s.contains(",\n\n") && !s.contains(":\n\n") {
357 return s;
358 }
359 let mut t = s.replace('\u{00a0}', " ");
360 t = INLINE_LINK_AFTER_PUNCT.replace_all(&t, "$p [").into_owned();
362 t = INLINE_LINK_AFTER_CLOSE.replace_all(&t, "), [").into_owned();
364 t = TRAILING_LIST_ITEM.replace_all(&t, ", $w").into_owned();
367 t
368}
369
370static INLINE_LINK_AFTER_PUNCT: once_cell::sync::Lazy<regex::Regex> =
371 once_cell::sync::Lazy::new(|| {
372 regex::Regex::new(r"(?P<p>[,:])[ \t]*\n[\s]*\[").expect("inline-link regex compiles")
373 });
374static INLINE_LINK_AFTER_CLOSE: once_cell::sync::Lazy<regex::Regex> =
375 once_cell::sync::Lazy::new(|| {
376 regex::Regex::new(r"\),[ \t]*\n[\s]*\[").expect("inline-link close regex compiles")
377 });
378static TRAILING_LIST_ITEM: once_cell::sync::Lazy<regex::Regex> = once_cell::sync::Lazy::new(|| {
379 regex::Regex::new(r",[ \t]*\n\n+(?P<w>[A-Za-z\u{00C0}-\u{FFFF}])")
380 .expect("trailing list-item regex compiles")
381});
382
383pub fn extract(opts: ExtractOptions<'_>) -> CrwResult<ScrapeData> {
385 let ExtractOptions {
386 raw_html,
387 source_url,
388 status_code,
389 rendered_with,
390 elapsed_ms,
391 render_decision,
392 credit_cost,
393 warnings,
394 formats,
395 only_main_content,
396 include_tags,
397 exclude_tags,
398 css_selector,
399 xpath,
400 chunk_strategy,
401 query,
402 filter_mode,
403 top_k,
404 domain_selectors,
405 captured_responses,
406 llm_fallback: _,
407 debug: _,
408 debug_sink: _,
409 } = opts;
410
411 let user_selected = css_selector.is_some() || xpath.is_some();
417 let domain_selector_owned: Option<String> =
418 if !user_selected && let Some(map) = domain_selectors {
419 lookup_domain_selector(source_url, map)
420 } else {
421 None
422 };
423 let css_selector = css_selector.or(domain_selector_owned.as_deref());
424
425 let meta = readability::extract_metadata(raw_html);
427
428 let cleaned = clean::clean_html(raw_html, only_main_content, include_tags, exclude_tags)
430 .unwrap_or_else(|_| raw_html.to_string());
431
432 let selected_html = apply_selector(&cleaned, css_selector, xpath)?;
434 let after_selection = selected_html.as_deref().unwrap_or(&cleaned);
435
436 let (content_html, cleaned_ref) = if only_main_content && selected_html.is_none() {
438 match readability::extract_main_content_with_provenance(after_selection) {
439 readability::ReadabilityOutcome::Selected { html: main, .. } => {
440 let re_cleaned = clean::clean_html(&main, true, &[], &[]).unwrap_or(main);
444 (re_cleaned, Some(cleaned))
445 }
446 readability::ReadabilityOutcome::Rejected { .. } => {
447 (cleaned.clone(), Some(cleaned))
450 }
451 }
452 } else {
453 (after_selection.to_string(), None)
454 };
455
456 let md = if formats.contains(&OutputFormat::Markdown)
460 || formats.contains(&OutputFormat::Json)
461 || formats.contains(&OutputFormat::Summary)
462 {
463 let primary_md = markdown::html_to_markdown(&content_html);
464 let primary_quality = quality::analyze_md_only(&primary_md);
465
466 if selected_html.is_some() || primary_quality.score > 0.4 {
472 Some(primary_md)
473 } else {
474 let mut candidates: Vec<(&'static str, String, quality::Quality)> = Vec::new();
475
476 if only_main_content && let Some(c) = cleaned_ref.as_ref() {
478 let m = markdown::html_to_markdown(c);
479 let q = quality::analyze_md_only(&m);
480 candidates.push(("cleaned", m, q));
481 }
482
483 let basic_cleaned = clean::clean_html(raw_html, false, include_tags, exclude_tags)
485 .unwrap_or_else(|_| raw_html.to_string());
486 let basic_md = markdown::html_to_markdown(&basic_cleaned);
487 let basic_q = quality::analyze_md_only(&basic_md);
488 candidates.push(("basic_clean", basic_md, basic_q));
489
490 if let Some(structural) = extract_tables_and_lists(raw_html) {
492 let q = quality::analyze_md_only(&structural);
493 candidates.push(("structural", structural, q));
494 }
495
496 if let Some(xhr_md) = extract_xhr_text(captured_responses) {
501 let q = quality::analyze_md_only(&xhr_md);
502 candidates.push(("xhr_json", xhr_md, q));
503 }
504
505 let plain_md = {
507 let text = plaintext::html_to_plaintext(&content_html);
508 if text.trim().is_empty() {
509 plaintext::html_to_plaintext(&basic_cleaned)
510 } else {
511 text
512 }
513 };
514 let plain_q = quality::analyze_md_only(&plain_md);
515 candidates.push(("plaintext", plain_md, plain_q));
516
517 candidates.insert(0, ("primary", primary_md, primary_quality));
519
520 const PRIMARY_MARGIN: f32 = 0.15;
526 let primary_score = candidates[0].2.score;
527 let chosen_idx = candidates
528 .iter()
529 .enumerate()
530 .skip(1)
531 .filter(|(_, c)| c.2.score >= primary_score + PRIMARY_MARGIN)
532 .max_by(|(_, a), (_, b)| {
533 a.2.score
534 .partial_cmp(&b.2.score)
535 .unwrap_or(std::cmp::Ordering::Equal)
536 .then(a.2.bytes.cmp(&b.2.bytes))
537 })
538 .map(|(i, _)| i)
539 .unwrap_or(0);
540
541 let names: Vec<&'static str> = candidates.iter().map(|c| c.0).collect();
542 let scores: Vec<f32> = candidates.iter().map(|c| c.2.score).collect();
543 let chosen_name = candidates[chosen_idx].0;
544 tracing::debug!(
545 strategies = ?names,
546 scores = ?scores,
547 chosen = %chosen_name,
548 "quality-selected markdown extraction"
549 );
550
551 Some(candidates.swap_remove(chosen_idx).1)
552 }
553 } else {
554 None
555 };
556
557 let md = md.map(|m| {
563 if user_selected {
564 return m;
565 }
566 let title = meta
567 .og_title
568 .as_deref()
569 .or(meta.title.as_deref())
570 .map(str::trim)
571 .filter(|t| !t.is_empty());
572 let Some(title) = title else { return m };
573 let core = title
583 .split('|')
584 .next()
585 .map(str::trim)
586 .filter(|s| !s.is_empty())
587 .unwrap_or(title);
588 let core = core
589 .rsplit_once(" – ")
590 .map(|(l, _)| l.trim())
591 .filter(|s| !s.is_empty())
592 .unwrap_or(core);
593 let core = core
594 .rsplit_once(" — ")
595 .map(|(l, _)| l.trim())
596 .filter(|s| !s.is_empty())
597 .unwrap_or(core);
598 let core = core
599 .rsplit_once(" - ")
600 .map(|(l, _)| l.trim())
601 .unwrap_or(core);
602 if m.contains(core) || m.contains(title) {
603 return m;
604 }
605 format!("# {core}\n\n{m}")
606 });
607
608 let md = md.map(|m| {
621 if user_selected {
622 return m;
623 }
624 if m.len() >= 1500 {
625 return m;
626 }
627 let name_desc = meta
635 .description
636 .as_deref()
637 .map(str::trim)
638 .filter(|d| !d.is_empty());
639 let og_desc = meta
640 .og_description
641 .as_deref()
642 .map(str::trim)
643 .filter(|d| !d.is_empty());
644 let combined = match (name_desc, og_desc) {
645 (Some(a), Some(b)) if a == b => decode_basic_html_entities(a),
646 (Some(a), Some(b)) => {
647 let (longer, shorter) = if a.len() >= b.len() { (a, b) } else { (b, a) };
648 let l = decode_basic_html_entities(longer);
649 let s = decode_basic_html_entities(shorter);
650 let probe_len = s.chars().take(60).map(char::len_utf8).sum::<usize>();
651 let probe = &s[..probe_len.min(s.len())];
652 if l.contains(probe) {
653 l
654 } else {
655 format!("{l}\n\n{s}")
656 }
657 }
658 (Some(a), None) | (None, Some(a)) => decode_basic_html_entities(a),
659 (None, None) => return m,
660 };
661 let trimmed = combined.trim();
662 if trimmed.chars().count() < 80 {
666 return m;
667 }
668 let title_lc = meta
669 .og_title
670 .as_deref()
671 .or(meta.title.as_deref())
672 .map(|t| t.trim().to_lowercase())
673 .unwrap_or_default();
674 if !title_lc.is_empty() && trimmed.to_lowercase() == title_lc {
675 return m;
676 }
677 let probe_len = trimmed.chars().take(120).map(char::len_utf8).sum::<usize>();
680 let probe = &trimmed[..probe_len.min(trimmed.len())];
681 if m.contains(probe) {
682 return m;
683 }
684 format!("{m}\n\n{trimmed}\n")
685 });
686
687 let md = md.map(reflow_inline_lists);
697
698 let plain = if formats.contains(&OutputFormat::PlainText) {
699 Some(plaintext::html_to_plaintext(&content_html))
700 } else {
701 None
702 };
703
704 let raw = if formats.contains(&OutputFormat::RawHtml) {
705 Some(raw_html.to_string())
706 } else {
707 None
708 };
709
710 let html = if formats.contains(&OutputFormat::Html) {
711 Some(content_html)
712 } else {
713 None
714 };
715
716 let links = if formats.contains(&OutputFormat::Links) {
717 Some(readability::extract_links(raw_html, source_url))
718 } else {
719 None
720 };
721
722 let json = None;
724
725 let orphan_chunk_warning =
727 if chunk_strategy.is_none() && (query.is_some() || filter_mode.is_some()) {
728 Some(
729 "'query' and 'filterMode' require 'chunkStrategy' to be set. \
730 These parameters were ignored."
731 .to_string(),
732 )
733 } else {
734 None
735 };
736
737 let chunks = if let Some(strategy) = chunk_strategy
739 && let Some(ref markdown_text) = md
740 && !markdown_text.trim().is_empty()
741 {
742 let raw_chunks = chunking::chunk_text(markdown_text, strategy);
743
744 let chunk_results = if let (Some(q), Some(mode)) = (query, filter_mode)
746 && !q.trim().is_empty()
747 && !raw_chunks.is_empty()
748 {
749 filter::filter_chunks_scored(&raw_chunks, q, mode, top_k.unwrap_or(5))
750 .into_iter()
751 .map(|sc| ChunkResult {
752 content: sc.content,
753 score: Some(sc.score),
754 index: sc.index,
755 })
756 .collect::<Vec<_>>()
757 } else {
758 let mut results: Vec<_> = raw_chunks
759 .into_iter()
760 .enumerate()
761 .map(|(i, c)| ChunkResult {
762 content: c,
763 score: None,
764 index: i,
765 })
766 .collect();
767 if let Some(k) = top_k {
768 results.truncate(k);
769 }
770 results
771 };
772
773 if chunk_results.is_empty() {
774 None
775 } else {
776 Some(chunk_results)
777 }
778 } else {
779 None
780 };
781
782 Ok(ScrapeData {
783 markdown: md,
784 html,
785 raw_html: raw,
786 plain_text: plain,
787 links,
788 json,
789 summary: None,
790 llm_usage: None,
791 chunks,
792 warning: orphan_chunk_warning,
793 warnings,
794 render_decision,
795 credit_cost,
796 metadata: PageMetadata {
797 title: meta.title,
798 description: meta.description,
799 og_title: meta.og_title,
800 og_description: meta.og_description,
801 og_image: meta.og_image,
802 canonical_url: meta.canonical_url,
803 source_url: source_url.to_string(),
804 language: meta.language,
805 status_code,
806 rendered_with,
807 elapsed_ms,
808 page_count: None,
809 source_filename: None,
810 },
811 debug_extraction: None,
812 content_type: None,
815 change_tracking: None,
816 })
817}
818
819fn apply_selector(html: &str, css: Option<&str>, xpath: Option<&str>) -> CrwResult<Option<String>> {
822 if let Some(sel) = css {
823 let result = selector::extract_by_css(html, sel).map_err(CrwError::ExtractionError)?;
824 if result.is_some() {
825 return Ok(result);
826 }
827 }
828 if let Some(xp) = xpath
829 && let Some(texts) =
830 selector::extract_by_xpath(html, xp).map_err(CrwError::ExtractionError)?
831 {
832 let wrapped = texts
833 .into_iter()
834 .map(|text| {
835 let escaped = text
836 .replace('&', "&")
837 .replace('<', "<")
838 .replace('>', ">");
839 format!("<div>{escaped}</div>")
840 })
841 .collect::<Vec<_>>()
842 .join("\n");
843 return Ok(Some(wrapped));
844 }
845 Ok(None)
846}
847
848fn extract_xhr_text(captured: &[CapturedNetworkResponse]) -> Option<String> {
863 const MIN_FIELD_LEN: usize = 120;
864 const MIN_TOTAL_LEN: usize = 400;
865
866 if captured.is_empty() {
867 return None;
868 }
869 let mut paragraphs: Vec<String> = Vec::new();
870 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
871
872 for resp in captured {
873 let body = match resp.body.as_deref() {
874 Some(b) if !b.is_empty() => b,
875 _ => continue,
876 };
877 let value: serde_json::Value = match serde_json::from_str(body) {
878 Ok(v) => v,
879 Err(_) => continue,
880 };
881 walk_json_strings(&value, &mut |s| {
882 if s.len() >= MIN_FIELD_LEN && seen.insert(s.to_string()) {
883 paragraphs.push(s.to_string());
884 }
885 });
886 }
887
888 if paragraphs.is_empty() {
889 return None;
890 }
891 let joined = paragraphs.join("\n\n");
892 if joined.len() < MIN_TOTAL_LEN {
893 return None;
894 }
895 Some(joined)
896}
897
898fn walk_json_strings(value: &serde_json::Value, on_string: &mut dyn FnMut(&str)) {
899 match value {
900 serde_json::Value::String(s) => {
901 let trimmed = s.trim();
903 if trimmed.starts_with("http://")
904 || trimmed.starts_with("https://")
905 || trimmed.starts_with('/')
906 || trimmed.starts_with('<')
907 {
908 return;
909 }
910 on_string(trimmed);
911 }
912 serde_json::Value::Array(arr) => {
913 for v in arr {
914 walk_json_strings(v, on_string);
915 }
916 }
917 serde_json::Value::Object(map) => {
918 for (_, v) in map {
919 walk_json_strings(v, on_string);
920 }
921 }
922 _ => {}
923 }
924}
925
926fn extract_tables_and_lists(html: &str) -> Option<String> {
927 use scraper::{Html, Selector};
928
929 let doc = Html::parse_document(html);
930 let table_sel = Selector::parse("table").ok()?;
931 let list_sel = Selector::parse("ul, ol").ok()?;
932 let row_sel = Selector::parse("tr").ok()?;
933 let item_sel = Selector::parse("li").ok()?;
934
935 let mut chunks: Vec<String> = Vec::new();
936
937 for table in doc.select(&table_sel) {
938 if table.select(&row_sel).count() < 2 {
939 continue;
940 }
941 let html_chunk = table.html();
942 let md = markdown::html_to_markdown(&html_chunk);
943 if md.trim().len() >= 40 {
944 chunks.push(md);
945 }
946 }
947
948 for list in doc.select(&list_sel) {
949 if list.select(&item_sel).count() < 5 {
950 continue;
951 }
952 let in_nav = list
955 .ancestors()
956 .filter_map(scraper::ElementRef::wrap)
957 .any(|el| {
958 let n = el.value().name();
959 n == "nav" || n == "footer" || n == "header"
960 });
961 if in_nav {
962 continue;
963 }
964 let html_chunk = list.html();
965 let md = markdown::html_to_markdown(&html_chunk);
966 if md.trim().len() >= 40 {
967 chunks.push(md);
968 }
969 }
970
971 if chunks.is_empty() {
972 return None;
973 }
974 Some(chunks.join("\n\n"))
975}
976
#[cfg(test)]
mod table_list_fallback_tests {
    use super::*;

    #[test]
    fn extracts_two_row_table() {
        let page = concat!(
            "<html><body><nav>x</nav><table>",
            "<tr><th>Name</th><th>Value</th></tr>",
            "<tr><td>Alpha</td><td>1</td></tr>",
            "<tr><td>Bravo</td><td>2</td></tr>",
            "</table></body></html>",
        );
        let out = extract_tables_and_lists(page).expect("table should extract");
        for cell in ["Alpha", "Bravo"] {
            assert!(out.contains(cell));
        }
    }

    #[test]
    fn skips_short_table() {
        // A single row is below the two-row minimum.
        let page = "<table><tr><td>only</td></tr></table>";
        assert_eq!(extract_tables_and_lists(page), None);
    }

    #[test]
    fn skips_nav_list() {
        // Long enough, but nested inside <nav>.
        let page = concat!(
            "<nav><ul>",
            "<li>a</li><li>b</li><li>c</li><li>d</li><li>e</li><li>f</li>",
            "</ul></nav>",
        );
        assert_eq!(extract_tables_and_lists(page), None);
    }

    #[test]
    fn extracts_long_list() {
        let page = concat!(
            "<main><ul>",
            "<li>Job A</li><li>Job B</li><li>Job C</li>",
            "<li>Job D</li><li>Job E</li><li>Job F</li>",
            "</ul></main>",
        );
        let out = extract_tables_and_lists(page).expect("list should extract");
        for item in ["Job A", "Job F"] {
            assert!(out.contains(item));
        }
    }
}